In [5]:
import os
import nltk
import re
from nbformat import read, NO_CONVERT
import pandas as pd
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
import json

In [6]:
def extract_code_from_notebooks(directory):
    """
    Extracts code from all Jupyter Notebook files in the given directory.

    The directory is walked recursively. Every file whose name ends in
    '.ipynb' is parsed with nbformat (no version conversion) and the sources
    of its code cells are joined with newlines into one string.

    Processing is best-effort: a notebook that cannot be opened, is not valid
    JSON, or has no top-level 'cells' list is reported on stdout and skipped,
    so one bad file never aborts the whole scan.

    Args:
    - directory (str): The path to the directory containing the notebooks.

    Returns:
    - List of strings, each containing concatenated code from a notebook.
    """
    code_files_content = []
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('.ipynb'):
                continue
            file_path = os.path.join(root, file)
            # The open() call lives inside the try block so that I/O failures
            # (missing target of a symlink, permission problems, ...) are
            # handled the same way as parse failures: log and move on.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    notebook = read(f, NO_CONVERT)
                code_cells = [cell['source'] for cell in notebook['cells'] if cell['cell_type'] == 'code']
                code_files_content.append('\n'.join(code_cells))
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
    return code_files_content

def extract_code(text):
    """
    Collapses a multi-line text into a single line.

    The text is split on newline characters, leading and trailing whitespace
    is removed from every line, and the lines are joined back together with a
    single space. Blank lines are kept as empty pieces, so they show up as
    consecutive spaces in the result.

    Args:
    - text (str): The text to flatten.

    Returns:
    - str: A single string containing all stripped lines separated by spaces.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.split('\n'))
    return ' '.join(stripped_lines)

def process_notebooks_to_dataframe(directory):
    """
    Processes Jupyter Notebooks in the specified directory into a DataFrame.

    The code of every notebook found under the directory is flattened to a
    single line, tokenized with NLTK's word_tokenize, and each token becomes
    one row of the result.

    Args:
    - directory (str): The path to the directory containing the notebooks.

    Returns:
    - DataFrame: A pandas DataFrame with a single 'word' column holding one
      token per row, in notebook order. The 'word' column is present even when
      no tokens were found (the frame is then empty).
    """
    # Extract code from notebooks
    code_files_content = extract_code_from_notebooks(directory)

    # Tokenize each notebook's code and flatten the tokens into one row each
    processed_data = [
        {"word": word}
        for code in code_files_content
        for word in word_tokenize(extract_code(code))
    ]

    # Naming the column explicitly guarantees downstream df['word'] lookups
    # work even when the directory yielded no tokens at all.
    return pd.DataFrame(processed_data, columns=["word"])

def save_dataframe_to_csv(df, filename):
    """
    Writes the DataFrame to disk as CSV without the index column.

    Args:
    - df (DataFrame): The DataFrame to write.
    - filename (str): Destination path of the CSV file.
    """
    # Only the data columns are written; the row index is left out.
    csv_options = {"index": False}
    df.to_csv(filename, **csv_options)

def fit_and_save_tokenizer(df, tokenizer_path):
    """
    Fits a Keras Tokenizer on the DataFrame's 'word' column and saves it to a file.

    Every row of the 'word' column is treated as one text. A row holding a
    single token (a string) is used as-is; a row holding a list or tuple of
    tokens is joined with spaces first. Joining a plain string with spaces
    would split it into individual characters, so the two cases are handled
    separately.

    Args:
    - df (DataFrame): The DataFrame containing the tokenized code in a 'word' column.
    - tokenizer_path (str): The path to the file where the tokenizer will be saved.

    Raises:
    - KeyError: If the DataFrame has no 'word' column.
    """
    def _row_to_text(value):
        # Turn one cell of the 'word' column into the text handed to the tokenizer.
        if isinstance(value, (list, tuple)):
            return ' '.join(str(token) for token in value)
        return str(value)

    # Fit the tokenizer on the 'word' column
    texts = [_row_to_text(value) for value in df['word']]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)

    # Save the tokenizer.
    # NOTE: to_json() already returns a JSON string, so json.dump stores it as
    # a JSON string literal (double-encoded). This on-disk format is kept
    # unchanged so existing loaders keep working: read it back with
    # json.load(f) and pass the resulting string to tokenizer_from_json.
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        json.dump(tokenizer.to_json(), f)
    print(f"Tokenizer saved to {tokenizer_path}")

In [7]:
# Root folder that is scanned recursively for .ipynb files.
# This is a machine-specific absolute path; change it to match your own checkout.
code_directory = r'C:\Users\Lardex\Desktop\DU Assginments\AI_BOOTCAMP\DU-VIRT-AI-PT-10-2023-U-LOLC'

# Build the one-token-per-row table, then write it to processed_code.csv
# (a relative path, so it is resolved against the current working directory).
df = process_notebooks_to_dataframe(directory=code_directory)
save_dataframe_to_csv(df=df, filename='processed_code.csv')

Error processing C:\Users\Lardex\Desktop\DU Assginments\AI_BOOTCAMP\DU-VIRT-AI-PT-10-2023-U-LOLC\01-Lesson-Plans\20-NLP\1\Activities\03-Ins_Stopwords\Solved\stopwords_solution1.ipynb: Notebook does not appear to be JSON: ''
Error processing C:\Users\Lardex\Desktop\DU Assginments\AI_BOOTCAMP\DU-VIRT-AI-PT-10-2023-U-LOLC\01-Lesson-Plans\21-Transformers\3\Activities\03-Ins_Gradio_Text_Summarization\blocks_gradio.ipynb: Notebook does not appear to be JSON: ''


In [8]:
# Fit the tokenizer on the token table built above and write it to tokenizer.json
fit_and_save_tokenizer(df=df, tokenizer_path='tokenizer.json')

Tokenizer saved to tokenizer.json
