In [1]:
import os
from pickle import dump, load
from random import randint

import numpy as np
import pandas as pd
import spacy
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from nbformat import read, NO_CONVERT

# Build the shared spaCy pipeline from the large English model, switching off
# the tagger, NER and lemmatizer components at load time.
nlp = spacy.load(
    'en_core_web_lg',
    disable=["tagger", "ner", "lemmatizer"],
)

# Function to read a file
def read_file(filepath):
    """Return the full contents of a text file as a single string.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so the result does not depend on the machine the
    notebook runs on.

    Args:
        filepath: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

# Function to remove punctuation
def separate_punc(md_text):
    """Tokenize text with the module-level spaCy pipeline and drop punctuation.

    Args:
        md_text: Text to tokenize.

    Returns:
        A list with the lower-cased text of every token that survives the
        punctuation/whitespace filter, in document order.
    """
    # Membership is tested against a *string*, so this is a substring check:
    # a token is discarded when its text occurs anywhere inside this literal.
    excluded = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'
    kept = []
    for tok in nlp(md_text):
        if tok.text in excluded:
            continue
        kept.append(tok.text.lower())
    return kept

# Function to extract code from notebooks
def extract_code_from_notebooks(directory):
    """Collect the code of every ``.ipynb`` file found under a directory tree.

    For each notebook, the ``source`` of all cells whose ``cell_type`` is
    ``'code'`` is joined with newlines into one string. Notebooks that cannot
    be opened or parsed are reported on stdout and skipped, so one bad file
    does not abort the whole scan.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        A list of strings, one per successfully processed notebook. Files are
        visited in sorted order so the result is reproducible across runs.
    """
    code_files_content = []
    for root, dirs, files in os.walk(directory):
        # Prune Jupyter autosave folders in place so os.walk never descends
        # into them; their checkpoint copies would duplicate notebooks in the
        # corpus. Sorting makes the traversal order deterministic.
        dirs[:] = sorted(d for d in dirs if d != '.ipynb_checkpoints')
        for file in sorted(files):
            if not file.endswith('.ipynb'):
                continue
            file_path = os.path.join(root, file)
            try:
                # open() is inside the try so an unreadable file is reported
                # like any other bad notebook instead of stopping the scan.
                with open(file_path, 'r', encoding='utf-8') as f:
                    notebook = read(f, NO_CONVERT)
                code_cells = [cell['source'] for cell in notebook['cells'] if cell['cell_type'] == 'code']
                code_files_content.append('\n'.join(code_cells))
            except Exception as e:
                # Deliberately broad: extraction is best-effort per file.
                print(f"Error processing {file_path}: {e}")
    return code_files_content




In [2]:

# Extract code from notebooks
directory = 'C:\\Users\\Lardex\\Desktop\\DU Assginments\\AI_BOOTCAMP\\DU-VIRT-AI-PT-10-2023-U-LOLC'
code_contents = extract_code_from_notebooks(directory)

# Save the extracted code to a CSV file
df_code = pd.DataFrame(code_contents, columns=['code'])
df_code.to_csv('extracted_code.csv', index=False)

# Load CSV into DataFrame.
# keep_default_na=False + an explicit str dtype keep every row a string:
# with the defaults, a notebook with no code (empty string) or whose code is
# literally "NA"/"null" is read back as float NaN, which then reaches nlp().
df = pd.read_csv('extracted_code.csv', dtype={'code': str}, keep_default_na=False)

# Tokenize the code from the DataFrame
tokens = []
for code in df['code']:
    tokens.extend(separate_punc(code))

# Generate sequences of tokens: each window holds train_len - 1 input tokens
# followed by the 1 token the model must predict.
train_len = 26
if len(tokens) <= train_len:
    # Without this guard an empty sequence list fails later with an opaque
    # IndexError when the array is sliced as 2-D.
    raise ValueError(
        f"Need more than {train_len} tokens to build training sequences, got {len(tokens)}"
    )
text_sequences = [tokens[i-train_len:i] for i in range(train_len, len(tokens))]

# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

# Convert text sequences to numerical sequences
sequences = tokenizer.texts_to_sequences(text_sequences)
num_sequences = np.array(sequences)

# Prepare input and output variables: all but the last column are the inputs,
# the last column is the target, one-hot encoded over the vocabulary
# (+1 because the indices in tokenizer.word_index start at 1).
X = num_sequences[:,:-1]
y = num_sequences[:,-1]
y = to_categorical(y, num_classes=len(tokenizer.word_index)+1)

  validate(nb)


Error processing C:\Users\Lardex\Desktop\DU Assginments\AI_BOOTCAMP\DU-VIRT-AI-PT-10-2023-U-LOLC\01-Lesson-Plans\20-NLP\1\Activities\03-Ins_Stopwords\Solved\stopwords_solution1.ipynb: Notebook does not appear to be JSON: ''
Error processing C:\Users\Lardex\Desktop\DU Assginments\AI_BOOTCAMP\DU-VIRT-AI-PT-10-2023-U-LOLC\01-Lesson-Plans\21-Transformers\3\Activities\03-Ins_Gradio_Text_Summarization\blocks_gradio.ipynb: Notebook does not appear to be JSON: ''


In [None]:

# Define the LSTM model
def create_model(vocabulary_size, seq_len):
    """Build and compile the stacked-LSTM next-token classifier.

    Args:
        vocabulary_size: Size of the embedding input and of the softmax output.
        seq_len: Number of token ids in each input sequence.

    Returns:
        A compiled Sequential model: Embedding(25) -> LSTM(150, returning
        sequences) -> LSTM(150) -> Dense(150, relu) -> Dense(softmax),
        trained with categorical cross-entropy and Adam, reporting accuracy.
    """
    layer_stack = [
        Embedding(vocabulary_size, 25, input_length=seq_len),
        # The first LSTM returns the full sequence so the second can consume it.
        LSTM(150, return_sequences=True),
        LSTM(150),
        Dense(150, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Create and compile the model
# Vocabulary size is word_index + 1 to match the one-hot width used for y.
model = create_model(len(tokenizer.word_index)+1, X.shape[1])
model.summary()

# Train the model
model.fit(X, y, batch_size=128, epochs=300, verbose=1)

# Save the model and tokenizer
model.save('notebooks_code_model_300.keras')
# Use a context manager so the pickle file is flushed and closed even if
# dump() raises; a bare open() leaks the handle.
with open('notebooks_code_tokenizer_300', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
# Function to generate text
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Greedily generate tokens that continue a seed text.

    Args:
        model: Model whose ``predict`` returns one score per vocabulary index
            for a batch of padded id sequences.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seq_len: Number of token ids fed to the model at each step.
        seed_text: Whitespace-separated tokens used as the initial context.
        num_gen_words: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces (seed not included).
        Fewer than ``num_gen_words`` tokens are returned if the model
        predicts an index that has no word (e.g. the padding index 0).
    """
    output_text = []
    # Encode the seed once, as a *list* of tokens. The tokenizer in this file
    # is fitted on token lists, for which Keras skips its punctuation filters;
    # passing a raw string would re-apply them and split or drop tokens such
    # as ``read_csv`` that the model was trained on.
    encoded_context = list(tokenizer.texts_to_sequences([seed_text.split()])[0])
    for _ in range(num_gen_words):
        pad_encoded = pad_sequences([encoded_context], maxlen=seq_len, truncating='pre')
        pred_w = model.predict(pad_encoded, verbose=0)[0]
        pred_word_ind = int(np.argmax(pred_w, axis=-1))
        pred_word = tokenizer.index_word.get(pred_word_ind)
        if pred_word is None:
            # No vocabulary entry for this index; stop instead of raising KeyError.
            break
        output_text.append(pred_word)
        # Extend the context with the predicted id directly (no re-tokenizing)
        # and keep only the window the model can see.
        encoded_context.append(pred_word_ind)
        encoded_context = encoded_context[-seq_len:]
    return ' '.join(output_text)

# Load the trained model and tokenizer
model = load_model('notebooks_code_model_300.keras')
# NOTE: pickle can execute arbitrary code on load; only unpickle tokenizer
# files produced by this notebook.
with open('notebooks_code_tokenizer_300', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# Test the text generation
# randint() includes both endpoints, so the upper bound must be len - 1 to
# stay inside the list.
random_seed_text = ' '.join(text_sequences[randint(0, len(text_sequences) - 1)])
print(generate_text(model, tokenizer, X.shape[1], random_seed_text, 25))