In [1]:
import os
import pandas as pd
import spacy
from nbformat import read, NO_CONVERT
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from random import randint
from pickle import dump, load
from keras_preprocessing.sequence import pad_sequences
import numpy as np

# Load the large English spaCy pipeline with the tagger, NER and lemmatizer
# switched off; the code in this notebook only reads `token.text`.
# NOTE(review): the parser is still enabled here. If nothing else in the
# notebook needs sentence boundaries or dependencies, it could probably be
# disabled as well to speed up tokenization -- confirm before changing.
nlp = spacy.load(
    'en_core_web_lg',
    disable=["tagger", "ner", "lemmatizer"],
)

# Tokenize text and drop punctuation / whitespace tokens
def separate_punc(md_text):
    """Tokenize ``md_text`` with the module-level ``nlp`` pipeline and return
    the lower-cased token texts, skipping punctuation and whitespace tokens.

    A token is skipped when *every* character in it is one of the filtered
    punctuation/whitespace characters. A plain ``token.text in "<chars>"``
    check is a substring test, so multi-character tokens such as ``">="``,
    ``"..."`` or a run of four newlines would slip through while ``"<="`` or
    ``"--"`` would be removed; the per-character check treats them uniformly.

    Parameters
    ----------
    md_text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased token texts in document order.
    """
    # Same character inventory as before: space, tab, newline and ASCII
    # punctuation (note: the single quote is intentionally not included).
    filtered_chars = set(' \t\n!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')
    return [
        token.text.lower()
        for token in nlp(md_text)
        # set('') is a subset of anything, so empty tokens are dropped too.
        if not set(token.text) <= filtered_chars
    ]

# Define the LSTM model
def create_model(vocabulary_size, seq_len, embedding_dim=25, lstm_units=150, dense_units=150):
    """Build and compile the next-token prediction network.

    The network is: Embedding -> LSTM (returning the full sequence) -> LSTM ->
    Dense(relu) -> Dense(softmax over the vocabulary). It is compiled with
    categorical cross-entropy, the Adam optimizer and an accuracy metric, so
    targets are expected to be one-hot encoded over ``vocabulary_size``
    classes.

    Parameters
    ----------
    vocabulary_size : int
        Number of distinct token ids; used as the embedding input dimension
        and as the width of the softmax output layer.
    seq_len : int
        Length of each input sequence, passed to the embedding layer as
        ``input_length``.
    embedding_dim : int, optional
        Size of each token embedding vector (default 25).
    lstm_units : int, optional
        Number of units in each of the two LSTM layers (default 150).
    dense_units : int, optional
        Number of units in the hidden relu layer (default 150).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential([
        Embedding(vocabulary_size, embedding_dim, input_length=seq_len),
        # The first LSTM must emit one vector per timestep so the second
        # LSTM receives a sequence rather than a single state.
        LSTM(lstm_units, return_sequences=True),
        LSTM(lstm_units),
        Dense(dense_units, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model




In [2]:
# Load CSV into DataFrame
df = pd.read_csv('extracted_code.csv')

# Tokenize the code from the DataFrame.
# Empty CSV cells are read as NaN (a float), which the tokenizer function
# cannot process, so drop them and make sure every remaining entry is a str.
tokens = []
for code in df['code'].dropna().astype(str):
    tokens.extend(separate_punc(code))

# Generate sequences of tokens: each window holds train_len - 1 input tokens
# followed by the 1 token to predict.
train_len = 26
if len(tokens) < train_len:
    # Without this check the array below would be empty and 1-D, and the
    # column slicing further down would fail with an unhelpful IndexError.
    raise ValueError(
        f"Need at least {train_len} tokens to build a training sequence, "
        f"got {len(tokens)}"
    )
# The upper bound is len(tokens) + 1 so that the final window, which ends on
# the last token, is included as well.
text_sequences = [tokens[i-train_len:i] for i in range(train_len, len(tokens) + 1)]

# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

# Convert text sequences to numerical sequences
sequences = tokenizer.texts_to_sequences(text_sequences)
num_sequences = np.array(sequences)

# Prepare input and output variables:
# every column except the last is the input, the last column is the target.
X = num_sequences[:,:-1]
y = num_sequences[:,-1]
# One-hot encode the targets; +1 because the ids in word_index start at 1.
y = to_categorical(y, num_classes=len(tokenizer.word_index)+1)

In [8]:
# Create and compile the model.
# Vocabulary size is len(word_index) + 1, matching the number of classes used
# when the targets were one-hot encoded; X.shape[1] is the input sequence length.
model = create_model(len(tokenizer.word_index)+1, X.shape[1])
model.summary()

# Train the model
model.fit(X, y, batch_size=100, epochs=100, verbose=1)

# Save the model and tokenizer.
# The tokenizer is pickled inside a `with` block so the file handle is flushed
# and closed even if pickling raises.
model.save('notebooks_code_model_300.keras')
with open('notebooks_code_tokenizer_300', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

KeyboardInterrupt: 