In [8]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set up timestamped, INFO-level logging for this notebook
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Read processed_code.csv into a DataFrame. Every failure is logged with a
# cause-specific message and then re-raised so execution stops at this cell.
try:
    codeDf = pd.read_csv('processed_code.csv', encoding='utf-8')
except FileNotFoundError as err:
    logger.error(f"CSV file not found: {err}")
    raise
except pd.errors.ParserError as err:
    logger.error(f"Error parsing CSV file: {err}")
    raise
except Exception as err:
    logger.error(f"An unexpected error occurred: {err}")
    raise
else:
    # Only reached when read_csv returned without raising
    logger.info("CSV file loaded successfully.")

# Clean a dataframe function
def clean_dataframe(df):
    """Return a cleaned copy of ``df`` with every column normalised to text.

    Each column is cast to ``str``, leading/trailing whitespace is stripped
    and every double-quote character is removed.

    Args:
        df: The pandas DataFrame to clean. It is left unmodified.

    Returns:
        A new DataFrame with the same columns and index whose values are all
        strings.

    Note:
        ``astype(str)`` renders missing values as the literal text ``"nan"``.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect;
    # this keeps the cell that calls us safe to re-run on the raw data.
    cleaned = df.copy()
    for column in cleaned.columns:
        cleaned[column] = (
            cleaned[column]
            .astype(str)
            .str.strip()
            # Literal replacement: never interpret the pattern as a regex.
            .str.replace('"', '', regex=False)
        )
    return cleaned

2024-06-04 12:53:08,233 - CSV file loaded successfully.


In [9]:
# Run clean_dataframe over the loaded frame; log the failure and re-raise
# if anything goes wrong so the notebook stops here.
try:
    codeDf = clean_dataframe(codeDf)
except Exception as err:
    logger.error(f"An error occurred while cleaning the DataFrame: {err}")
    raise
else:
    logger.info("DataFrame cleaned successfully.")

2024-06-04 12:53:10,169 - DataFrame cleaned successfully.


In [10]:
# Fail fast when the column that feeds the tokenizer is absent
missing_word_msg = "Required column 'word' is not found in the DataFrame."
if 'word' not in codeDf:
    logger.error(missing_word_msg)
    raise KeyError(missing_word_msg)

# Fit a Keras Tokenizer on the 'word' column, then map every row to its
# list of integer token ids.
# NOTE(review): the Tokenizer is created with default settings; confirm its
# default lower-casing and punctuation filtering are wanted for this data.
word_column = codeDf['word']
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(word_column)
word_sequences = tokenizer.texts_to_sequences(word_column)

# Upper bound on how many token ids are grouped into one sequence
sequence_length = 50

In [12]:

# Group the token stream into consecutive, non-overlapping windows of at most
# `sequence_length` ids, pad them to a common width, attach labels and split.
if sequence_length < 1:
    raise ValueError(f"sequence_length must be a positive integer, got {sequence_length}")

# One id per row: the first token id of every row that produced any tokens.
# Rows whose token list is empty are skipped.
token_ids = [ids[0] for ids in word_sequences if ids]
if not token_ids:
    # Without this guard the max() below fails with an opaque
    # "max() arg is an empty sequence" message.
    raise ValueError("No tokens were produced from the 'word' column; cannot build sequences.")

# Slice the flat id list into chunks; only the last chunk can be shorter.
sequences = [
    token_ids[start:start + sequence_length]
    for start in range(0, len(token_ids), sequence_length)
]

# Padding sequences (zeros are appended after the real ids)
max_len = max(len(seq) for seq in sequences)
code_padded = pad_sequences(sequences, maxlen=max_len, padding='post')

# Prepare labels: every sequence gets the label 1.
# NOTE(review): a single constant class gives a classifier nothing to
# discriminate - confirm the intended training target before relying on the
# reported accuracy.
X = code_padded
y = np.ones(len(code_padded))

# Split data (80% train / 20% test, fixed seed for reproducibility)
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    logger.info("Data split into training and testing sets successfully.")
except ValueError as e:
    logger.error(f"Error during train-test split: {e}")
    raise


2024-06-04 12:54:52,280 - Data split into training and testing sets successfully.


In [None]:
# Define the model: an embedding layer, three stacked LSTMs and a dense output
# layer with one unit per vocabulary entry.
# +1 because index 0 is not assigned to any word by the Keras Tokenizer.
vocab_size = len(tokenizer.word_index) + 1

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    tf.keras.layers.LSTM(150, return_sequences=True),
    tf.keras.layers.LSTM(150, return_sequences=True),
    # The last LSTM returns only its final state, giving one vector per sequence
    tf.keras.layers.LSTM(150),
    # softmax (not relu): 'sparse_categorical_crossentropy' with its default
    # from_logits=False expects the outputs to be a probability distribution.
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

# Compile the model; labels are integer class indices, hence the sparse loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

In [None]:
# Train for 100 epochs in batches of 100, holding out the last 20% of the
# training data for validation; the returned History object is kept.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=100,
    epochs=100,
    validation_split=0.2,
)

In [None]:
# Score the trained model on the held-out test split; evaluate() yields the
# loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Save the model to a file in the current working directory, then confirm on
# stdout.
# NOTE(review): the .h5 extension selects Keras' legacy HDF5 format; newer
# Keras releases recommend the native .keras format - confirm which one the
# consumers of this file expect.
model.save('teaching_model.h5')
print("Model saved to 'teaching_model.h5'")