In [10]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
from io import StringIO
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set up timestamped INFO-level logging for the whole notebook
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the pre-processed code samples into a DataFrame.
# Any failure is logged with a cause-specific message and then re-raised so
# the notebook stops instead of continuing without data.
try:
    codeDf = pd.read_csv('processed_code.csv', encoding='utf-8')
except Exception as err:
    if isinstance(err, FileNotFoundError):
        logger.error(f"CSV file not found: {err}")
    elif isinstance(err, pd.errors.ParserError):
        logger.error(f"Error parsing CSV file: {err}")
    else:
        logger.error(f"An unexpected error occurred: {err}")
    raise
else:
    logger.info("CSV file loaded successfully.")

def clean_dataframe(df):
    """Return a cleaned, text-only copy of ``df``.

    Every column is converted to strings, surrounding whitespace is stripped
    and all double-quote characters are removed. Missing values become empty
    strings instead of the literal text ``'nan'``, so they do not show up as
    a spurious token later on.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified, which keeps the calling cell
        idempotent when it is re-run.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns, holding cleaned strings.
    """
    cleaned = df.copy()
    for column in cleaned.columns:
        values = cleaned[column]
        # Remember where data was missing before the string conversion hides it.
        missing = values.isna()
        # regex=False: the quote is a literal character, not a pattern.
        text = values.astype(str).str.strip().str.replace('"', '', regex=False)
        cleaned[column] = text.mask(missing, '')
    return cleaned

2024-05-28 13:47:11,553 - CSV file loaded successfully.


In [11]:
# Normalise the text in every column of the loaded frame.
# Failures are logged and re-raised; the success message is only emitted
# once the cleaned frame has been bound back to `codeDf`.
try:
    cleaned_frame = clean_dataframe(codeDf)
except Exception as err:
    logger.error(f"An error occurred while cleaning the DataFrame: {err}")
    raise
else:
    codeDf = cleaned_frame
    logger.info("DataFrame cleaned successfully.")

2024-05-28 13:47:13,445 - DataFrame cleaned successfully.


In [12]:
# Check if the 'code' column is in the DataFrame
if 'code' not in codeDf.columns:
    logger.error("Required column 'code' is not found in the DataFrame.")
    raise KeyError("Required column 'code' is not found in the DataFrame.")

# Tokenize the code
# NOTE(review): Tokenizer() is used with its default settings, which (per the
# Keras docs) lowercase the text and filter out most punctuation. For source
# code that removes operators and brackets - confirm this is intended.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(codeDf['code'])

code_sequences = tokenizer.texts_to_sequences(codeDf['code'])

# Padding sequences
# Guard against an empty frame (or rows that tokenize to nothing): a bare
# max() would fail with an unhelpful "empty sequence" message, and zero-width
# inputs would only fail later inside the model.
max_len = max(map(len, code_sequences), default=0)
if max_len == 0:
    logger.error("No tokens were produced from the 'code' column.")
    raise ValueError("No tokens were produced from the 'code' column.")
code_padded = pad_sequences(code_sequences, maxlen=max_len, padding='post')

# Prepare labels (Assuming binary classification: all entries are code)
# NOTE(review): every label is 1, so any model can reach 100% accuracy by
# always predicting class 1. Replace with real targets before drawing
# conclusions from the metrics.
X = code_padded
# Integer class ids, as expected by a sparse categorical loss.
y = np.ones(len(code_padded), dtype=np.int32)  # Using 1 for all code entries, adjust as needed for your use case

# Split data (80% train / 20% test, fixed seed for reproducibility)
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    logger.info("Data split into training and testing sets successfully.")
except ValueError as e:
    logger.error(f"Error during train-test split: {e}")
    raise


2024-05-28 13:47:15,988 - Data split into training and testing sets successfully.


In [24]:
# Define the model
# One more than the number of known words: the tokenizer's word indices
# start at 1, leaving index 0 free for the padding value.
vocab_size = len(tokenizer.word_index) + 1

model = tf.keras.Sequential([
    # Map each token id to a 128-dimensional vector.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    # The first two LSTMs pass the full sequence on; the last one keeps only
    # its final output, giving one 150-dimensional vector per sample.
    tf.keras.layers.LSTM(150, return_sequences=True),
    tf.keras.layers.LSTM(150, return_sequences=True),
    tf.keras.layers.LSTM(150),
    # softmax (not relu): 'sparse_categorical_crossentropy' is used with its
    # default from_logits=False, so the output must be a probability
    # distribution over the vocab_size classes.
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 1956, 128)         684672    
                                                                 
 lstm_16 (LSTM)              (None, 1956, 150)         167400    
                                                                 
 lstm_17 (LSTM)              (None, 1956, 150)         180600    
                                                                 
 lstm_18 (LSTM)              (None, 150)               180600    
                                                                 
 dense_5 (Dense)             (None, 5349)              807699    
                                                                 
Total params: 2020971 (7.71 MB)
Trainable params: 2020971 (7.71 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# Train on the training split; validation_split=0.2 sets aside 20% of that
# split for per-epoch validation. The returned History object is kept for
# later inspection.
history = model.fit(
    X_train,
    y_train,
    epochs=128,
    batch_size=300,
    validation_split=0.2,
)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128

In [None]:
# Score the trained model on the held-out test split. The model was compiled
# with a single 'accuracy' metric, so evaluate() yields the loss followed by
# the accuracy.
# NOTE(review): the labels are all 1 (y = np.ones(...)), so a perfect score
# here does not by itself show that the model learned anything useful.
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 100.00%
