In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
from io import StringIO
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)


cleaned_lines = []
with open('processed_code.csv', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip leading and trailing whitespaces and remove unclosed quotes
        cleaned_lines.append(line.strip().replace('"', ''))
cleaned_content = "\n".join(cleaned_lines)




In [2]:
# Load the CSV file
# codeDf = pd.read_csv(cleaned_content)

df_sample = cleaned_content.sample(frac=0.1, random_state=42)  # Using 10% of the data

# Extract comments and code from the DataFrame and ensure all entries are strings
df_sample['comments'] = df_sample['comments'].astype(str)
df_sample['code'] = df_sample['code'].astype(str)
comments = df_sample['comments'].dropna().tolist()
code = df_sample['code'].dropna().tolist()

AttributeError: 'str' object has no attribute 'sample'

In [None]:

# Ensure there is data to process
if not comments or not code:
    logger.error("No comments or code found in the CSV file.")
    raise ValueError("The CSV file does not contain any comments or code.")

# Tokenize the comments and code
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(comments + code)

comments_sequences = tokenizer.texts_to_sequences(comments)
code_sequences = tokenizer.texts_to_sequences(code)

# Padding sequences
max_len = max(max(len(seq) for seq in comments_sequences), max(len(seq) for seq in code_sequences))
comments_padded = pad_sequences(comments_sequences, maxlen=max_len, padding='post')
code_padded = pad_sequences(code_sequences, maxlen=max_len, padding='post')

# Combine data for training
X = np.vstack((comments_padded, code_padded))
y = np.array([0] * len(comments_padded) + [1] * len(code_padded))  # Assuming binary classification: 0 for comments, 1 for code

# Split data
try:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
except ValueError as e:
    logger.error(f"Error during train-test split: {e}")
    raise


In [None]:

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_len),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 50.00%
