In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Bidirectional
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
import clang.cindex
import tempfile
from sklearn.model_selection import KFold



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:

data = pd.read_csv('Data_Ast.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,Question,Correct_Code,Code_with_Error,Total_Marks,AST_full
0,0,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
1,1,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,8.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
2,2,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
3,3,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
4,4,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...


In [53]:
ast_embeddings = torch.load('combined_batches.pt')

In [54]:
ast_embeddings


tensor([[[[ 8.4504e-03, -4.0104e-01, -3.2706e-01,  ..., -8.0422e-02,
            9.2505e-02, -5.0718e-02],
          [ 3.3696e-01, -4.1935e-01, -3.1969e-01,  ..., -5.2447e-01,
            5.1721e-01,  2.9396e-02],
          [ 3.8042e-01, -2.1906e-02, -4.3508e-01,  ..., -2.9444e-01,
            5.9412e-02,  4.7666e-02],
          ...,
          [ 1.7696e-01, -6.4871e-01, -2.4836e-01,  ..., -1.0296e-01,
            2.0580e-01,  2.2567e-01],
          [-4.1748e-01, -3.9874e-01, -2.1567e-01,  ..., -1.6382e-01,
           -3.3855e-01,  1.6299e-01],
          [-8.4724e-01, -5.2442e-01, -3.2683e-01,  ..., -4.1650e-01,
           -1.5833e-01,  3.5736e-02]],

         [[-1.4072e-01, -3.9876e-01, -8.4772e-02,  ..., -3.6180e-01,
           -5.9882e-01,  2.6484e-01],
          [ 1.8017e-01,  2.4701e-01, -5.8107e-01,  ..., -2.8423e-01,
           -2.5370e-02,  1.2169e-02],
          [ 4.1265e-01,  1.1788e-01, -9.1391e-02,  ..., -2.2127e-01,
            5.2720e-02, -5.7200e-01],
          ...,
     

In [7]:
code_embeddings = torch.load('embeddings_code.pt')

In [9]:
code_embeddings = code_embeddings['code_embeddings']

In [52]:
code_embeddings

tensor([[ 0.0041, -0.2031, -0.1294,  ..., -0.0496, -0.0128, -0.1429],
        [-0.0053, -0.2308, -0.1260,  ..., -0.0506, -0.0143, -0.1440],
        [ 0.0149, -0.2211, -0.1231,  ..., -0.0525, -0.0129, -0.1247],
        ...,
        [ 0.0077, -0.0115, -0.0123,  ..., -0.0076,  0.0037, -0.0278],
        [ 0.0019, -0.0086, -0.0080,  ..., -0.0049,  0.0005, -0.0138],
        [-0.0029, -0.2452, -0.1435,  ..., -0.0112, -0.0482, -0.1252]])

In [11]:
code_embeddings = code_embeddings.squeeze(0).squeeze(1)

In [12]:
code_embeddings.shape

torch.Size([1000, 512, 768])

In [20]:
code_embeddings = code_embeddings[:, 0, :] 
ast_embeddings = ast_embeddings[:,:, 0, :]

In [21]:
code_embeddings.shape

torch.Size([1000, 768])

In [22]:
import torch

# Example embeddings for demonstration
# Assuming code_embeddings has shape (1000, 768)
# Assuming AST embeddings has shape (973, 5, 768)

# Indices to be removed
indices_to_remove = [
    526, 529, 530, 532, 533, 534, 693, 694, 845, 848, 853, 858, 860, 861, 862, 863, 
    864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874
]

# Convert to tensor for easier indexing
indices_to_keep = torch.tensor(
    [i for i in range(code_embeddings.size(0)) if i not in indices_to_remove]
)

# Filter code embeddings
filtered_code_embeddings = code_embeddings[indices_to_keep]  # Shape: (973, 768)


In [50]:
filtered_code_embeddings


tensor([[ 0.0041, -0.2031, -0.1294,  ..., -0.0496, -0.0128, -0.1429],
        [-0.0053, -0.2308, -0.1260,  ..., -0.0506, -0.0143, -0.1440],
        [ 0.0149, -0.2211, -0.1231,  ..., -0.0525, -0.0129, -0.1247],
        ...,
        [ 0.0077, -0.0115, -0.0123,  ..., -0.0076,  0.0037, -0.0278],
        [ 0.0019, -0.0086, -0.0080,  ..., -0.0049,  0.0005, -0.0138],
        [-0.0029, -0.2452, -0.1435,  ..., -0.0112, -0.0482, -0.1252]])

In [25]:
expanded_code_embeddings = filtered_code_embeddings.unsqueeze(1)  # Shape: (973, 1, 768)


In [26]:
combined_embeddings = torch.cat([expanded_code_embeddings, ast_embeddings], dim=1)  # Shape: (973, 6, 768)
combined_embeddings.shape

torch.Size([973, 6, 768])

In [27]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset for embeddings
class CodeASTDataset(Dataset):
    def __init__(self, embeddings):
        self.embeddings = embeddings

    def __len__(self):
        return self.embeddings.size(0)

    def __getitem__(self, idx):
        return self.embeddings[idx]

# Hyperparameters
input_dim = combined_embeddings.shape[-1]  # 768
hidden_dim = 32  # LSTM hidden state size
num_layers = 1  # Single-layer LSTM
batch_size = 32  # Batch size
num_epochs = 100  # Number of epochs for embedding processing

# LSTM Model Definition
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # Output: [batch_size, seq_len, hidden_dim]
        return lstm_out  # Return the sequence output

# Prepare dataset and dataloader
dataset = CodeASTDataset(combined_embeddings)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
print(f"Dataset prepared. Number of batches: {len(dataloader)}")

# Initialize LSTM model
model = LSTMModel(input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers).to(device)
print("Model initialized.")

# Process embeddings through LSTM without training
model.eval()  # Set the model to evaluation mode
print("Model set to evaluation mode.")

lstm_processed_embeddings = []
with torch.no_grad():  # No gradient computation for embedding processing
    print("Starting LSTM processing...")
    for batch_idx, inputs in enumerate(dataloader):
        inputs = inputs.to(device)
        lstm_out = model(inputs)  # Shape: [batch_size, seq_len, hidden_dim]
        lstm_processed_embeddings.append(lstm_out.cpu())
        if batch_idx % 10 == 0:  # Print progress every 10 batches
            print(f"Processed batch {batch_idx + 1}/{len(dataloader)}")

# Concatenate processed embeddings
lstm_processed_embeddings = torch.cat(lstm_processed_embeddings, dim=0)  # Shape: [973, seq_len, hidden_dim]
print(f"LSTM processing complete. Processed embeddings shape: {lstm_processed_embeddings.shape}")

# Save the LSTM-processed embeddings
torch.save(lstm_processed_embeddings, 'lstm_processed_embeddings.pt')
print("LSTM-processed embeddings saved as 'lstm_processed_embeddings.pt'!")


Dataset prepared. Number of batches: 31
Model initialized.
Model set to evaluation mode.
Starting LSTM processing...
Processed batch 1/31
Processed batch 11/31
Processed batch 21/31
Processed batch 31/31
LSTM processing complete. Processed embeddings shape: torch.Size([973, 6, 32])
LSTM-processed embeddings saved as 'lstm_processed_embeddings.pt'!


In [28]:
lstm_processed_embeddings

tensor([[[-0.0054,  0.0561, -0.0689,  ..., -0.0692,  0.0152, -0.1012],
         [-0.0057, -0.1729, -0.2150,  ..., -0.0027,  0.0081,  0.0578],
         [ 0.0727, -0.0278, -0.0828,  ...,  0.0340, -0.1301,  0.1096],
         [ 0.0252, -0.0171, -0.0350,  ...,  0.0154, -0.0913,  0.0453],
         [ 0.0218, -0.0103, -0.0016,  ..., -0.0174, -0.0669,  0.0288],
         [ 0.0278, -0.0039,  0.0130,  ..., -0.0323, -0.0543,  0.0164]],

        [[-0.0017,  0.0561, -0.0569,  ..., -0.0689,  0.0209, -0.1195],
         [-0.0644, -0.1390, -0.2368,  ..., -0.0265,  0.0085,  0.0794],
         [-0.1686,  0.1635, -0.2165,  ...,  0.4965,  0.1077, -0.1807],
         [-0.1066,  0.1326, -0.1659,  ...,  0.2049, -0.0008, -0.0835],
         [-0.0431,  0.0801, -0.0728,  ...,  0.0910, -0.0358, -0.0385],
         [-0.0029,  0.0503, -0.0234,  ...,  0.0307, -0.0492, -0.0185]],

        [[-0.0011,  0.0566, -0.0640,  ..., -0.0686,  0.0266, -0.1106],
         [-0.0021, -0.1715, -0.2138,  ..., -0.0029,  0.0151,  0.0519],
  

In [48]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import KFold

# Flatten LSTM-processed embeddings
lstm_processed_embeddings_flattened = lstm_processed_embeddings.reshape(lstm_processed_embeddings.size(0), -1).numpy()  # Shape: (973, seq_len * hidden_dim)

# Assuming you have a DataFrame `data` with a column `Total_Marks` for actual values
actual_values = data["Total_Marks"].to_numpy()

# Cross-validation setup
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, max_depth=8, learning_rate=0.1, random_state=42),
    "CatBoost": CatBoostRegressor(iterations=500, learning_rate=0.1, depth=8, verbose=0),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(n_neighbors=5)
}

# Dictionary to store predicted values for each model
predicted_values = {model_name: np.zeros(len(actual_values)) for model_name in models.keys()}

# Store metrics
metrics = {
    "Model": [],
    "MAPE": [],
    "R²": [],
    "RMSE": [],
}

# Loop over each model to train and evaluate
for model_name, model in models.items():
    print(f"\nTraining {model_name} model...")
    
    # Initialize lists to store metrics for each fold
    fold_metrics = {
        "R²": [],
        "RMSE": [],
        "MAPE": []
    }
    
    # Perform 10-fold CV
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(lstm_processed_embeddings_flattened)):
        print(f"\nProcessing Fold {fold_idx + 1}...")
        
        # Split data
        X_train, X_test = lstm_processed_embeddings_flattened[train_idx], lstm_processed_embeddings_flattened[test_idx]
        y_train, y_test = actual_values[train_idx], actual_values[test_idx]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_pred = model.predict(X_test)
        predicted_values[model_name][test_idx] = y_pred  # Store predictions
        
        # Calculate metrics
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)
        
        # Store fold metrics
        fold_metrics["R²"].append(r2)
        fold_metrics["RMSE"].append(rmse)
        fold_metrics["MAPE"].append(mape)
        
        print(f"Fold {fold_idx + 1} Results: R² = {r2:.4f}, RMSE = {rmse:.4f}, MAPE = {mape:.4f}")

    # Calculate average metrics across folds
    avg_r2 = np.mean(fold_metrics["R²"])
    avg_rmse = np.mean(fold_metrics["RMSE"])
    avg_mape = np.mean(fold_metrics["MAPE"])

    # Store the model results
    metrics["Model"].append(model_name)
    metrics["MAPE"].append(avg_mape)
    metrics["R²"].append(avg_r2)
    metrics["RMSE"].append(avg_rmse)

    print(f"\n{model_name} - Average Results: R² = {avg_r2:.4f}, RMSE = {avg_rmse:.4f}, MAPE = {avg_mape:.4f}")

# Save predicted values into the DataFrame
for model_name, predictions in predicted_values.items():
    data[f"{model_name}_Predictions"] = predictions

# Convert metrics into DataFrame
scores_df = pd.DataFrame(metrics)

# Save metrics DataFrame to CSV
csv_filename = "AST_model_evaluation_scores.csv"
scores_df.to_csv(csv_filename, index=False)
print(f"Evaluation scores saved to {csv_filename}")

# Save the updated DataFrame with predictions
# Ensure high precision and proper data types during saving
data.to_csv("dataset_with_predictions.csv", float_format="%.15f", index=False)
print(f"Updated dataset with predictions saved to dataset_with_predictions.csv")




Training Random Forest model...

Processing Fold 1...
Fold 1 Results: R² = 0.1179, RMSE = 2.3200, MAPE = 0.5274

Processing Fold 2...
Fold 2 Results: R² = 0.0126, RMSE = 2.0892, MAPE = 0.3306

Processing Fold 3...
Fold 3 Results: R² = 0.1176, RMSE = 2.2069, MAPE = 0.4937

Processing Fold 4...
Fold 4 Results: R² = 0.1846, RMSE = 2.0296, MAPE = 0.3910

Processing Fold 5...
Fold 5 Results: R² = 0.0412, RMSE = 2.1660, MAPE = 0.4276

Processing Fold 6...
Fold 6 Results: R² = 0.0090, RMSE = 2.1488, MAPE = 0.4619

Processing Fold 7...
Fold 7 Results: R² = -0.0437, RMSE = 2.1214, MAPE = 0.3378

Processing Fold 8...
Fold 8 Results: R² = 0.1644, RMSE = 1.8773, MAPE = 0.2962

Processing Fold 9...
Fold 9 Results: R² = -0.0985, RMSE = 2.5431, MAPE = 0.6575

Processing Fold 10...
Fold 10 Results: R² = 0.2203, RMSE = 1.8976, MAPE = 0.4167

Random Forest - Average Results: R² = 0.0725, RMSE = 2.1400, MAPE = 0.4340

Training XGBoost model...

Processing Fold 1...
Fold 1 Results: R² = 0.0384, RMSE = 2.

In [44]:
data.head()

Unnamed: 0.1,Unnamed: 0,Question,Correct_Code,Code_with_Error,Total_Marks,AST_full,Random Forest_Predictions,XGBoost_Predictions,CatBoost_Predictions,SVR_Predictions,KNN_Predictions
0,0,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,7.283333,7.816525,8.468865,5.944712,7.2
1,1,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,8.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,5.73,5.663519,5.276315,6.436158,5.8
2,2,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,6.219,6.341347,6.023093,6.106493,5.0
3,3,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,6.23,6.005907,6.145343,6.039964,7.2
4,4,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,6.105,5.879383,6.063354,6.685574,6.8


In [40]:
scores_df

Unnamed: 0,Model,MAPE,R²,RMSE
0,Random Forest,150214000000000.0,0.185176,2.057001
1,XGBoost,156698300000000.0,0.089082,2.17537
2,CatBoost,153527600000000.0,0.111613,2.146428
3,SVR,176678000000000.0,0.062816,2.208071
4,KNN,145549700000000.0,0.064147,2.203599


In [45]:
data = data[data['Total_Marks']>0]

In [46]:
r2_score(data["Total_Marks"], data['Random Forest_Predictions'])

-0.11338322362940345

In [49]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np

# Assuming `data` is the DataFrame containing the relevant data
# data = pd.read_csv("dataset_with_predictions.csv")

# Extract actual values and predicted columns
actual_values = data["Total_Marks"]
predicted_columns = {
    "Random Forest": data["Random Forest_Predictions"],
    "XGBoost": data["XGBoost_Predictions"],
    "CatBoost": data["CatBoost_Predictions"],
    "SVR": data["SVR_Predictions"],
    "KNN": data["KNN_Predictions"],
}

# Initialize a dictionary to store results
results = {
    "Model": [],
    "MAPE": [],
    "R²": [],
    "RMSE": [],
}

# Calculate metrics for each model
for model_name, predictions in predicted_columns.items():
    # Ensure predictions are numpy arrays
    predictions = predictions.to_numpy()
    
    # Calculate metrics
    mape = mean_absolute_percentage_error(actual_values, predictions)
    r2 = r2_score(actual_values, predictions)
    rmse = np.sqrt(mean_squared_error(actual_values, predictions))
    
    # Append metrics to results
    results["Model"].append(model_name)
    results["MAPE"].append(mape)
    results["R²"].append(r2)
    results["RMSE"].append(rmse)

# Convert results dictionary to a DataFrame
scores_df = pd.DataFrame(results)

# Save results to a CSV file
csv_filename = "AST_model_evaluation_scores.csv"
scores_df.to_csv(csv_filename, index=False)

print(f"Evaluation scores saved to {csv_filename}")

# Print the results DataFrame
print(scores_df)


Evaluation scores saved to AST_model_evaluation_scores.csv
           Model      MAPE        R²      RMSE
0  Random Forest  0.446055 -0.113383  2.351869
1        XGBoost  0.453068 -0.207355  2.449110
2       CatBoost  0.453273 -0.179614  2.420810
3            SVR  0.488125 -0.149894  2.390120
4            KNN  0.460844 -0.198793  2.440411


In [47]:
# Save predictions and dataset with high precision
data.to_csv("dataset_with_predictions.csv", float_format="%.15f", index=False)

# Reload the saved dataset
reloaded_data = pd.read_csv("dataset_with_predictions.csv")

# Ensure predictions are numeric
for col in ["Random Forest_Predictions", "XGBoost_Predictions", "CatBoost_Predictions", "SVR_Predictions", "KNN_Predictions"]:
    reloaded_data[col] = pd.to_numeric(reloaded_data[col], errors='coerce')

# Ensure proper alignment (filter applied consistently)
reloaded_data = reloaded_data[reloaded_data['Total_Marks'] > 0]

# Evaluate metrics
results = {"Model": [], "MAPE": [], "R²": [], "RMSE": []}
for model_name in ["Random Forest", "XGBoost", "CatBoost", "SVR", "KNN"]:
    predictions = reloaded_data[f"{model_name}_Predictions"]
    actuals = reloaded_data["Total_Marks"]
    mape = mean_absolute_percentage_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))

    results["Model"].append(model_name)
    results["MAPE"].append(mape)
    results["R²"].append(r2)
    results["RMSE"].append(rmse)

# Save the scores
scores_df = pd.DataFrame(results)
scores_df.to_csv("AST_model_evaluation_scores.csv", index=False)
print(scores_df)


           Model      MAPE        R²      RMSE
0  Random Forest  0.446055 -0.113383  2.351869
1        XGBoost  0.453068 -0.207355  2.449110
2       CatBoost  0.453273 -0.179614  2.420810
3            SVR  0.488125 -0.149894  2.390120
4            KNN  0.460844 -0.198793  2.440411
