In [2]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Bidirectional
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
import clang.cindex
import tempfile
from sklearn.model_selection import KFold


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
data = pd.read_csv('Data_AST.csv')

In [4]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Question,Correct_Code,Code_with_Error,Total_Marks,AST_full
0,0,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
1,1,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,8.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
2,2,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
3,3,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
4,4,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...


In [30]:
data = data[data['Total_Marks']>0]

In [31]:
len(data)

994

In [5]:
loaded_embeddings = torch.load('embeddings_code.pt',map_location=torch.device('cpu'))

In [6]:
code_embeddings = loaded_embeddings['code_embeddings']

In [7]:
code_embeddings.shape

torch.Size([1, 1000, 1, 512, 768])

In [31]:
code_embeddings[0][0][0][0][0]

tensor(0.0041)

In [32]:
flattened_embedding = code_embeddings.reshape(-1, 512 * 768)  # Flatten the feature dimensions

In [33]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(flattened_embedding.cpu().numpy())  # Normalize


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

def run_rf_regressor_cv(embeddings, targets, n_folds=10, output_dir="results"):
    """
    Runs 10-fold cross-validation using Random Forest Regressor.
    Calculates and stores the average MAPE, RMSE, and R² values for both train and test sets.
    Saves the average results to a CSV file.
    
    Parameters:
    - embeddings (torch.Tensor): Input embeddings with shape [1, 1000, 1, 512, 768].
    - targets (np.ndarray): Target values with shape [n_samples].
    - n_folds (int): Number of folds for cross-validation (default: 10).
    - output_dir (str): Directory to save the results CSV files (default: "results").
    
    Returns:
    - results (dict): Dictionary containing the average MAPE, RMSE, and R² scores.
    - test_predictions (np.ndarray): Array of predictions from the test set for each fold.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
     # Assuming embeddings has shape (1000, 512, 768), i.e., 1000 samples, each with 512 tokens and 768 features.
    embeddings = embeddings.squeeze(0)  # Remove the singleton dimension (1, 1000, 512, 768) -> (1000, 512, 768)
    
    # Select the final token's embedding for each sample (the 512th token)
    final_layer_embeddings = embeddings[:, -1, :]  # Shape will be (1000, 768), one embedding for each sample
    
    # Convert to NumPy
    X = final_layer_embeddings.cpu().numpy() if isinstance(final_layer_embeddings, torch.Tensor) else final_layer_embeddings
    
    print(f"Final layer embeddings shape: {X.shape}")
    
    # Use the target values
    y = np.array(targets)
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Initialize Random Forest Regressor
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    
    # Initialize results storage
    train_mape_list = []
    test_mape_list = []
    train_rmse_list = []
    test_rmse_list = []
    train_r2_list = []
    test_r2_list = []
    
    # Store test predictions
    test_predictions = []
    
    # Running Cross-Validation
    kf = KFold(n_splits=n_folds)  # Set the random_state for reproducibility
    
    for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X), 1), desc="Folds"):
        # Train-test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Train the model
        rf.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = rf.predict(X_train)
        y_test_pred = rf.predict(X_test)
        
        # Calculate metrics
        train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
        test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
        
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)  # RMSE
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)    # RMSE
        
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        # Append the metrics for this fold
        train_mape_list.append(train_mape)
        test_mape_list.append(test_mape)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        train_r2_list.append(train_r2)
        test_r2_list.append(test_r2)
        
        # Store test predictions
        test_predictions.append(y_test_pred)
    
    # Calculate average of the metrics
    avg_train_mape = np.mean(train_mape_list)
    avg_test_mape = np.mean(test_mape_list)
    avg_train_rmse = np.mean(train_rmse_list)
    avg_test_rmse = np.mean(test_rmse_list)
    avg_train_r2 = np.mean(train_r2_list)
    avg_test_r2 = np.mean(test_r2_list)
    
    # Prepare the results as a DataFrame
    output_df = pd.DataFrame({
        "Code": ["Using final token's embedding"],  # Adjust description
        "Regressor": ["Random Forest"],  # Specify the regressor type
        "Train_R²": [avg_train_r2],
        "Train_RMSE": [avg_train_rmse],
        "Train_MAPE": [avg_train_mape],
        "Test_R²": [avg_test_r2],
        "Test_RMSE": [avg_test_rmse],
        "Test_MAPE": [avg_test_mape]
    })

    # Save the average results to CSV file
    output_file = os.path.join(output_dir, "random_forest_avg_results.csv")
    output_df.to_csv(output_file, index=False)

    # Report average results
    print("10-Fold CV Results (Averages):")
    print(f"Mean Train MAPE: {avg_train_mape:.4f}")
    print(f"Mean Test MAPE: {avg_test_mape:.4f}")
    print(f"Mean Train RMSE: {avg_train_rmse:.4f}")
    print(f"Mean Test RMSE: {avg_test_rmse:.4f}")
    print(f"Mean Train R²: {avg_train_r2:.4f}")
    print(f"Mean Test R²: {avg_test_r2:.4f}")
    
    print(f"Results saved to {output_file}")
    
    # Concatenate all test predictions from each fold into a single array
    test_predictions = np.concatenate(test_predictions, axis=0)
    
    return {
        "Train_MAPE": avg_train_mape,
        "Test_MAPE": avg_test_mape,
        "Train_RMSE": avg_train_rmse,
        "Test_RMSE": avg_test_rmse,
        "Train_R²": avg_train_r2,
        "Test_R²": avg_test_r2
    }, test_predictions


In [11]:
target = data['Total_Marks']
results, data['RF_predicted_value_code'] = run_rf_regressor_cv(code_embeddings, target)

Final layer embeddings shape: (1000, 512, 768)


ValueError: Found array with dim 3. StandardScaler expected <= 2.

In [11]:
results

{'Train_MAPE': 36741866959964.43,
 'Test_MAPE': 148123391744216.06,
 'Train_RMSE': 0.7075948504117413,
 'Test_RMSE': 2.2847412513157095,
 'Train_R²': 0.9029128438531018,
 'Test_R²': -0.3280744889999238}

In [12]:
r_square = r2_score(data['Total_Marks'], data['RF_predicted_value_code'])

In [13]:
r_square

-0.025877597084763115

In [44]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.decomposition import PCA

def run_xgb_regressor_cv(embeddings, targets, n_folds=10, random_state=42, output_dir="results"):
    """
    Runs 10-fold cross-validation using XGBoost Regressor.
    Calculates and stores the average MAPE, RMSE, and R² values for both train and test sets.
    Saves the average results to a CSV file.
    
    Parameters:
    - embeddings (torch.Tensor): Input embeddings with shape [1, seq_len, 1, feature1, feature2].
    - targets (np.ndarray): Target values with shape [n_samples].
    - n_folds (int): Number of folds for cross-validation (default: 10).
    - random_state (int): Random seed for reproducibility.
    - output_dir (str): Directory to save the results CSV files (default: "results").
    
    Returns:
    - results (dict): Dictionary containing the average MAPE, RMSE, and R² scores.
    - test_predictions (np.ndarray): Array of predictions from the test set for each fold.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Flatten the embeddings
    flattened_embeddings = embeddings.reshape(-1, embeddings.shape[-2] * embeddings.shape[-1])
    
    # Convert to NumPy
    X = flattened_embeddings.cpu().numpy() if isinstance(flattened_embeddings, torch.Tensor) else flattened_embeddings
    y = np.array(targets)
    
    # Apply PCA for dimensionality reduction 
    pca = PCA(n_components=50)
    X = pca.fit_transform(X)
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Initialize XGBoost Regressor
    xgb_reg = xgb.XGBRegressor(n_estimators=100, random_state=random_state, n_jobs=-1)
    
    # Initialize results storage
    train_mape_list = []
    test_mape_list = []
    train_rmse_list = []
    test_rmse_list = []
    train_r2_list = []
    test_r2_list = []
    
    # Store test predictions
    test_predictions = []
    
    # Running Cross-Validation
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)  # Set the random_state for reproducibility
    
    for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X), 1), desc="Folds"):
        # Train-test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Train the model
        xgb_reg.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = xgb_reg.predict(X_train)
        y_test_pred = xgb_reg.predict(X_test)
        
        # Calculate metrics
        train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
        test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
        
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)  # RMSE
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)    # RMSE
        
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        # Append the metrics for this fold
        train_mape_list.append(train_mape)
        test_mape_list.append(test_mape)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        train_r2_list.append(train_r2)
        test_r2_list.append(test_r2)
        
        # Store test predictions
        test_predictions.append(y_test_pred)
    
    # Calculate average of the metrics
    avg_train_mape = np.mean(train_mape_list)
    avg_test_mape = np.mean(test_mape_list)
    avg_train_rmse = np.mean(train_rmse_list)
    avg_test_rmse = np.mean(test_rmse_list)
    avg_train_r2 = np.mean(train_r2_list)
    avg_test_r2 = np.mean(test_r2_list)
    
    # Prepare the results as a DataFrame
    output_df = pd.DataFrame({
        "Code": ["Using only AST"],  # Repeat for the row
        "Regressor": ["XGBoost"],  # Specify the regressor type
        "Train_R²": [avg_train_r2],
        "Train_RMSE": [avg_train_rmse],
        "Train_MAPE": [avg_train_mape],
        "Test_R²": [avg_test_r2],
        "Test_RMSE": [avg_test_rmse],
        "Test_MAPE": [avg_test_mape]
    })

    # Save the average results to CSV file
    output_file = os.path.join(output_dir, "xgboost_avg_results.csv")
    output_df.to_csv(output_file, index=False)

    # Report average results
    print("10-Fold CV Results (Averages):")
    print(f"Mean Train MAPE: {avg_train_mape:.4f}")
    print(f"Mean Test MAPE: {avg_test_mape:.4f}")
    print(f"Mean Train RMSE: {avg_train_rmse:.4f}")
    print(f"Mean Test RMSE: {avg_test_rmse:.4f}")
    print(f"Mean Train R²: {avg_train_r2:.4f}")
    print(f"Mean Test R²: {avg_test_r2:.4f}")
    
    print(f"Results saved to {output_file}")
    
    # Concatenate all test predictions from each fold into a single array
    test_predictions = np.concatenate(test_predictions, axis=0)
    
    return {
        "Train_MAPE": avg_train_mape,
        "Test_MAPE": avg_test_mape,
        "Train_RMSE": avg_train_rmse,
        "Test_RMSE": avg_test_rmse,
        "Train_R²": avg_train_r2,
        "Test_R²": avg_test_r2
    }, test_predictions


In [45]:
XGBR_results, data['XGBR_predicted_value_code'] = run_xgb_regressor_cv(code_embeddings, target)

Folds: 10it [00:01,  5.33it/s]

10-Fold CV Results (Averages):
Mean Train MAPE: 1255581581807.1975
Mean Test MAPE: 83908050905203.0312
Mean Train RMSE: 0.4846
Mean Test RMSE: 2.0385
Mean Train R²: 0.9544
Mean Test R²: 0.1768
Results saved to results/xgboost_avg_results.csv





In [1]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.decomposition import PCA

def run_knn_regressor_cv(embeddings, targets, n_folds=10, random_state=42, output_dir="results"):
    """
    Runs 10-fold cross-validation using K-Nearest Neighbors Regressor.
    Calculates and stores the average MAPE, RMSE, and R² values for both train and test sets.
    Saves the average results to a CSV file.
    
    Parameters:
    - embeddings (torch.Tensor): Input embeddings with shape [1, seq_len, 1, feature1, feature2].
    - targets (np.ndarray): Target values with shape [n_samples].
    - n_folds (int): Number of folds for cross-validation (default: 10).
    - random_state (int): Random seed for reproducibility.
    - output_dir (str): Directory to save the results CSV files (default: "results").
    
    Returns:
    - results (dict): Dictionary containing the average MAPE, RMSE, and R² scores.
    - test_predictions (np.ndarray): Array of predictions from the test set for each fold.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Flatten the embeddings
    flattened_embeddings = embeddings.reshape(-1, embeddings.shape[-2] * embeddings.shape[-1])
    
    # Convert to NumPy
    X = flattened_embeddings.cpu().numpy() if isinstance(flattened_embeddings, torch.Tensor) else flattened_embeddings
    print(X.shape)
    y = np.array(targets)
    
    # Apply PCA for dimensionality reduction (optional)
    pca = PCA(n_components=50)
    X = pca.fit_transform(X)
    print(X.shape)
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Initialize K-Nearest Neighbors Regressor
    knn_reg = KNeighborsRegressor(n_neighbors=5)
    
    # Initialize results storage
    train_mape_list = []
    test_mape_list = []
    train_rmse_list = []
    test_rmse_list = []
    train_r2_list = []
    test_r2_list = []
    
    # Store test predictions
    test_predictions = []
    
    # Running Cross-Validation
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    
    for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X), 1), desc="Folds"):
        # Train-test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Train the model
        knn_reg.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = knn_reg.predict(X_train)
        y_test_pred = knn_reg.predict(X_test)
        
        # Calculate metrics
        train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
        test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
        
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)  # RMSE
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)    # RMSE
        
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        # Append the metrics for this fold
        train_mape_list.append(train_mape)
        test_mape_list.append(test_mape)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        train_r2_list.append(train_r2)
        test_r2_list.append(test_r2)
        
        # Store test predictions
        test_predictions.append(y_test_pred)
    
    # Calculate average of the metrics
    avg_train_mape = np.mean(train_mape_list)
    avg_test_mape = np.mean(test_mape_list)
    avg_train_rmse = np.mean(train_rmse_list)
    avg_test_rmse = np.mean(test_rmse_list)
    avg_train_r2 = np.mean(train_r2_list)
    avg_test_r2 = np.mean(test_r2_list)
    
    # Prepare the results as a DataFrame
    output_df = pd.DataFrame({
        "Code": ["Using only AST"],
        "Regressor": ["KNN"],
        "Train_R²": [avg_train_r2],
        "Train_RMSE": [avg_train_rmse],
        "Train_MAPE": [avg_train_mape],
        "Test_R²": [avg_test_r2],
        "Test_RMSE": [avg_test_rmse],
        "Test_MAPE": [avg_test_mape]
    })

    # Save the average results to CSV file
    output_file = os.path.join(output_dir, "knn_avg_results.csv")
    output_df.to_csv(output_file, index=False)

    # Report average results
    print(f"Results saved to {output_file}")
    
    
    # Concatenate all test predictions from each fold into a single array
    test_predictions = np.concatenate(test_predictions, axis=0)
    
    return {
        "Train_MAPE": avg_train_mape,
        "Test_MAPE": avg_test_mape,
        "Train_RMSE": avg_train_rmse,
        "Test_RMSE": avg_test_rmse,
        "Train_R²": avg_train_r2,
        "Test_R²": avg_test_r2
    }, test_predictions


In [49]:
KNN_results, data['KNN_predicted_value_code'] = run_knn_regressor_cv(code_embeddings, target)

Folds: 10it [00:00, 19.79it/s]


Results saved to results/knn_avg_results.csv


In [53]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.decomposition import PCA

def run_catboost_regressor_cv(embeddings, targets, n_folds=10, random_state=42, output_dir="results"):
    """
    Runs 10-fold cross-validation using CatBoost Regressor.
    Calculates and stores the average MAPE, RMSE, and R² values for both train and test sets.
    Saves the average results to a CSV file.
    
    Parameters:
    - embeddings (torch.Tensor): Input embeddings with shape [1, seq_len, 1, feature1, feature2].
    - targets (np.ndarray): Target values with shape [n_samples].
    - n_folds (int): Number of folds for cross-validation (default: 10).
    - random_state (int): Random seed for reproducibility.
    - output_dir (str): Directory to save the results CSV files (default: "results").
    
    Returns:
    - results (dict): Dictionary containing the average MAPE, RMSE, and R² scores.
    - test_predictions (np.ndarray): Array of predictions from the test set for each fold.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Flatten the embeddings
    flattened_embeddings = embeddings.reshape(-1, embeddings.shape[-2] * embeddings.shape[-1])
    
    # Convert to NumPy
    X = flattened_embeddings.cpu().numpy() if isinstance(flattened_embeddings, torch.Tensor) else flattened_embeddings
    y = np.array(targets)
    
    # Apply PCA for dimensionality reduction (optional)
    pca = PCA(n_components=50)
    X = pca.fit_transform(X)
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Initialize CatBoost Regressor
    catboost_reg = CatBoostRegressor(iterations=100, random_state=random_state, silent=True)
    
    # Initialize results storage
    train_mape_list = []
    test_mape_list = []
    train_rmse_list = []
    test_rmse_list = []
    train_r2_list = []
    test_r2_list = []
    
    # Store test predictions
    test_predictions = []
    
    # Running Cross-Validation
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    
    for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X), 1), desc="Folds"):
        # Train-test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Train the model
        catboost_reg.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = catboost_reg.predict(X_train)
        y_test_pred = catboost_reg.predict(X_test)
        
        # Calculate metrics
        train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
        test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
        
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)  # RMSE
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)    # RMSE
        
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        # Append the metrics for this fold
        train_mape_list.append(train_mape)
        test_mape_list.append(test_mape)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        train_r2_list.append(train_r2)
        test_r2_list.append(test_r2)
        
        # Store test predictions
        test_predictions.append(y_test_pred)
    
    # Calculate average of the metrics
    avg_train_mape = np.mean(train_mape_list)
    avg_test_mape = np.mean(test_mape_list)
    avg_train_rmse = np.mean(train_rmse_list)
    avg_test_rmse = np.mean(test_rmse_list)
    avg_train_r2 = np.mean(train_r2_list)
    avg_test_r2 = np.mean(test_r2_list)
    
    # Prepare the results as a DataFrame
    output_df = pd.DataFrame({
        "Code": ["Using only AST"],
        "Regressor": ["CatBoost"],
        "Train_R²": [avg_train_r2],
        "Train_RMSE": [avg_train_rmse],
        "Train_MAPE": [avg_train_mape],
        "Test_R²": [avg_test_r2],
        "Test_RMSE": [avg_test_rmse],
        "Test_MAPE": [avg_test_mape]
    })

    # Save the average results to CSV file
    output_file = os.path.join(output_dir, "catboost_avg_results.csv")
    output_df.to_csv(output_file, index=False)

    # Report average results
    print(f"Results saved to {output_file}")
    
    # Concatenate all test predictions from each fold into a single array
    test_predictions = np.concatenate(test_predictions, axis=0)
    
    return {
        "Train_MAPE": avg_train_mape,
        "Test_MAPE": avg_test_mape,
        "Train_RMSE": avg_train_rmse,
        "Test_RMSE": avg_test_rmse,
        "Train_R²": avg_train_r2,
        "Test_R²": avg_test_r2
    }, test_predictions


In [54]:
CATBoost_results, data['CATBoost_predicted_value_code'] = run_catboost_regressor_cv(code_embeddings, target)

Folds: 10it [00:01,  5.07it/s]

Results saved to results/catboost_avg_results.csv





In [55]:
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.decomposition import PCA

def run_svr_regressor_cv(embeddings, targets, n_folds=10, random_state=42, output_dir="results"):
    """
    Runs 10-fold cross-validation using Support Vector Regressor.
    Calculates and stores the average MAPE, RMSE, and R² values for both train and test sets.
    Saves the average results to a CSV file.
    
    Parameters:
    - embeddings (torch.Tensor): Input embeddings with shape [1, seq_len, 1, feature1, feature2].
    - targets (np.ndarray): Target values with shape [n_samples].
    - n_folds (int): Number of folds for cross-validation (default: 10).
    - random_state (int): Random seed for reproducibility.
    - output_dir (str): Directory to save the results CSV files (default: "results").
    
    Returns:
    - results (dict): Dictionary containing the average MAPE, RMSE, and R² scores.
    - test_predictions (np.ndarray): Array of predictions from the test set for each fold.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Flatten the embeddings
    flattened_embeddings = embeddings.reshape(-1, embeddings.shape[-2] * embeddings.shape[-1])
    
    # Convert to NumPy
    X = flattened_embeddings.cpu().numpy() if isinstance(flattened_embeddings, torch.Tensor) else flattened_embeddings
    y = np.array(targets)
    
    # Apply PCA for dimensionality reduction (optional)
    pca = PCA(n_components=50)
    X = pca.fit_transform(X)
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Initialize Support Vector Regressor
    svr_reg = SVR(kernel='rbf')
    
    # Initialize results storage
    train_mape_list = []
    test_mape_list = []
    train_rmse_list = []
    test_rmse_list = []
    train_r2_list = []
    test_r2_list = []
    
    # Store test predictions
    test_predictions = []
    
    # Running Cross-Validation
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    
    for fold, (train_index, test_index) in tqdm(enumerate(kf.split(X), 1), desc="Folds"):
        # Train-test split
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Train the model
        svr_reg.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = svr_reg.predict(X_train)
        y_test_pred = svr_reg.predict(X_test)
        
        # Calculate metrics
        train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
        test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
        
        train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)  # RMSE
        test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)    # RMSE
        
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        # Append the metrics for this fold
        train_mape_list.append(train_mape)
        test_mape_list.append(test_mape)
        train_rmse_list.append(train_rmse)
        test_rmse_list.append(test_rmse)
        train_r2_list.append(train_r2)
        test_r2_list.append(test_r2)
        
        # Store test predictions
        test_predictions.append(y_test_pred)
    
    # Calculate average of the metrics
    avg_train_mape = np.mean(train_mape_list)
    avg_test_mape = np.mean(test_mape_list)
    avg_train_rmse = np.mean(train_rmse_list)
    avg_test_rmse = np.mean(test_rmse_list)
    avg_train_r2 = np.mean(train_r2_list)
    avg_test_r2 = np.mean(test_r2_list)
    
    # Prepare the results as a DataFrame
    output_df = pd.DataFrame({
        "Code": ["Using only AST"],
        "Regressor": ["SVR"],
        "Train_R²": [avg_train_r2],
        "Train_RMSE": [avg_train_rmse],
        "Train_MAPE": [avg_train_mape],
        "Test_R²": [avg_test_r2],
        "Test_RMSE": [avg_test_rmse],
        "Test_MAPE": [avg_test_mape]
    })

    # Save the average results to CSV file
    output_file = os.path.join(output_dir, "svr_avg_results.csv")
    output_df.to_csv(output_file, index=False)

    # Report average results
    print(f"Results saved to {output_file}")
    
    # Concatenate all test predictions from each fold into a single array
    test_predictions = np.concatenate(test_predictions, axis=0)
    
    return {
        "Train_MAPE": avg_train_mape,
        "Test_MAPE": avg_test_mape,
        "Train_RMSE": avg_train_rmse,
        "Test_RMSE": avg_test_rmse,
        "Train_R²": avg_train_r2,
        "Test_R²": avg_test_r2
    }, test_predictions


In [56]:
SVR_results, data['SVR_predicted_value_code'] = run_svr_regressor_cv(code_embeddings, target)

Folds: 10it [00:01,  6.38it/s]

Results saved to results/svr_avg_results.csv





In [57]:
data.head()

Unnamed: 0.1,Unnamed: 0,Question,Correct_Code,Code_with_Error,Total_Marks,AST_full,RF_predicted_value_code,XGBR_predicted_value_code,KNN_predicted_value_code,CATBoost_predicted_value_code,SVR_predicted_value_code
0,0,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,5.35,4.256089,5.2,5.603391,5.457184
1,1,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,8.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,5.358,4.664847,5.4,5.306713,5.305394
2,2,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,5.44,5.801123,4.8,4.549781,5.265496
3,3,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,5.144783,5.269772,4.4,4.409132,5.826906
4,4,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...,6.04,7.046596,6.4,6.455247,5.928104


In [58]:
data = data[data['Total_Marks']>0]

In [61]:


# Define a function to calculate the evaluation metrics
def calculate_metrics(true_values, predicted_values):
    # R²
    r2 = 1 - (np.sum((true_values - predicted_values) ** 2) / np.sum((true_values - np.mean(true_values)) ** 2))
    
    # RMSE
    rmse = np.sqrt(np.mean((true_values - predicted_values) ** 2))
    
    # MAPE
    mape = np.mean(np.abs((true_values - predicted_values) / true_values)) * 100
    
    return r2, rmse, mape

# Assuming 'df' contains the true values and predictions
true_values = data["Total_Marks"].values

# Calculate metrics for each regressor
metrics = {}
for model in ['RF', 'XGBR', 'KNN', 'CATBoost', 'SVR']:
    predicted_values = data[f"{model}_predicted_value_code"].values
    r2, rmse, mape = calculate_metrics(true_values, predicted_values)
    metrics[model] = {"r_squared": r2, "rmse": rmse, "mape": mape}

# Create a DataFrame to store results in the desired format
result_df = pd.DataFrame(metrics).T

# Now adjust the format so that regressor names are in the first column
final_df = result_df[['r_squared', 'rmse', 'mape']].rename(columns={
    'r_squared': 'R²',
    'rmse': 'RMSE',
    'mape': 'MAPE'
})

# Display the final DataFrame
final_df['Regressor'] = final_df.index
final_df = final_df[['Regressor', 'R²', 'RMSE', 'MAPE']]

# Saving the result to CSV or display
final_df.to_csv("regressor_metrics.csv", index=False)
print(final_df)


         Regressor        R²      RMSE       MAPE
RF              RF -0.508959  2.737975  55.325608
XGBR          XGBR -0.719550  2.922794  57.933209
KNN            KNN -0.676626  2.886083  57.522456
CATBoost  CATBoost -0.601788  2.820936  56.655027
SVR            SVR -0.299060  2.540420  52.106045


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Define the BiLSTM model
class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)  # *2 for bidirectional

    def forward(self, x):
        # x: (batch_size, seq_len, input_dim)
        lstm_out, _ = self.lstm(x)  # lstm_out: (batch_size, seq_len, hidden_dim * 2)
        out = self.fc(lstm_out)  # out: (batch_size, seq_len, 1)
        return out

# Dataset class for the embeddings and targets
class CodeDataset(Dataset):
    def __init__(self, embeddings, targets):
        self.embeddings = embeddings
        self.targets = targets

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.targets[idx]


def train_bilstm(embeddings, targets, input_dim, hidden_dim, num_layers, num_epochs, batch_size, device):
    dataset = CodeDataset(embeddings, targets)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = BiLSTM(input_dim, hidden_dim, num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch_embeddings, batch_targets in dataloader:
            batch_embeddings = batch_embeddings.to(device)
            batch_targets = batch_targets.to(device).view(-1, 1)

            optimizer.zero_grad()

            batch_embeddings = batch_embeddings.view(batch_embeddings.size(0), -1, input_dim)
            outputs = model(batch_embeddings)  # [batch_size, seq_len, hidden_dim*2]
            embeddings_out = outputs.mean(dim=1)  # [batch_size, hidden_dim*2]

            loss = criterion(embeddings_out, batch_targets)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(dataloader)}")

    return model


# Main processing script
if __name__ == "__main__":
    # Assume code_embeddings has shape (num_samples, seq_len, embedding_dim)
    embeddings = loaded_embeddings['code_embeddings']
    total_marks = data['Total_Marks'].values

    # # Ensure embeddings and targets have the same length
    # assert len(embeddings) == len(total_marks), "Embeddings and targets must have the same length."

    # Process the embeddings through BiLSTM
    processed_model = train_bilstm(
        embeddings=embeddings,
        targets=total_marks,
        input_dim=embeddings.shape[-1],  # embedding_dim
        hidden_dim=128,
        num_layers=2,
        num_epochs=10,
        batch_size=32,
        device="cuda" if torch.cuda.is_available() else "cpu"
    )

    # Optionally save the model or processed embeddings
    torch.save(processed_model.state_dict(), "bilstm_model.pth")
    print("Model saved to 'bilstm_model.pth'")


In [70]:
data.to_csv('data_with_regression_scores.csv')

In [75]:
print(f"Flattened embeddings length: {len(flattened_embeddings)}")
print(f"Total marks length: {len(total_marks)}")


Flattened embeddings length: 512000
Total marks length: 1000


In [76]:
code_embeddings.shape

(1, 1000, 1, 512, 768)

In [78]:
embeddings = embeddings.squeeze()  # This will remove all dimensions with size 1, making it (1000, 512, 768)
print(f"Flattened embeddings shape: {embeddings.shape}")


Flattened embeddings shape: torch.Size([1000, 512, 768])
