### Importing essential libraries

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Bidirectional
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
import clang.cindex
import tempfile
from sklearn.model_selection import KFold
import logging
import re
# Set up logging
logging.basicConfig(level=logging.INFO)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Data file

In [2]:
#Reading data
data = pd.read_csv('Data_AST.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,Question,Correct_Code,Code_with_Error,Total_Marks,AST_full
0,0,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
1,1,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,8.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
2,2,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\nvoid printFactors(int numb...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
3,3,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,7.0,CursorKind.FUNCTION_DECL printFactors\n Curso...
4,4,Print the factors of a number,#include <stdio.h>\nvoid printFactors(int numb...,#include <stdio.h>\n\nvoid printFactors(int nu...,5.0,CursorKind.FUNCTION_DECL printFactors\n Curso...


### Code to full AST and partial AST

In [None]:
####IMPLEMENTATION OF C CODE PARSING USING SIMPLE AST

import clang.cindex
import tempfile

def parse_c_code_from_string(c_code):

    # Create a temporary file to store the C code string
    with tempfile.NamedTemporaryFile(suffix=".c", delete=False) as tmp_file:
        tmp_file.write(c_code.encode())
        tmp_file.flush()
        tmp_file_path = tmp_file.name

    index = clang.cindex.Index.create()
    translation_unit = index.parse(tmp_file_path)

    # Function to recursively build the AST string representation
    def build_ast_string(node, indent=0):
        if node.kind == clang.cindex.CursorKind.TRANSLATION_UNIT:
            ast_str = ""
            for child in node.get_children():
                ast_str += build_ast_string(child, indent)
            return ast_str
        else:
            ast_str = '  ' * indent + f"{node.kind}\n"
            for child in node.get_children():
                ast_str += build_ast_string(child, indent + 1)
            return ast_str

    # Build the AST string starting from the root cursor
    ast_representation = build_ast_string(translation_unit.cursor)

    return ast_representation

data['AST_full'] = data['Code_with_Error'].apply(parse_c_code_from_string)


In [None]:
####IMPLEMENTATION OF C CODE PARSING USING PARTIAL AST

import clang.cindex
import tempfile

def find_parent_manually(root, target_node):
        for child in root.get_children():
            # Check if the target_node is among the children
            if any(grandchild == target_node for grandchild in child.get_children()):
                return child
            # Recursively search in deeper children
            parent = find_parent_manually(child, target_node)
            if parent:
                return parent
        return None

def parse_c_code_from_string(c_code):
    # Create a temporary file to store the C code string
    with tempfile.NamedTemporaryFile(suffix=".c", delete=False) as tmp_file:
        tmp_file.write(c_code.encode())
        tmp_file.flush()
        tmp_file_path = tmp_file.name

    # Create an index for parsing
    index = clang.cindex.Index.create()

    # Parse the C code from the temporary file
    translation_unit = index.parse(tmp_file_path)

    # Function to get original code lines for display
    original_lines = c_code.splitlines()

    # Function to find siblings using Clang's built-in functionality
    def find_next_brother(node):
        parent = find_parent_manually(translation_unit.cursor, node)
        if not parent:
            return None

        siblings = list(parent.get_children())
        current_index = siblings.index(node)

        # Return the next sibling if it exists
        if current_index + 1 < len(siblings):
            return siblings[current_index + 1]
        else:
            return find_next_brother(parent)

     # Function to find if right sibling exist
    def right_sibling_exist(node):
        parent = find_parent_manually(translation_unit.cursor, node)
        if not parent:
            return None

        siblings = list(parent.get_children())
        current_index = siblings.index(node)

        # Return the next sibling if it exists
        if current_index + 1 < len(siblings):
            return True
        else:
            return False


    node_same_line=[]
    # Recursive function to build AST string, printing errors and original lines when needed
    def build_ast_string(node, indent=0):
        ast_str = ''
        # Check diagnostics at the current node
        #for d in translation_unit.diagnostics:
            #print(f"Line: {d.location.line}, Column: {d.location.column}, Severity: {d.severity}, Message: {d.spelling}")

        diagnostics =[]
        diagnostics_1=[]

        for d in translation_unit.diagnostics:
            if d.location.line == node.location.line and node.location.line not in node_same_line:
                diagnostics.append(d)

            elif d.location.line == node.location.line+1:
                diagnostics_1.append(d)


        if diagnostics and len(list(node.get_children()))==0 and not right_sibling_exist(node):
            # If there's a diagnostic, print the error and the code causing it
            ast_str += '  ' * indent + f"{node.kind} {node.spelling}\n"
            ast_str += '  ' * indent + f"{original_lines[node.location.line - 1]}\n"
            #This list ensures that if the error exist in the same line, the code is printed only once and not for all it's childen
            node_same_line.append(node.location.line)
        else:
            # Continue building the AST normally
            ast_str += '  ' * indent + f"{node.kind} {node.spelling}\n"

        # Find the next sibling to check lines between the current node and its sibling
        sibling = find_next_brother(node)

        # Check and print code lines between the current node and its sibling
        if sibling and sibling.location.line - node.location.line > 1:
            print(sibling.location.line - node.location.line)
            # Print lines that are between the current node and its sibling
            for line_num in range(node.location.line, sibling.location.line - 1):
                line = original_lines[line_num]
                if line.strip() and not all(char in ')}] ' for char in line):
                    print("in")
                    ast_str += '  ' * indent + f"{line}\n"

        elif sibling is None and len(original_lines)-node.location.line > 1 and  len(list(node.get_children()))==0:
            print("in")
            # Print lines that are between the current node and its sibling
            for line_num in range(node.location.line, len(original_lines) - 1):
                line = original_lines[line_num]
                if line.strip() and not all(char in '(){}[] ' for char in line):
                    ast_str += '  ' * indent + f"{line}\n"

        # Recursively traverse and build AST for child nodes
        for child in node.get_children():
            ast_str += build_ast_string(child, indent + 1)

        return ast_str

    # Build the AST string starting from the root cursor
    ast_representation = build_ast_string(translation_unit.cursor)

    return ast_representation



### Data preprocessing

In [3]:
def clean_text_code(text):
        #Preprocesses text by removing blank lines, reducing whitespace, and replacing newlines with spaces.
        text = '\n'.join(line for line in text.split('\n') if line.strip())  # Remove blank lines
        text = re.sub(r'\s{2,}', ' ', text)  # Replace multiple spaces with one
        return text.replace('\n', ' ')  # Replace newlines with spaces

"""
def clean_text_AST(text):
    text=str(text)
    # Split the input into lines while preserving the structure
    text = text.replace('CursorKind.', '')
    lines = text.splitlines()
    cleaned_lines = []

    for line in lines:

        leading_spaces = len(line) - len(line.lstrip())
        stripped_line = line.lstrip()

        # Replace 'CursorKind.' and process each word
        words = stripped_line.split()
        cleaned_words = []


        for word in words:
            if '_' in word:
                parts = word.split('_')
                cleaned_word = parts[0][:3]  # First 3 letters before the '_'

                if len(parts) > 1:
                    cleaned_word += parts[1][:3]  # Next 3 letters after the '_'

                if len(parts) > 2:
                    cleaned_word += parts[2][:2]  # Take 2 letters after the second '_'

                cleaned_words.append(cleaned_word)
            else:
                cleaned_words.append(word[:4])  # Take the first 4 letters if no '_'

        # Rejoin the cleaned words into a single line
        cleaned_line = ' '.join(cleaned_words)

        # Reapply the leading spaces (indentation) to the line
        cleaned_lines.append(' ' * leading_spaces + cleaned_line)

    # Join the cleaned lines back into a single string, preserving newlines
    cleaned_text = '\n'.join(cleaned_lines)
    text = re.sub(r'\s{2,}', ' ', cleaned_text)  # Replace multiple spaces with one
    return text.replace('\n', ' ')
"""

#data['AST_full_Processed']=data['AST_full'].apply(clean_text_AST)
data['Code_with_Error_Processed']=data['Code_with_Error'].apply(clean_text_code)

### CodeBERT embedding class

In [8]:
import torch
import gc
import logging
from transformers import AutoTokenizer, AutoModel

class CodeT5Embeddings:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
        self.model = AutoModel.from_pretrained("Salesforce/codet5-base").to(self.device)
          # Add BiLSTM
        self.bilstm = torch.nn.LSTM(
            input_size=768,  # CodeT5's embedding size
            hidden_size=512,  # BiLSTM hidden size
            num_layers=1,  # Number of LSTM layers
            bidirectional=True,  # BiLSTM for forward and backward passes
            batch_first=True  # Input shape (batch_size, seq_length, input_size)
        ).to(self.device)
        
    # def forward(self, inputs):
    #     # LSTM forward pass
    #     lstm_output, (hidden_state, cell_state) = self.bilstm(inputs)
    #     return lstm_output

    # Function to chunk the input text into 512-token segments
    def chunk_text(self, text, max_length=512):
        # Tokenize the entire text first
        tokens = self.tokenizer(text, return_tensors="pt", truncation=False, padding=False)

        input_ids = tokens['input_ids'][0]  # The tokenized sequence
        chunks = [input_ids[i:i + max_length] for i in range(0, len(input_ids), max_length)]

        return chunks


    def _get_embeddings(self, text_batch):
        if not text_batch:  # Handle empty input
            logging.warning(f"Empty input text batch provided.")
            return torch.zeros(0, 768).to(self.device)

        embeddings_per_text = []

        # Step 1: Process each text separately
        for text in text_batch:
            # Step 2: Tokenize and chunk the text
            chunked_text = self.chunk_text(text)


            # Step 4: Run the model on each chunk separately
            all_chunk_embeddings = []
            with torch.no_grad():
                outputs = self.model.encoder(**chunked_text)  # Model inference
                #chunk_embeddings = outputs.last_hidden_state[:, 0, :]  # Get embeddings for [CLS] token of each chunk
                all_chunk_embeddings.append(outputs.last_hidden_state)
            # Concatenate all chunks into a single tensor for LSTM
            concatenated_embeddings = torch.cat(all_chunk_embeddings, dim=1)  # (1, seq_len, 768)
            BiLSTM, _  = self.bilstm(all_chunk_embeddings.unsqueeze(0))

            # Step 5: Concatenate the embeddings for all chunks of the text
            lstm_output, _ = self.bilstm(concatenated_embeddings)

            embeddings_per_text.append(lstm_output)

        # Clear cache after processing the embedding
        del tokens, outputs
        torch.cuda.empty_cache()
        gc.collect()

        return embeddings_per_text #Array of arrays containing each array of 512 tokens embedded using T5

    def process_embeddings_in_batches(self, texts, batch_size=16):

        all_embeddings = []

        # Divide the list into batches of size `batch_size`
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            # Get embeddings for the current batch
            batch_embeddings = self._get_embeddings(batch_texts)
            all_embeddings.append(batch_embeddings)

        # Combine embeddings from all batches
        final_embeddings = torch.cat(all_embeddings, dim=0)

        # Clear memory
        del all_embeddings, batch_embeddings
        torch.cuda.empty_cache()
        gc.collect()

        return final_embeddings

    def process(self, code, ast_full):
        code_embedding = self.process_embeddings_in_batches(code)
        ast_embedding = self.process_embeddings_in_batches(ast_full)

        # More cache clearing after each embedding
        torch.cuda.empty_cache()
        gc.collect()

        return code_embedding, ast_embedding


class CodeT5Processor:
    def __init__(self, dataframe, embedding_model, embedding_save_path="embeddings.pt"):
        self.dataframe = dataframe
        self.embedding_model = embedding_model
        self.embedding_save_path = embedding_save_path
        self.embeddings = None  # To store the embeddings

    def compute_embeddings(self):
        code_embeddings = []
        ast_embeddings = []
        full_code = []
        full_ast = []
        updated_rows = []
        i=0
        for idx, row in self.dataframe.iterrows():

            code = row['Code_with_Error_Processed']
            ast = row['AST_full_Processed']

            # Check if either tokenized code or AST exceeds the length of 512 tokens
            code_tokens = self.embedding_model.tokenizer(code, return_tensors="pt")['input_ids']
            ast_tokens = self.embedding_model.tokenizer(ast, return_tensors="pt")['input_ids']

            # Log and skip rows where token size is greater than 512 for either code or AST
            if len(code_tokens[0]) > 512 or len(ast_tokens[0]) > 512:
                i+=1
                continue

            full_code.append(str(code))
            full_ast.append(str(ast))
            updated_rows.append(row.to_dict())

        new_dataframe = pd.DataFrame(updated_rows)
        new_dataframe.to_csv('updated_data.csv', index=False)
        print(f"Number of rows skipped due to token size exceeding 512: {i}")

        # Compute embeddings for code and AST
        code_embedding, ast_embedding = self.embedding_model.process(full_code, full_ast)

        # Append to lists and clear memory after processing each
        code_embeddings.append(code_embedding.cpu())
        ast_embeddings.append(ast_embedding.cpu())
        torch.cuda.empty_cache()
        gc.collect()

        # Store the embeddings
        self.embeddings = {
            'code_embeddings': torch.stack(code_embeddings),
            'ast_embeddings': torch.stack(ast_embeddings)
        }

    def save_embeddings(self):
        if self.embeddings is None:
            raise ValueError("No embeddings found. Please compute embeddings first.")
        torch.save(self.embeddings, self.embedding_save_path)
        print(f"Embeddings saved to {self.embedding_save_path}")

    def load_embeddings(self):
        self.embeddings = torch.load(self.embedding_save_path)
        print(f"Embeddings loaded from {self.embedding_save_path}")

    def get_embeddings(self):
        if self.embeddings is None:
            raise ValueError("No embeddings found. Please compute or load embeddings first.")
        return self.embeddings


In [58]:

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
# Tokenization length check
def count_exceeding_rows(column, max_tokens=1024):
    exceeding_rows = 0
    for row in column:

        tokenized_length = len(tokenizer(str(row), return_tensors="pt")['input_ids'][0])
        if tokenized_length > max_tokens:
            exceeding_rows += 1
    return exceeding_rows

# Check how many rows exceed 512 tokens
rows_exceeding = count_exceeding_rows(data['AST_full'], max_tokens=1024)
print(f"Number of rows with more than 512 tokens: {rows_exceeding}")

Token indices sequence length is longer than the specified maximum sequence length for this model (731 > 512). Running this sequence through the model will result in indexing errors


Number of rows with more than 512 tokens: 317


In [53]:
print(data['AST_full'][0])

CursorKind.FUNCTION_DECL
  CursorKind.PARM_DECL
  CursorKind.COMPOUND_STMT
    CursorKind.CALL_EXPR
      CursorKind.UNEXPOSED_EXPR
        CursorKind.DECL_REF_EXPR
      CursorKind.UNEXPOSED_EXPR
        CursorKind.UNEXPOSED_EXPR
          CursorKind.STRING_LITERAL
      CursorKind.UNEXPOSED_EXPR
        CursorKind.DECL_REF_EXPR
    CursorKind.FOR_STMT
      CursorKind.DECL_STMT
        CursorKind.VAR_DECL
          CursorKind.INTEGER_LITERAL
      CursorKind.BINARY_OPERATOR
        CursorKind.UNEXPOSED_EXPR
          CursorKind.DECL_REF_EXPR
        CursorKind.UNEXPOSED_EXPR
          CursorKind.DECL_REF_EXPR
      CursorKind.UNARY_OPERATOR
        CursorKind.DECL_REF_EXPR
      CursorKind.COMPOUND_STMT
        CursorKind.IF_STMT
          CursorKind.BINARY_OPERATOR
            CursorKind.BINARY_OPERATOR
              CursorKind.UNEXPOSED_EXPR
                CursorKind.DECL_REF_EXPR
              CursorKind.UNEXPOSED_EXPR
                CursorKind.DECL_REF_EXPR
            CursorKi

### Main class

In [24]:
#Embedding extraction
embedding_model = CodeT5Embeddings()
processor = CodeT5Processor(data, embedding_model)
processor.compute_embeddings()
processor.save_embeddings()
processor.load_embeddings()
embeddings = processor.get_embeddings()

print(embeddings['code_embeddings'])
print(embeddings['ast_embeddings'])

Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors


Number of rows skipped due to token size exceeding 512: 133
Embeddings saved to embeddings.pt
Embeddings loaded from embeddings.pt
tensor([[[ 0.0041, -0.2031, -0.1294,  ..., -0.0496, -0.0128, -0.1429],
         [-0.0053, -0.2308, -0.1260,  ..., -0.0506, -0.0143, -0.1440],
         [ 0.0149, -0.2211, -0.1231,  ..., -0.0525, -0.0129, -0.1247],
         ...,
         [ 0.0077, -0.0115, -0.0123,  ..., -0.0076,  0.0037, -0.0278],
         [ 0.0019, -0.0086, -0.0080,  ..., -0.0049,  0.0005, -0.0138],
         [-0.0029, -0.2452, -0.1435,  ..., -0.0112, -0.0482, -0.1252]]])
tensor([[[ 1.9006e-03, -5.4137e-03,  2.2087e-03,  ...,  2.9823e-03,
           8.3012e-04, -8.0320e-03],
         [ 1.8753e-03, -5.2868e-03,  1.9338e-03,  ...,  2.6454e-03,
           6.4081e-04, -7.6128e-03],
         [ 1.4670e-03, -5.2749e-03,  2.4984e-03,  ...,  3.0852e-03,
          -1.5118e-05, -8.5196e-03],
         ...,
         [ 1.4230e-03, -3.4453e-03,  6.6530e-04,  ...,  3.8998e-03,
           3.5129e-04, -7.6585

  self.embeddings = torch.load(self.embedding_save_path)


In [21]:
len(embeddings['code_embeddings'][0])

867

In [19]:
len(embeddings['ast_embeddings'][0])

867

In [26]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

updated_data = pd.read_csv('updated_data.csv')
print(len(updated_data['Total_Marks']))
# Embeddings
code_embeddings = np.array(embeddings['code_embeddings'][0])
ast_embeddings = np.array(embeddings['ast_embeddings'][0])

# Ground truth scores
y = updated_data['Total_Marks'].values

def calculate_metrics(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)  # Root Mean Squared Error
    r2 = r2_score(y_true, y_pred)  # R-squared
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # Mean Absolute Percentage Error
    return rmse, r2, mape


########## CODE TRAINING ############

X_train, X_test, y_train, y_test = train_test_split(code_embeddings, y, test_size=0.2, random_state=42)

# Train XGBoost model
xg_reg_code = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6
)
xg_reg_code.fit(X_train, y_train)

y_pred_code = xg_reg_code.predict(X_test)
rmse_code, r2_code, mape_code = calculate_metrics(y_test, y_pred_code)

print(f"Code Embeddings - RMSE: {rmse_code:.4f}, R²: {r2_code:.4f}, MAPE: {mape_code:.4f}%")


############ AST TRAINING ############


X_train, X_test, y_train, y_test = train_test_split(ast_embeddings, y, test_size=0.2, random_state=42)
xg_reg_ast = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6
)
xg_reg_ast.fit(X_train, y_train)


y_pred_ast = xg_reg_ast.predict(X_test)
rmse_ast, r2_ast, mape_ast = calculate_metrics(y_test, y_pred_ast)

print(f"AST Embeddings - RMSE: {rmse_ast:.4f}, R²: {r2_ast:.4f}, MAPE: {mape_ast:.4f}%")



867


  mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # Mean Absolute Percentage Error


Code Embeddings - RMSE: 1.8038, R²: 0.3255, MAPE: inf%
AST Embeddings - RMSE: 2.1036, R²: 0.0827, MAPE: inf%


  mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # Mean Absolute Percentage Error


In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Load updated data
updated_data = pd.read_csv('updated_data.csv')
print(len(updated_data['Total_Marks']))

# Embeddings
code_embeddings = np.array(embeddings['code_embeddings'])  # Note: embedding for all rows
ast_embeddings = np.array(embeddings['ast_embeddings'])  # Note: embedding for all rows

# Ground truth scores
y = updated_data['Total_Marks'].values

# Initialize lists to store results
code_preds = []
ast_preds = []

# Define the cross-validation strategy (10-fold)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

def calculate_metrics(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)  # Root Mean Squared Error
    r2 = r2_score(y_true, y_pred)  # R-squared
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # Mean Absolute Percentage Error
    return rmse, r2, mape

# Code Embeddings - 10-fold CV
for train_index, test_index in kf.split(code_embeddings[0]):
    X_train, X_test = code_embeddings[train_index], code_embeddings[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xg_reg_code = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6)
    xg_reg_code.fit(X_train, y_train)

    y_pred_code = xg_reg_code.predict(X_test)
    rmse_code, r2_code, mape_code = calculate_metrics(y_test, y_pred_code)

    print(f"Code Embeddings - RMSE: {rmse_code:.4f}, R²: {r2_code:.4f}, MAPE: {mape_code:.4f}%")

    # Store predictions for later use
    code_preds.extend(y_pred_code)

# AST Embeddings - 10-fold CV
for train_index, test_index in kf.split(ast_embeddings):
    X_train, X_test = ast_embeddings[train_index], ast_embeddings[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xg_reg_ast = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6)
    xg_reg_ast.fit(X_train, y_train)

    y_pred_ast = xg_reg_ast.predict(X_test)
    rmse_ast, r2_ast, mape_ast = calculate_metrics(y_test, y_pred_ast)

    print(f"AST Embeddings - RMSE: {rmse_ast:.4f}, R²: {r2_ast:.4f}, MAPE: {mape_ast:.4f}%")

    # Store predictions for later use
    ast_preds.extend(y_pred_ast)

# After 10-fold CV, save the results back into the DataFrame
updated_data['Predicted_Code_Score'] = code_preds
updated_data['Predicted_AST_Score'] = ast_preds

# Save the updated DataFrame with predictions
updated_data.to_csv('updated_data_with_predictions.csv', index=False)
print("Predictions have been added to the DataFrame and saved.")


867


IndexError: index 1 is out of bounds for axis 0 with size 1

In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Load updated data
updated_data = pd.read_csv('updated_data.csv')

# Embeddings for both code and AST (assuming these are loaded as 3D arrays)
code_embeddings = np.array(embeddings['code_embeddings'][0])
ast_embeddings = np.array(embeddings['ast_embeddings'][0])

# Ground truth scores
y = updated_data['Total_Marks'].values

# Reshaping embeddings to 2D (flatten sequence dimension for XGBoost)
code_embeddings_flat = code_embeddings.reshape(code_embeddings.shape[0], -1)  # Flattening the sequence
ast_embeddings_flat = ast_embeddings.reshape(ast_embeddings.shape[0], -1)  # Flattening the sequence

# Initialize KFold cross-validation with 10 splits
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Prepare to store the predictions
predictions_code = []
predictions_ast = []
y_true_all = []  # To store all true values

# Define the evaluation metric function
def calculate_metrics(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)  # Root Mean Squared Error
    r2 = r2_score(y_true, y_pred)  # R-squared
    #mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # Mean Absolute Percentage Error
    return rmse, r2#, mape

# Perform 10-fold cross-validation for both embeddings (code and AST)
for fold, (train_index, test_index) in enumerate(kf.split(code_embeddings_flat)):
    print(f"Processing fold {fold + 1}...")

    # Splitting data into training and testing sets
    X_train_code, X_test_code = code_embeddings_flat[train_index], code_embeddings_flat[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Training XGBoost model for code embeddings
    xg_reg_code = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6)
    xg_reg_code.fit(X_train_code, y_train)

    # Predicting for code embeddings
    y_pred_code = xg_reg_code.predict(X_test_code)
    predictions_code.extend(y_pred_code)  # Collect predictions for the code model

    # Repeat the same for AST embeddings
    X_train_ast, X_test_ast = ast_embeddings_flat[train_index], ast_embeddings_flat[test_index]

    # Train XGBoost model for AST embeddings
    xg_reg_ast = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6)
    xg_reg_ast.fit(X_train_ast, y_train)

    # Predicting for AST embeddings
    y_pred_ast = xg_reg_ast.predict(X_test_ast)
    predictions_ast.extend(y_pred_ast)  # Collect predictions for the AST model

    # Collect true values for later evaluation
    y_true_all.extend(y_test)

# Now calculate metrics after all folds are completed
rmse_code, r2_code, mape_code = calculate_metrics(y_true_all, predictions_code)
rmse_ast, r2_ast, mape_ast = calculate_metrics(y_true_all, predictions_ast)

print(f"Final Evaluation - Code Embeddings: RMSE: {rmse_code:.4f}, R²: {r2_code:.4f}, MAPE: {mape_code:.4f}%")
print(f"Final Evaluation - AST Embeddings: RMSE: {rmse_ast:.4f}, R²: {r2_ast:.4f}, MAPE: {mape_ast:.4f}%")

# Save predictions as new columns in the dataframe
updated_data['predicted_code'] = predictions_code
updated_data['predicted_ast'] = predictions_ast

# Save the updated dataframe with predictions
updated_data.to_csv('updated_data_with_predictions.csv', index=False)


Processing fold 1...
Processing fold 2...
Processing fold 3...
Processing fold 4...
Processing fold 5...
Processing fold 6...
Processing fold 7...
Processing fold 8...
Processing fold 9...
Processing fold 10...




TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [47]:
# Define the evaluation metric function
def calculate_metrics(y_true, y_pred):
    # Filter out rows where true value is 0 to avoid MAPE going to infinity
    mask = y_true != 0
    y_true_filtered = y_true[mask]
    y_pred_filtered = y_pred[mask]

    rmse = mean_squared_error(y_true_filtered, y_pred_filtered, squared=False)  # Root Mean Squared Error
    r2 = r2_score(y_true_filtered, y_pred_filtered)  # R-squared
    mape = np.mean(np.abs((y_true_filtered - y_pred_filtered) / y_true_filtered)) * 100  # Mean Absolute Percentage Error
    return rmse, r2, mape

y_true_all = np.array(y_true_all)
predictions_code = np.array(predictions_code)
predictions_ast = np.array(predictions_ast)

# Now calculate metrics after all folds are completed
rmse_code, r2_code, mape_code = calculate_metrics(y_true_all, predictions_code)
rmse_ast, r2_ast, mape_ast = calculate_metrics(y_true_all, predictions_ast)

print(f"Final Evaluation - Code Embeddings: RMSE: {rmse_code:.4f}, R²: {r2_code:.4f}, MAPE: {mape_code:.4f}%")
print(f"Final Evaluation - AST Embeddings: RMSE: {rmse_ast:.4f}, R²: {r2_ast:.4f}, MAPE: {mape_ast:.4f}%")

Final Evaluation - Code Embeddings: RMSE: 1.8706, R²: 0.2625, MAPE: 33.8853%
Final Evaluation - AST Embeddings: RMSE: 2.0389, R²: 0.1239, MAPE: 37.5442%




In [44]:
len(y_true_all)

867

In [45]:
len(predictions_code)

867