In [1]:
import pandas as pd
from radon.metrics import mi_visit, mi_rank
from radon.complexity import cc_visit
from radon.raw import analyze

In [2]:
def compute_metrics(source: str):
    """Return MI, CC, LOC for given Python source code string."""
    if not isinstance(source, str) or not source.strip():
        return 0.0, 0, 0  # fallback for empty code
    
    try:
        # Maintainability Index
        mi_score = mi_visit(source, True)
        
        # Cyclomatic Complexity (take sum of all blocks)
        cc_blocks = cc_visit(source)
        cc_score = sum(block.complexity for block in cc_blocks)
        
        # LOC
        raw = analyze(source)
        loc = raw.loc
        
        return mi_score, cc_score, loc
    except Exception as e:
        # In case of syntax errors or non-Python files
        return 0.0, 0, 0

# Load your dataset

In [5]:
df = pd.read_csv("big_commits_with_rectifier.csv")
df.head()


Unnamed: 0,Hash,Message,Filename,Source Code (before),Source Code (current),Diff,LLM Inference,Rectifier
0,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,build/build_jax.sh,#!/bin/bash\nset -exv\n\n# For a build with CU...,#!/bin/bash\nset -exv\n\n# For a build with CU...,diff --git a/build/build_jax.sh b/build/build_...,Deadlock / livelock,Move TF_NCCL_VERSION assignment inside CUDA co...
1,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,examples/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/examples/BUILD b/examples/BUILD\n...,Exception handling / swallowed exceptions,Update dependencies for mnist_vae.py to use //...
2,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,jax/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/jax/BUILD b/jax/BUILD\nindex cddf...,Performance regression,Add scipy stats module to jax py_library sources
3,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,setup.py,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/setup.py b/setup.py\nindex 2b2ebc...,Documentation / comment mismatch,Add absl-py to install requirements in setup.py
4,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,tests/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/tests/BUILD b/tests/BUILD\nindex ...,Exception handling / swallowed exceptions,Remove 'tests/' prefix from test file paths an...


In [6]:

# Prepare columns
df["MI_Before"], df["CC_Before"], df["LOC_Before"] = zip(
    *df["Source Code (before)"].map(compute_metrics)
)
df["MI_After"], df["CC_After"], df["LOC_After"] = zip(
    *df["Source Code (current)"].map(compute_metrics)
)

# Compute changes
df["MI_Change"] = df["MI_After"] - df["MI_Before"]
df["CC_Change"] = df["CC_After"] - df["CC_Before"]
df["LOC_Change"] = df["LOC_After"] - df["LOC_Before"]

# Save updated dataset
output_csv = "big_commits_with_metrics.csv"
df.to_csv(output_csv, index=False)
print(f"✅ Done. Saved {output_csv}")

✅ Done. Saved big_commits_with_metrics.csv


In [7]:
df.head()

Unnamed: 0,Hash,Message,Filename,Source Code (before),Source Code (current),Diff,LLM Inference,Rectifier,MI_Before,CC_Before,LOC_Before,MI_After,CC_After,LOC_After,MI_Change,CC_Change,LOC_Change
0,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,build/build_jax.sh,#!/bin/bash\nset -exv\n\n# For a build with CU...,#!/bin/bash\nset -exv\n\n# For a build with CU...,diff --git a/build/build_jax.sh b/build/build_...,Deadlock / livelock,Move TF_NCCL_VERSION assignment inside CUDA co...,0.0,0,0,0.0,0,0,0.0,0,0
1,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,examples/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/examples/BUILD b/examples/BUILD\n...,Exception handling / swallowed exceptions,Update dependencies for mnist_vae.py to use //...,100.0,0,46,100.0,0,46,0.0,0,0
2,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,jax/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/jax/BUILD b/jax/BUILD\nindex cddf...,Performance regression,Add scipy stats module to jax py_library sources,100.0,0,65,100.0,0,66,0.0,0,1
3,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,setup.py,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/setup.py b/setup.py\nindex 2b2ebc...,Documentation / comment mismatch,Add absl-py to install requirements in setup.py,100.0,0,30,100.0,0,30,0.0,0,0
4,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,tests/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/tests/BUILD b/tests/BUILD\nindex ...,Exception handling / swallowed exceptions,Remove 'tests/' prefix from test file paths an...,100.0,0,94,100.0,0,94,0.0,0,0


In [9]:
import torch
from transformers import AutoTokenizer, AutoModel
from sacrebleu.metrics import BLEU
from sklearn.metrics.pairwise import cosine_similarity

# Load CodeBERT for embeddings
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [12]:
# BLEU scorer
bleu = BLEU()

def get_embedding(text: str):
    if not isinstance(text, str) or not text.strip():
        return torch.zeros(768)  # empty embedding
    
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
        # Mean pooling over token embeddings
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings

def semantic_similarity(code_before, code_after):
    emb1 = get_embedding(code_before)
    emb2 = get_embedding(code_after)
    if emb1.sum() == 0 or emb2.sum() == 0:
        return 0.0
    return cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]

def token_similarity(code_before, code_after):
    if not isinstance(code_before, str) or not isinstance(code_after, str):
        return 0.0
    if not code_before.strip() or not code_after.strip():
        return 0.0
    
    # sacrebleu expects strings, not lists of tokens
    score = bleu.sentence_score(code_after, [code_before]).score
    return score / 100.0  # normalize BLEU to [0,1]



In [13]:
# Load your dataset
df = pd.read_csv("big_commits_with_metrics.csv")

# Compute similarities
df["Semantic_Similarity"] = df.apply(
    lambda row: semantic_similarity(row["Source Code (before)"], row["Source Code (current)"]),
    axis=1
)

df["Token_Similarity"] = df.apply(
    lambda row: token_similarity(row["Source Code (before)"], row["Source Code (current)"]),
    axis=1
)

# Save updated dataset
output_csv = "big_commits_with_similarity.csv"
df.to_csv(output_csv, index=False)
print(f"✅ Done. Saved {output_csv}")

It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is recommended to enable `effective_order` for sentence-level BLEU.
It is 

✅ Done. Saved big_commits_with_similarity.csv


In [14]:
# Define thresholds
SEMANTIC_THRESHOLD = 0.80
TOKEN_THRESHOLD = 0.75

# Classification functions
def classify_semantic(score):
    return "Minor" if score >= SEMANTIC_THRESHOLD else "Major"

def classify_token(score):
    return "Minor" if score >= TOKEN_THRESHOLD else "Major"

# Apply classification
df["Semantic_class"] = df["Semantic_Similarity"].apply(classify_semantic)
df["Token_class"] = df["Token_Similarity"].apply(classify_token)

# Agreement check
df["Classes_Agree"] = df.apply(
    lambda row: "YES" if row["Semantic_class"] == row["Token_class"] else "NO",
    axis=1
)

# Save updated dataset
output_csv = "big_commits_with_classes.csv"
df.to_csv(output_csv, index=False)
print(f"✅ Done. Saved {output_csv}")

✅ Done. Saved big_commits_with_classes.csv


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Hash                   46 non-null     object 
 1   Message                46 non-null     object 
 2   Filename               46 non-null     object 
 3   Source Code (before)   46 non-null     object 
 4   Source Code (current)  45 non-null     object 
 5   Diff                   45 non-null     object 
 6   LLM Inference          46 non-null     object 
 7   Rectifier              46 non-null     object 
 8   MI_Before              46 non-null     float64
 9   CC_Before              46 non-null     int64  
 10  LOC_Before             46 non-null     int64  
 11  MI_After               46 non-null     float64
 12  CC_After               46 non-null     int64  
 13  LOC_After              46 non-null     int64  
 14  MI_Change              46 non-null     float64
 15  CC_Chang

In [16]:
df.head()

Unnamed: 0,Hash,Message,Filename,Source Code (before),Source Code (current),Diff,LLM Inference,Rectifier,MI_Before,CC_Before,...,CC_After,LOC_After,MI_Change,CC_Change,LOC_Change,Semantic_Similarity,Token_Similarity,Semantic_class,Token_class,Classes_Agree
0,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,build/build_jax.sh,#!/bin/bash\nset -exv\n\n# For a build with CU...,#!/bin/bash\nset -exv\n\n# For a build with CU...,diff --git a/build/build_jax.sh b/build/build_...,Deadlock / livelock,Move TF_NCCL_VERSION assignment inside CUDA co...,0.0,0,...,0,0,0.0,0,0,1.0,0.995092,Minor,Minor,YES
1,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,examples/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/examples/BUILD b/examples/BUILD\n...,Exception handling / swallowed exceptions,Update dependencies for mnist_vae.py to use //...,100.0,0,...,0,46,0.0,0,0,0.999887,0.966744,Minor,Minor,YES
2,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,jax/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/jax/BUILD b/jax/BUILD\nindex cddf...,Performance regression,Add scipy stats module to jax py_library sources,100.0,0,...,0,66,0.0,0,1,0.999785,0.976498,Minor,Minor,YES
3,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,setup.py,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/setup.py b/setup.py\nindex 2b2ebc...,Documentation / comment mismatch,Add absl-py to install requirements in setup.py,100.0,0,...,0,30,0.0,0,0,0.999937,0.985884,Minor,Minor,YES
4,50038c07c815b82d412af43996a122e33eecd385,Matthew Johnson: fix build file issues,tests/BUILD,# Copyright 2018 Google LLC\n#\n# Licensed und...,# Copyright 2018 Google LLC\n#\n# Licensed und...,diff --git a/tests/BUILD b/tests/BUILD\nindex ...,Exception handling / swallowed exceptions,Remove 'tests/' prefix from test file paths an...,100.0,0,...,0,94,0.0,0,0,0.999826,0.927255,Minor,Minor,YES
