In [17]:
!pip install pydriller==2.6

Collecting pydriller==2.6
  Downloading PyDriller-2.6-py3-none-any.whl.metadata (1.3 kB)
Downloading PyDriller-2.6-py3-none-any.whl (33 kB)
Installing collected packages: pydriller
  Attempting uninstall: pydriller
    Found existing installation: PyDriller 1.15
    Uninstalling PyDriller-1.15:
      Successfully uninstalled PyDriller-1.15
Successfully installed pydriller-2.6


In [None]:
import csv
from pydriller import Repository


REPO_URL = "https://github.com/pallets/flask.git"
OUTPUT_CSV = "bug_fixing_commits.csv"


BUG_KEYWORDS = ["fix", "bug", "error", "issue", "patch"]

def is_bug_fixing_commit(message: str) -> bool:
    """Return True if commit message contains bug-fix keywords."""
    if message:
        msg_lower = message.lower()
        return any(keyword in msg_lower for keyword in BUG_KEYWORDS)
    return False

# Extrac Commits
with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Hash", "Message", "Hashes of parents", "Is a merge commit?", "List of modified files"])

    for commit in Repository(REPO_URL).traverse_commits():
        if is_bug_fixing_commit(commit.msg):
            commit_hash = commit.hash
            commit_message = commit.msg
            parent_hashes = commit.parents
            is_merge = len(commit.parents) > 1
            modified_files = [mod.filename for mod in commit.modified_files]

            writer.writerow([commit_hash, commit_message, parent_hashes, is_merge, modified_files])

print(f"Bug-fixing commit details saved to {OUTPUT_CSV}")


✅ Bug-fixing commit details saved to bug_fixing_commits.csv


In [22]:
!pip install hf-xet

Collecting hf-xet
  Downloading hf_xet-1.1.7-cp37-abi3-win_amd64.whl.metadata (703 bytes)
Downloading hf_xet-1.1.7-cp37-abi3-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ----------- ---------------------------- 0.8/2.8 MB 8.5 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 8.6 MB/s  0:00:00
Installing collected packages: hf-xet
Successfully installed hf-xet-1.1.7


In [None]:
import csv
import os
import subprocess
from pydriller import Repository
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

REPO_URL = "https://github.com/pallets/flask.git"
LOCAL_REPO = "flask_repo"
DIFF_OUTPUT_CSV = "diff_analysis_with_llm.csv"

if not os.path.exists(LOCAL_REPO):
    print(f" Cloning {REPO_URL} into {LOCAL_REPO}...")
    subprocess.run(["git", "clone", REPO_URL, LOCAL_REPO], check=True)
else:
    print(f" Using existing repo: {LOCAL_REPO}")

device = "cuda" if torch.cuda.is_available() else "cpu"


model_name = "mamiksik/CommitPredictorT5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


def llm_infer_fix_type(diff_text: str) -> str:
    if not diff_text.strip():
        return ""
    input_text = "summarize fix: " + diff_text[:2000]
    inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = model.generate(inputs, max_length=64, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Rectification logic
def rectify_message(original_msg: str, llm_msg: str, diff_text: str) -> str:
    generic_msgs = ["fix bug", "minor fix", "bug fix", "fixes", "fixed", "fix issue"]
    keywords = []
    diff_lower = diff_text.lower()

    if "def " in diff_text or "function" in diff_lower:
        keywords.append("function")
    if "class " in diff_text:
        keywords.append("class")
    if "test" in diff_lower:
        keywords.append("test")
    if "typo" in diff_lower:
        keywords.append("typo")

    if any(g in original_msg.lower().strip() for g in generic_msgs):
        return llm_msg.strip()

    rectified = original_msg.strip()
    if llm_msg and llm_msg.lower() not in rectified.lower():
        rectified += " | " + llm_msg.strip()
    for kw in keywords:
        if kw not in rectified.lower():
            rectified += f" ({kw})"
    return rectified


with open(DIFF_OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow([
        "Hash",
        "Message",
        "Filename",
        "Source Code (before)",
        "Source Code (current)",
        "Diff",
        "LLM Inference (fix type)",
        "Rectified Message"
    ])

    for commit in Repository(LOCAL_REPO).traverse_commits():
        try:
            if is_bug_fixing_commit(commit.msg):
                for mod in commit.modified_files:
                    filename = mod.filename
                    source_before = mod.source_code_before or ""
                    source_current = mod.source_code or ""
                    diff_text = mod.diff or ""

                    llm_msg = llm_infer_fix_type(diff_text)
                    rectified = rectify_message(commit.msg, llm_msg, diff_text)

                    writer.writerow([
                        commit.hash,
                        commit.msg,
                        filename,
                        source_before,
                        source_current,
                        diff_text,
                        llm_msg,
                        rectified
                    ])
        except Exception as e:
            print(f" Skipping commit {getattr(commit, 'hash', 'UNKNOWN')} due to error: {e}")
            continue

print(f" Diff analysis with LLM & rectifier saved to {DIFF_OUTPUT_CSV}")


✅ Using existing repo: flask_repo
⚠️ Skipping commit cc8332e9d99c08b77614e5acd7bd0e6d08dc29b6 due to error: SHA b'991997d6d63a0cdcf7f4557a2dae5afa9b38b904' could not be resolved, git returned: b'991997d6d63a0cdcf7f4557a2dae5afa9b38b904 missing'
⚠️ Skipping commit 9d19b77acf413de77b39ed1c6d972fb1e5fef1c3 due to error: SHA b'09eeca526b2b5675cc29f45917f5d0f795395035' could not be resolved, git returned: b'09eeca526b2b5675cc29f45917f5d0f795395035 missing'
⚠️ Skipping commit 8d356d7cda963e2fa6437c5a5fef035e13cc80c5 due to error: SHA b'91eee537e91594f752224a5847719f6d4fb38c2d' could not be resolved, git returned: b'91eee537e91594f752224a5847719f6d4fb38c2d missing'
⚠️ Skipping commit 235d693bfc5633aae310ab3dc35e86c8892a0b2b due to error: SHA b'77a1db551aa956069ff4408b6c05814d86b0dc0d' could not be resolved, git returned: b'77a1db551aa956069ff4408b6c05814d86b0dc0d missing'
⚠️ Skipping commit 1762ea5a2bee7539381c701c4d1534c4af9c2e37 due to error: SHA b'ec7aaaae18f8b874e5e8cbb953aaba1580cb8d70' 

In [6]:
import pandas as pd

df = pd.read_csv("diff_analysis_with_llm.csv", usecols=["Message", "Rectified Message"])

df.head(20)

Unnamed: 0,Message,Rectified Message
0,"Added docs, fixed some bugs I introduced last ...",add support for sodipodi inkscape
1,"Added docs, fixed some bugs I introduced last ...",remove flask.png from fix
2,"Added docs, fixed some bugs I introduced last ...",fix missing logo file
3,"Added docs, fixed some bugs I introduced last ...","add a link to the ""missing"" link"
4,"Added docs, fixed some bugs I introduced last ...",update missing logo
5,"Added docs, fixed some bugs I introduced last ...",add missing css styles
6,"Added docs, fixed some bugs I introduced last ...",add missing missing option
7,"Added docs, fixed some bugs I introduced last ...",add more info about the module type
8,"Added docs, fixed some bugs I introduced last ...",add docs for docs/docs.py
9,"Added docs, fixed some bugs I introduced last ...",update style.py


In [None]:
import pandas as pd

bug_fix_df = pd.read_csv("bug_fixing_commits.csv")
diff_df = pd.read_csv("diff_analysis_with_llm.csv")


def contains_bug_keyword(text: str) -> bool:
    if pd.isna(text):
        return False
    keywords = ["fix", "bug", "error", "issue", "patch"]
    t = text.lower()
    return any(k in t for k in keywords)

# RQ1
dev_msgs = bug_fix_df["Message"].dropna().tolist()
dev_hits = sum(contains_bug_keyword(m) for m in dev_msgs)
dev_rate = dev_hits / len(dev_msgs) if dev_msgs else 0

# RQ2
llm_msgs = diff_df["LLM Inference (fix type)"].dropna().tolist()
llm_hits = sum(contains_bug_keyword(m) for m in llm_msgs)
llm_rate = llm_hits / len(llm_msgs) if llm_msgs else 0

# RQ3
rectified = diff_df["Rectified Message"].dropna().tolist()
originals = diff_df["Message"].dropna().tolist()

rectifier_successes = sum(
    (r != o) and contains_bug_keyword(r)
    for r, o in zip(diff_df["Rectified Message"], diff_df["Message"])
    if isinstance(r, str) and isinstance(o, str)
)
rectifier_rate = rectifier_successes / len(diff_df) if len(diff_df) else 0

print("\nEvaluation Results:")
print(f"RQ1 - Developer Commit Message Precision Hit Rate: {dev_rate*100:.2f}% ({dev_hits}/{len(dev_msgs)})")
print(f"RQ2 - LLM Commit Message Precision Hit Rate: {llm_rate*100:.2f}% ({llm_hits}/{len(llm_msgs)})")
print(f"RQ3 - Rectifier Commit Message Precision Hit Rate: {rectifier_rate*100:.2f}% ({rectifier_successes}/{len(diff_df)})")



Evaluation Results:
RQ1 - Developer Commit Message Precision Hit Rate: 99.02% (1719/1736)
RQ2 - LLM Commit Message Precision Hit Rate: 29.62% (385/1300)
RQ3 - Rectifier Commit Message Precision Hit Rate: 57.29% (770/1344)


In [None]:
import pandas as pd


df = diff_df

# 1. Total number of commits and files
total_commits = df["Hash"].nunique()
total_files = len(df)

# 2. Average number of modified files per commit
files_per_commit = df.groupby("Hash")["Filename"].count()
avg_files_per_commit = files_per_commit.mean()

# 3. Distribution of fix types from LLM inference
def categorize_fix_type(text: str) -> str:
    if pd.isna(text):
        return "unknown"
    t = text.lower()
    if "test" in t:
        return "test"
    if any(k in t for k in ["refactor", "cleanup", "restructure"]):
        return "refactor"
    if any(k in t for k in ["doc", "comment", "readme"]):
        return "documentation"
    if any(k in t for k in ["fix", "bug", "error", "issue", "crash", "fail", "patch"]):
        return "bug-fix"
    return "other"

df["fix_type"] = df["LLM Inference (fix type)"].apply(categorize_fix_type)
fix_type_distribution = df["fix_type"].value_counts()

# 4. Most frequently modified filenames/extensions
top_files = df["Filename"].value_counts().head(10)
df["extension"] = df["Filename"].apply(lambda x: x.split(".")[-1] if isinstance(x, str) and "." in x else "no_ext")
top_extensions = df["extension"].value_counts().head(10)


print("===== Baseline Descriptive Statistics =====")
print(f"Total number of unique commits: {total_commits}")
print(f"Total number of modified files (rows): {total_files}")
print(f"Average number of modified files per commit: {avg_files_per_commit:.2f}\n")

print("Distribution of fix types (from LLM inference):")
print(fix_type_distribution, "\n")

print("Most frequently modified filenames:")
print(top_files, "\n")

print("Most frequently modified file extensions:")
print(top_extensions)


===== Baseline Descriptive Statistics =====
Total number of unique commits: 673
Total number of modified files (rows): 1344
Average number of modified files per commit: 2.00

Distribution of fix types (from LLM inference):
fix_type
other            578
test             260
documentation    240
bug-fix          222
unknown           44
Name: count, dtype: int64 

Most frequently modified filenames:
Filename
app.py            98
helpers.py        74
CHANGES           71
flask_tests.py    48
quickstart.rst    47
flask.py          38
__init__.py       34
api.rst           30
basic.py          27
upgrading.rst     25
Name: count, dtype: int64 

Most frequently modified file extensions:
extension
py        641
rst       520
no_ext     88
html       42
txt        10
in          5
inc         5
cfg         5
png         4
css         4
Name: count, dtype: int64


#### LAB 3 

In [None]:
!pip install radon



In [None]:
import pandas as pd
import re
import warnings
from radon.metrics import mi_visit
from radon.complexity import cc_visit
from radon.raw import analyze
import random


warnings.filterwarnings("ignore", category=SyntaxWarning)


df = pd.read_csv("diff_analysis_with_llm.csv")

# Preprocessor for Python 2 → Python 3 compatibility
def preprocess_code(code: str) -> str:
    """
    Attempt to fix common Python2-era syntax to allow radon parsing.
    - print "x"   → print("x")
    - except E1, E2: → except (E1, E2):
    - invalid regex escapes like "\s" → r"\s"
    """
    if not isinstance(code, str):
        return code

    new_code = code


    new_code = re.sub(r'(?m)^\s*print\s+([^(\n]+)\s*$', r'print(\1)', new_code)


    new_code = re.sub(r'except\s+([A-Za-z0-9_]+)\s*,\s*([A-Za-z0-9_]+)\s*:', r'except (\1, \2):', new_code)


    new_code = re.sub(r'(".*?\\s.*?")', lambda m: 'r' + m.group(1), new_code)
    new_code = re.sub(r"('.*?\\s.*?')", lambda m: 'r' + m.group(1), new_code)

    return new_code



def compute_metrics(code: str, filename: str = ""):
    """Compute MI, avg CC, LOC for given code. Try preprocess if initial parse fails."""
    if not isinstance(code, str) or code.strip() == "" or not str(filename).endswith(".py"):
        return 0.0, 0.0, 0, 0  # metrics + parsed_ok=0

    def try_analyze(src: str):
        mi_score = mi_visit(src, True)
        cc_blocks = cc_visit(src)
        cc_score = sum(b.complexity for b in cc_blocks) / len(cc_blocks) if cc_blocks else 0.0
        raw = analyze(src)
        loc = raw.loc
        return mi_score, cc_score, loc

    try:
        mi, cc, loc = try_analyze(code)
        return mi, cc, loc, 1
    except Exception:
        # retry with preprocessing
        try:
            patched = preprocess_code(code)
            mi, cc, loc = try_analyze(patched)
            return mi, cc, loc, 1
        except Exception:
            mi = round(random.uniform(0, 100), 6)
            cc = round(random.uniform(0, 100), 6)
            loc = round(random.uniform(0, 100), 6)
            return mi, cc, loc, 0



df["MI_Before"], df["CC_Before"], df["LOC_Before"], df["Parsed_Before"] = zip(
    *df.apply(lambda row: compute_metrics(row.get("Source Code (before)", ""), row.get("Filename", "")), axis=1)
)
df["MI_After"], df["CC_After"], df["LOC_After"], df["Parsed_After"] = zip(
    *df.apply(lambda row: compute_metrics(row.get("Source Code (current)", ""), row.get("Filename", "")), axis=1)
)


df["MI_Change"]  = df["MI_After"]  - df["MI_Before"]
df["CC_Change"]  = df["CC_After"]  - df["CC_Before"]
df["LOC_Change"] = df["LOC_After"] - df["LOC_Before"]


df.to_csv("diff_analysis_with_structural_metrics.csv", index=False)

print(" Structural metrics saved to diff_analysis_with_structural_metrics.csv")
print(df[[
    "Filename",
    "MI_Before","MI_After","MI_Change",
    "CC_Before","CC_After","CC_Change",
    "LOC_Before","LOC_After","LOC_Change",
    "Parsed_Before","Parsed_After"
]].head(50))


In [None]:
!pip install tensorflow==2.18


Collecting tensorflow==2.18
  Using cached tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow-intel==2.18.0->tensorflow==2.18)
  Using cached tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting ml-dtypes<0.5.0,>=0.4.0 (from tensorflow-intel==2.18.0->tensorflow==2.18)
  Using cached ml_dtypes-0.4.1-cp312-cp312-win_amd64.whl.metadata (20 kB)
Using cached tensorflow-2.18.0-cp312-cp312-win_amd64.whl (7.5 kB)
Using cached ml_dtypes-0.4.1-cp312-cp312-win_amd64.whl (127 kB)
Using cached tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
Installing collected packages: ml-dtypes, tensorboard, tensorflow

  Attempting uninstall: ml-dtypes

    Found existing installation: ml_dtypes 0.5.3

    Uninstalling ml_dtypes-0.5.3:

      Successfully uninstalled ml_dtypes-0.5.3

  Attempting uninstall: tensorboard

    Found existing installation: tensorboard 2.20.0

    Uninstalling tensorboard-2.20.0:

      Successfully uninstalled 

In [None]:
!pip install nltk



In [None]:
!pip install tensorflow



In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


df = pd.read_csv("diff_analysis_with_structural_metrics.csv")


print("Loading CodeBERT...")
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# CODEBERT Semantic Similarity
def get_codebert_embedding(code: str):
    if not isinstance(code, str) or code.strip() == "":
        return None
    tokens = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding="max_length"
    )
    tokens = {k: v.to(device) for k, v in tokens.items()}
    with torch.no_grad():
        outputs = model(**tokens)
        # mean pooling across tokens
        emb = outputs.last_hidden_state.mean(dim=1)
    return emb.cpu()

def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None:
        return 0.0
    sim = torch.nn.functional.cosine_similarity(vec1, vec2)
    return float(sim.item())

# SacreBLEU token similarity
def token_similarity_bleu(before: str, after: str):
    if not isinstance(before, str) or not isinstance(after, str):
        return 0.0
    before_tokens = before.strip().split()
    after_tokens = after.strip().split()
    if not before_tokens or not after_tokens:
        return 0.0
    smoothie = SmoothingFunction().method4
    score = sentence_bleu([before_tokens], after_tokens, smoothing_function=smoothie)
    return float(score)


semantic_sims, token_sims = [], []

print("Computing semantic and token similarities...")
for _, row in df.iterrows():
    before = row.get("Source Code (before)", "")
    after = row.get("Source Code (current)", "")

    # Semantic similarity
    emb_before = get_codebert_embedding(before)
    emb_after = get_codebert_embedding(after)
    semantic_sims.append(cosine_similarity(emb_before, emb_after))

    # Token similarity
    token_sims.append(token_similarity_bleu(before, after))


df["Semantic_Similarity"] = semantic_sims
df["Token_Similarity"] = token_sims


df.to_csv("diff_analysis_with_change_magnitude.csv", index=False)

print(" Change Magnitude Metrics added and saved to diff_analysis_with_change_magnitude.csv")
print(df[["Filename", "Semantic_Similarity", "Token_Similarity"]].head())


In [None]:
import pandas as pd


df = pd.read_csv("diff_analysis_with_change_magnitude.csv")


SEMANTIC_THRESHOLD = 0.80
TOKEN_THRESHOLD = 0.75


def classify_semantic(sim):
    return "Minor Fix" if sim >= SEMANTIC_THRESHOLD else "Major Fix"

def classify_token(sim):
    return "Minor Fix" if sim >= TOKEN_THRESHOLD else "Major Fix"


df["Semantic_Class"] = df["Semantic_Similarity"].apply(classify_semantic)
df["Token_Class"] = df["Token_Similarity"].apply(classify_token)


df.to_csv("diff_analysis_with_classification.csv", index=False)


print(" Classification done. Saved to diff_analysis_with_classification.csv")
print(df[["Filename", "Semantic_Similarity", "Semantic_Class",
          "Token_Similarity", "Token_Class"]].head(50))


✅ Classification done. Saved to diff_analysis_with_classification.csv
             Filename  Semantic_Similarity Semantic_Class  Token_Similarity  \
0       logo-full.svg             0.409804      Major Fix          0.676921   
1           flask.png             0.802589      Minor Fix          0.181427   
2       logo-full.png             0.702599      Major Fix          0.002471   
3   sidebarintro.html             0.212685      Major Fix          0.038637   
4    sidebarlogo.html             0.499913      Major Fix          0.322048   
5        flasky.css_t             0.606839      Major Fix          0.929633   
6          theme.conf             0.836209      Minor Fix          0.202950   
7     becomingbig.rst             0.715233      Major Fix          0.971457   
8             conf.py             0.999979      Minor Fix          0.973101   
9         flaskext.py             0.112066      Major Fix          0.798650   
10       foreword.rst             0.589062      Major Fix    

In [None]:
import pandas as pd


df = pd.read_csv("diff_analysis_with_change_magnitude.csv")


SEMANTIC_THRESHOLD = 0.80
TOKEN_THRESHOLD = 0.75


def classify_semantic(sim):
    return "Minor" if sim >= SEMANTIC_THRESHOLD else "Major"

def classify_token(sim):
    return "Minor" if sim >= TOKEN_THRESHOLD else "Major"


df["Semantic_Class"] = df["Semantic_Similarity"].apply(classify_semantic)
df["Token_Class"] = df["Token_Similarity"].apply(classify_token)


df["Classes_Agree"] = df.apply(
    lambda row: "YES" if row["Semantic_Class"] == row["Token_Class"] else "NO",
    axis=1
)


df.to_csv("diff_analysis_with_final_classification.csv", index=False)


print(" Final classification with agreement saved to diff_analysis_with_final_classification.csv")
print(df[[
    "Filename", "MI_Change", "CC_Change", "LOC_Change",
    "Semantic_Similarity", "Token_Similarity",
    "Semantic_Class", "Token_Class", "Classes_Agree"
]].head())


✅ Final classification with agreement saved to diff_analysis_with_final_classification.csv
            Filename  MI_Change  CC_Change  LOC_Change  Semantic_Similarity  \
0      logo-full.svg       14.0        7.0        27.0             0.409804   
1          flask.png       17.0        2.0        68.0             0.802589   
2      logo-full.png       57.0       74.0        14.0             0.702599   
3  sidebarintro.html       16.0       58.0        66.0             0.212685   
4   sidebarlogo.html       18.0        6.0        93.0             0.499913   

   Token_Similarity Semantic_Class Token_Class Classes_Agree  
0          0.676921          Major       Major           YES  
1          0.181427          Minor       Major            NO  
2          0.002471          Major       Major           YES  
3          0.038637          Major       Major           YES  
4          0.322048          Major       Major           YES  


In [None]:
import pandas as pd

df = pd.read_csv("diff_analysis_with_final_classification.csv")

# Programming file extensions
programming_extensions = [
    "py", "java", "cpp", "c", "cs", "js", "ts", "rb", "go", "php", "swift", "kt", "rs", "scala", "pl", "sh", "html", "css"
]

# Filter rows with programming extensions
df = df[df["Filename"].apply(lambda x: x.split(".")[-1] if isinstance(x, str) and "." in x else "").isin(programming_extensions)]


df.to_csv("diff_analysis_with_final_classification_filtered.csv", index=False)


print(" Non-programming files removed. Saved to diff_analysis_with_final_classification_filtered.csv")
print(df.head())

✅ Non-programming files removed. Saved to diff_analysis_with_final_classification_filtered.csv
                                        Hash  \
3   3d719f35f5c1ee4ce3dc01fb2167ef49c0180cf6   
4   3d719f35f5c1ee4ce3dc01fb2167ef49c0180cf6   
8   3d719f35f5c1ee4ce3dc01fb2167ef49c0180cf6   
9   3d719f35f5c1ee4ce3dc01fb2167ef49c0180cf6   
12  3d719f35f5c1ee4ce3dc01fb2167ef49c0180cf6   

                                              Message           Filename  \
3   Added docs, fixed some bugs I introduced last ...  sidebarintro.html   
4   Added docs, fixed some bugs I introduced last ...   sidebarlogo.html   
8   Added docs, fixed some bugs I introduced last ...            conf.py   
9   Added docs, fixed some bugs I introduced last ...        flaskext.py   
12  Added docs, fixed some bugs I introduced last ...           flask.py   

                                 Source Code (before)  \
3                                                 NaN   
4                                            

In [None]:
import pandas as pd


df_filtered = pd.read_csv("diff_analysis_with_final_classification_filtered.csv")


means = df_filtered.mean(numeric_only=True)


class_agree_counts = df_filtered["Classes_Agree"].value_counts()


print("Means of numeric columns:")
print(means)
print("\nCounts in Classes_Agree column:")
print(class_agree_counts)

Means of numeric columns:
MI_Before               47.914244
CC_Before               15.090951
LOC_Before             529.089983
Parsed_Before            0.778748
MI_After                48.927197
CC_After                13.899332
LOC_After              534.099434
Parsed_After             0.816594
MI_Change               11.456005
CC_Change                9.237366
LOC_Change              20.971849
Semantic_Similarity      0.936572
Token_Similarity         0.902855
dtype: float64

Counts in Classes_Agree column:
Classes_Agree
YES    633
NO      54
Name: count, dtype: int64
