In [None]:
import pandas as pd
import re
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from rapidfuzz import fuzz
import os
import numpy as np

In [None]:
# Load spaCy's small English pipeline once, at notebook level, so it is not
# re-loaded per row. classify_response uses this object to compute
# nlp(response).similarity(nlp(correct_answer)).
# NOTE(review): spaCy's "sm" pipelines presumably ship without static word
# vectors, which would make Doc.similarity a weak signal here -- confirm, and
# consider "en_core_web_md" if the semantic component matters.
nlp = spacy.load("en_core_web_sm")

In [None]:
# ---------- Similarity Functions ----------
def jaccard_similarity(a, b):
    """Return the Jaccard overlap of the whitespace-separated tokens of two strings.

    Args:
        a: First string.
        b: Second string.

    Returns:
        ``len(intersection) / len(union)`` of the two token sets, or the
        integer 0 when either string contributes no tokens.
    """
    tokens_a = set(a.split())
    tokens_b = set(b.split())

    # Both sides need at least one token; otherwise the overlap is defined as 0.
    if tokens_a and tokens_b:
        shared = tokens_a.intersection(tokens_b)
        combined = tokens_a.union(tokens_b)
        return len(shared) / len(combined)
    return 0

def cosine_sim(a, b):
    """Return the bag-of-words cosine similarity between two strings.

    A CountVectorizer with default settings is fitted on the two strings and
    the cosine similarity of their count vectors is returned.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The cosine similarity of the two count vectors, or 0.0 when no
        vocabulary can be built from the inputs.
    """
    try:
        vectorizer = CountVectorizer().fit([a, b])
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when neither
        # string yields a token -- its default token pattern only accepts
        # tokens of two or more word characters, so inputs such as "5" / "7"
        # or "" / "?" would otherwise crash the caller. With no shared
        # vocabulary there is nothing to compare, so report no similarity.
        return 0.0
    vectors = vectorizer.transform([a, b])
    # cosine_similarity returns the full 2x2 matrix; [0][1] is a-vs-b.
    return cosine_similarity(vectors)[0][1]

In [None]:
# ---------- Main Rule-Based Scoring ----------
def classify_response(response, correct_answer, high_threshold, low_threshold):
    """Score a response against the reference answer and bucket it by threshold.

    Both inputs are stringified, stripped and lower-cased, then compared with
    five signals that are blended into one weighted score: rapidfuzz
    token-sort ratio (0.4), token Jaccard overlap (0.2), spaCy document
    similarity (0.2), bag-of-words cosine similarity (0.15) and an exact
    match of the digit sequences found in each string (0.05).

    Args:
        response: The answer to grade. None / NaN is treated as no response.
        correct_answer: The reference answer.
        high_threshold: The score must be strictly greater than this for 1.
        low_threshold: The score must be strictly greater than this for 0.

    Returns:
        1 if the score exceeds ``high_threshold``; 0 if it exceeds only
        ``low_threshold``; -1 otherwise, and also -1 when the response is
        missing, empty, or one of the "I don't know" phrases.
    """
    # Blank CSV cells are read by pandas as NaN, and str(NaN) is "nan", which
    # would otherwise be scored as if it were a real answer.
    if pd.api.types.is_scalar(response) and pd.isna(response):
        return -1

    response = str(response).strip().lower()
    correct_answer = str(correct_answer).strip().lower()

    if response == "" or response in ("idk", "i don't know", "i dunno"):
        return -1

    fuzzy_ratio = fuzz.token_sort_ratio(response, correct_answer) / 100
    jaccard = jaccard_similarity(response, correct_answer)

    try:
        cosine = cosine_sim(response, correct_answer)
    except ValueError:
        # The vectorizer cannot build a vocabulary when neither string has a
        # multi-character token (e.g. "5" vs "7"); count that as no overlap
        # instead of aborting the whole run.
        cosine = 0

    # Best-effort: any failure inside spaCy contributes 0 to the score rather
    # than aborting the classification.
    try:
        semantic = nlp(response).similarity(nlp(correct_answer))
    except Exception:
        semantic = 0

    # The digit runs must match exactly, in order, and the response must
    # contain at least one number for this signal to count.
    resp_nums = re.findall(r"\d+", response)
    corr_nums = re.findall(r"\d+", correct_answer)
    num_match = 1 if resp_nums and resp_nums == corr_nums else 0

    # Weighted total score
    total_score = (
        0.4 * fuzzy_ratio +
        0.2 * jaccard +
        0.2 * semantic +
        0.15 * cosine +
        0.05 * num_match
    )

    # Threshold-based classification (both comparisons are strict)
    if total_score > high_threshold:
        return 1
    if total_score > low_threshold:
        return 0
    return -1


In [None]:
# ---------- Main Loop ----------
df = pd.read_csv("train_cleaned.csv", encoding="latin1")
results = []

# Whether ground-truth labels exist cannot change during the sweep, so check
# once. Without them every combination is recorded with accuracy 0.
has_labels = "label" in df.columns

# Sweep high thresholds 0.60..0.65 and, for each, every low threshold from
# 0.55 up to (but excluding) the high threshold, in steps of 0.01. Iterating
# over integer hundredths avoids the float-step drift np.arange is prone to.
# NOTE(review): the similarity score itself does not depend on the
# thresholds, yet every row is re-scored for each pair; exposing the raw
# score from classify_response would let it be computed once.
for high_pct in range(60, 66):
    high_threshold = high_pct / 100
    for low_pct in range(55, high_pct):
        low_threshold = low_pct / 100

        predicted = df.apply(
            lambda row: classify_response(row["Response"], row["CorrectAnswer"], high_threshold, low_threshold),
            axis=1
        )

        # Evaluate accuracy as the percentage of rows whose prediction
        # matches the label column.
        if has_labels:
            correct_predictions = (predicted == df["label"]).sum()
            accuracy = correct_predictions / len(df) * 100
        else:
            accuracy = 0

        print(f"Tested: high={high_threshold}, low={low_threshold} → Accuracy={accuracy:.2f}%")

        results.append({
            "result_1_limit": high_threshold,
            "result_0_limit": low_threshold,
            "accuracy": accuracy
        })

In [None]:
# ---------- Save Final Summary ----------
# One row per tested (high, low) threshold pair with its accuracy. The path
# is held in a variable so the confirmation message cannot drift from the
# file that is actually written.
results_path = "threshold_results_train.csv"
results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)
print("\n✅ All threshold combinations tested.")
print(f"📄 Results saved to '{results_path}'.")

In [None]:
# ---------- Print Best Thresholds ----------
# idxmax returns the first row holding the maximum, so when several threshold
# pairs tie on accuracy the earliest one tested is reported.
best = results_df.loc[results_df['accuracy'].idxmax()]
print("\n🏆 Best thresholds found:")
print(f"Result=1 limit: {best['result_1_limit']}")
print(f"Result=0 limit: {best['result_0_limit']}")
print(f"Accuracy: {best['accuracy']:.2f}%")