In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import numpy as np


In [None]:
# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [None]:
# Load the manually tagged responses; the file is read as ISO-8859-1 (Latin-1).
df_manual = pd.read_csv("processed/sara.csv", encoding="ISO-8859-1")
# One row per column: summary statistics for every column, numeric or not.
df_manual.describe(include="all").transpose()

In [None]:
# Load the AI-processed survey data (default encoding).
df_ai = pd.read_csv("processed/processed_survey_data.csv")
# One row per column: summary statistics for every column, numeric or not.
df_ai.describe(include="all").transpose()

In [None]:
# Trim leading/trailing whitespace from the headers of both frames.
df_ai.columns = df_ai.columns.str.strip()
df_manual.columns = df_manual.columns.str.strip()

# Align the AI frame's headers with the names used in tag_columns below.
# Only the first four entries actually change a name; the remaining
# identity entries are no-ops kept as an explicit list of the tag columns.
df_ai.rename(
    {
        "Respondent ID": "respondent_id",
        "Policies & Administration": "Policies/ Administration",
        "Culture & Virtues": "Culture/ Virtues",
        "Extra-curriculars & Sports": "Extra-curriculars/ Sports",
        "Teachers": "Teachers",
        "Concern": "Concern",
        "Communication": "Communication",
        "Good Outcomes": "Good Outcomes",
        "Facilities": "Facilities",
        "Curriculum": "Curriculum",
    },
    axis="columns",
    inplace=True,
)

# Join the two frames on respondent_id (pandas' default inner join). Columns
# present in both frames get a "_manual" / "_ai" suffix.
merged = pd.merge(
    df_manual,
    df_ai,
    on="respondent_id",
    suffixes=("_manual", "_ai"),
)

# Tag columns compared between the two sources; the order drives every
# per-tag loop further down.
tag_columns = [
    "Concern",
    "Curriculum",
    "Good Outcomes",
    "Policies/ Administration",
    "Teachers",
    "Culture/ Virtues",
    "Communication",
    "Community",
    "Extra-curriculars/ Sports",
    "Facilities",
]

# Find mismatches: collect one record per (respondent, tag) pair where the
# manual flag (cell equals "Yes") differs from the AI value.
mismatch_fields = ["respondent_id", "tag", "manual_value", "ai_value", "response"]
mismatches = []
for tag in tag_columns:
    manual_flag = merged[f"{tag}_manual"] == "Yes"
    mismatch_rows = merged[manual_flag != merged[f"{tag}_ai"]]
    for _, row in mismatch_rows.iterrows():
        mismatches.append({
            "respondent_id": row["respondent_id"],
            "tag": tag,
            "manual_value": row[f"{tag}_manual"] == "Yes",
            "ai_value": row[f"{tag}_ai"],
            # Falls back to "N/A" when the merged frame has no "response" column.
            "response": row["response"] if "response" in row else "N/A"
        })

# Convert to DataFrame and display summary. The explicit column list keeps the
# frame well-formed when there are no mismatches at all; without it an empty
# list yields a column-less frame and the ["tag"] lookup raises KeyError.
mismatch_df = pd.DataFrame(mismatches, columns=mismatch_fields)
print("Tag Mismatch Counts:")
print(mismatch_df["tag"].value_counts())
mismatch_df

In [None]:
# Display the full mismatch table (one row per respondent/tag disagreement).
mismatch_df

In [None]:
# Collapse the manual frame to one row per respondent holding the list of tag
# names whose cell is non-null.
# NOTE(review): the mismatch cell treats a manual tag as set only when the
# cell equals "Yes", whereas this cell counts any non-null value. The two
# agree only if the manual columns hold nothing but "Yes"/NaN - confirm.
manual_flat = (
    df_manual
    .melt(
        id_vars=["respondent_id"],
        value_vars=tag_columns,
        var_name="tag",
        value_name="manual_value",
    )
    .dropna(subset=["manual_value"])
    .groupby("respondent_id")["tag"]
    .apply(list)
    .reset_index()
    .rename(columns={"tag": "manual_tags"})
)
manual_flat.head()

In [None]:
# Collapse the AI frame to one row per respondent holding the list of tag
# names whose value is truthy. The value column is used directly as a boolean
# mask, so it is presumably a boolean column - confirm.
ai_flat = (
    df_ai
    .melt(
        id_vars=["respondent_id"],
        value_vars=tag_columns,
        var_name="tag",
        value_name="ai_value",
    )
    .loc[lambda long_form: long_form["ai_value"]]
    .groupby("respondent_id")["tag"]
    .apply(list)
    .reset_index()
    .rename(columns={"tag": "ai_tags"})
)
ai_flat.head()

In [None]:
# Columns carried into the comparison table: the id, both tag sets, the
# agreement counts, and the free-text survey answers.
columns_to_keep = [
    "respondent_id", "manual_tags", "ai_tags", "common_tags", "different_tags",
    "(Grammar) What makes GVCA a good choice for you and your family?",
    "(Middle) What makes GVCA a good choice for you and your family?",
    "(Upper) What makes GVCA a good choice for you and your family?",
    "(Generic) What makes GVCA a good choice for you and your family?",
    "(Grammar) Please provide us with examples of how GVCA can better serve you and your family.",
    "(Middle) Please provide us with examples of how GVCA can better serve you and your family.",
    "(Upper) Please provide us with examples of how GVCA can better serve you and your family.",
    "(Generic) Please provide us with examples of how GVCA can better serve you and your family."
]

# Outer join keeps respondents tagged by only one of the two sources; the
# side that has no tags comes back as NaN.
# NOTE(review): respondents with no tags in either source appear in neither
# manual_flat nor ai_flat, so they are absent from this table as well.
merged = manual_flat.merge(ai_flat, on="respondent_id", how="outer")

# Turn each tag list into a set, mapping a missing list to the empty set.
# (Filling NaN with the string "[]" and calling set() on it produced
# {'[', ']'}, which added two phantom entries to every such respondent's
# different_tags count.)
merged["manual_tags"] = merged["manual_tags"].apply(
    lambda tags: set(tags) if isinstance(tags, list) else set()
)
merged["ai_tags"] = merged["ai_tags"].apply(
    lambda tags: set(tags) if isinstance(tags, list) else set()
)

# Intersection size = tags both sources agree on; symmetric-difference size =
# tags assigned by exactly one source.
merged["common_tags"] = [
    len(manual & ai) for manual, ai in zip(merged["manual_tags"], merged["ai_tags"])
]
merged["different_tags"] = [
    len(manual ^ ai) for manual, ai in zip(merged["manual_tags"], merged["ai_tags"])
]

# Pull in the survey answer columns from the AI frame.
merged = merged.merge(df_ai, on="respondent_id", how="left")

# Explicit copy: a later cell adds columns to `output`, and writing into a
# column slice of `merged` would trigger SettingWithCopyWarning.
output = merged[columns_to_keep].copy()
output

In [None]:
# Number of respondents for each count of tags assigned by only one source.
output.loc[:, "different_tags"].value_counts()

In [None]:
# Number of respondents for each count of tags shared by both sources.
output.loc[:, "common_tags"].value_counts()

In [None]:
# Overlaid histograms of, per respondent, how many tags the two sources share
# and how many they disagree on.
plt.figure(figsize=(10, 5))
plt.hist(output['common_tags'], bins=7, alpha=0.5, label="N Tags Common", color='blue')
plt.hist(output['different_tags'], bins=7, alpha=0.5, label="N Tags Different", color='red')
# The x-axis carries tag counts per respondent (it was mislabelled "Date").
plt.xlabel("Number of tags")
plt.ylabel("Count")
plt.legend()
plt.title("Distribution of Tags in Common / Different")
plt.xticks(rotation=45)
# No extension is given, so matplotlib uses its default savefig format.
plt.savefig(
    "artifacts/AI-Manual Tagging Comparison",
    transparent=True,
)
plt.show()

In [None]:
# Write the comparison table to disk without the row index.
output.to_csv(path_or_buf="tag_comparison.csv", index=False)

In [None]:
# Convert sets back to lists for proper binary encoding. `assign` returns a
# new frame, so nothing is written into a slice of `merged` (the previous
# `.loc` writes could raise SettingWithCopyWarning) and the cell can be
# re-run safely.
output = output.assign(
    manual_tags=output["manual_tags"].apply(list),
    ai_tags=output["ai_tags"].apply(list),
)

# Create binary columns for each tag: True when the tag is in the
# respondent's manual / AI tag list.
indicator_columns = {}
for tag in tag_columns:
    indicator_columns[f"manual_{tag}"] = [tag in tags for tags in output["manual_tags"]]
    indicator_columns[f"ai_{tag}"] = [tag in tags for tags in output["ai_tags"]]
output = output.assign(**indicator_columns)

# Compute correlation between AI and manual labels (Series.corr defaults to
# Pearson). A tag that is constant in either source yields NaN.
correlation_results = {
    tag: output[f"manual_{tag}"].corr(output[f"ai_{tag}"]) for tag in tag_columns
}

# Convert correlation results to a DataFrame, strongest agreement first.
correlation_df = pd.DataFrame.from_dict(correlation_results, orient='index', columns=['correlation']).reset_index()
correlation_df.rename(columns={'index': 'tag'}, inplace=True)
correlation_df = correlation_df.sort_values(by='correlation', ascending=False)
correlation_df

In [None]:
# Compute confusion matrices for each tag, treating the manual labels as
# ground truth and the AI labels as predictions.
confusion_matrices = {}
for tag in tag_columns:
    y_true = output[f"manual_{tag}"].astype(int)
    y_pred = output[f"ai_{tag}"].astype(int)
    # labels=[1, 0] puts the positive class first, so the matrix reads
    # [[TP, FN], [FP, TN]] (rows = manual, columns = AI).
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    # zero_division=0 reports 0 instead of warning when a tag has no
    # positive predictions or no positive labels.
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    print(tag, f1, precision, recall)
    print(cm)
    print()
    # Keep each matrix so it can be inspected after the loop; the dict was
    # previously declared but never filled.
    confusion_matrices[tag] = cm