##Track 1 mistral ai

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
import json, zipfile, random, re, math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.calibration import CalibratedClassifierCV
import xgboost
import torch
import torch.nn as nn
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
)
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

# -------------------------
# Load & flatten Track 1 dataset
# -------------------------
def load_flat(json_file, has_labels=True, label_key="Mistake_Identification"):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    rows = []
    for item in data:
        cid = item.get("conversation_id")
        ctx = item.get("conversation_history", "")
        tutors = item.get("tutor_responses", {})
        for tutor, info in tutors.items():
            resp = info.get("response", "")
            label = None
            if has_labels and isinstance(info.get("annotation"), dict):
                label = info["annotation"].get(label_key)
            rows.append({
                "conversation_id": cid,
                "tutor": tutor,
                "conversation_history": ctx,
                "response": resp,
                "text": ctx + "\n\nTUTOR: " + resp,
                "label": label
            })
    return pd.DataFrame(rows)

# -------------------------
# Feature engineering
# -------------------------
sbert = SentenceTransformer("all-mpnet-base-v2")

def add_features(df):
    sims = []
    for i in range(0, len(df), 64):
        batch = df.iloc[i:i+64]
        emb_ctx = sbert.encode(batch['conversation_history'].astype(str).tolist(), convert_to_tensor=True)
        emb_resp = sbert.encode(batch['response'].astype(str).tolist(), convert_to_tensor=True)
        cos = util.pytorch_cos_sim(emb_ctx, emb_resp).diagonal().cpu().numpy()
        sims.extend(cos.tolist())
    df = df.copy().reset_index(drop=True)
    df['sim'] = sims
    df['has_error_terms'] = df['response'].str.contains(r'\b(mistake|error|wrong|incorrect|issue|problem)\b', case=False, na=False).astype(int)
    df['has_fix_terms'] = df['response'].str.contains(r'\b(correct|should|instead|try|fix|check)\b', case=False, na=False).astype(int)
    df['has_question'] = df['response'].str.contains(r'\?', na=False).astype(int)
    df['len_resp'] = df['response'].str.len().fillna(0).astype(int)
    return df

# -------------------------
# Oversampling
# -------------------------
def oversample_minority(df, label_col='label', target_per_class=1000):
    counts = df[label_col].value_counts().to_dict()
    parts = []
    for lbl, grp in df.groupby(label_col):
        if len(grp) >= target_per_class:
            parts.append(grp.sample(n=target_per_class, random_state=SEED))
        else:
            need = target_per_class - len(grp)
            sampled = grp.sample(n=need, replace=True, random_state=SEED)
            sampled['response'] = sampled['response'].apply(lambda x: x.replace("Let's", "Let us"))
            sampled['text'] = sampled['conversation_history'] + "\n\nTUTOR: " + sampled['response']
            parts.append(pd.concat([grp, sampled]))
    return pd.concat(parts).sample(frac=1, random_state=SEED).reset_index(drop=True)

# -------------------------
# Classical ensemble (TF-IDF + numeric)
# -------------------------
def train_feature_ensemble(train_df, feat_cols, label_col='label'):
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3)
    X_text = tfidf.fit_transform(train_df['text'])
    X_num = train_df[feat_cols].fillna(0).to_numpy()
    X = np.hstack([X_text.toarray(), X_num])
    y = train_df[label_col].astype(int).to_numpy()
    rf = RandomForestClassifier(n_estimators=300, class_weight='balanced_subsample', random_state=SEED)
    lr = LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear')
    xgb = xgboost.XGBClassifier(n_estimators=200, random_state=SEED, eval_metric='logloss', use_label_encoder=False)
    voting = VotingClassifier(estimators=[('rf',rf),('lr',lr),('xgb',xgb)], voting='soft')
    voting.fit(X, y)
    cal = CalibratedClassifierCV(voting, method='isotonic', cv=3)
    cal.fit(X, y)
    return tfidf, cal

# -------------------------
# Transformer Trainer
# -------------------------
class WeightedTrainer(Trainer):
    def __init__(self, class_weight_tensor=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weight_tensor = class_weight_tensor.to(self.model.device) if class_weight_tensor is not None else None
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels").to(model.device)
        outputs = model(**{k:v.to(model.device) for k,v in inputs.items() if k!="labels"})
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weight_tensor) if self.class_weight_tensor is not None else nn.CrossEntropyLoss()
        loss = loss_fct(outputs.logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"f1": f1_score(labels, preds, average="macro"), "accuracy": accuracy_score(labels, preds)}

def prepare_hf_dataset(df, tokenizer, text_col='text', label_col='label', max_length=384):
    df = df[df[label_col].notnull()].reset_index(drop=True)
    enc = tokenizer(df[text_col].tolist(), truncation=True, padding='max_length', max_length=max_length)
    enc['labels'] = df[label_col].astype(int).tolist()
    return Dataset.from_dict(enc), df

def train_transformer(train_df, val_df, model_name='microsoft/deberta-v3-base', num_labels=2, epochs=5, text_col='text'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    y = train_df['label'].astype(int).to_numpy()
    cw = compute_class_weight('balanced', classes=np.unique(y), y=y)
    class_weights = torch.tensor(cw, dtype=torch.float)
    train_ds,_ = prepare_hf_dataset(train_df, tokenizer, text_col)
    val_ds,_ = prepare_hf_dataset(val_df, tokenizer, text_col)
    args = TrainingArguments(
        output_dir="./tmp_mistake_model",
        eval_strategy="steps", eval_steps=200, logging_steps=50,
        learning_rate=2e-5, per_device_train_batch_size=8, per_device_eval_batch_size=8,
        num_train_epochs=epochs, weight_decay=0.01,
        save_total_limit=1, load_best_model_at_end=True,save_steps=200,
        metric_for_best_model="f1", report_to="none"
    )
    trainer = WeightedTrainer(
        class_weight_tensor=class_weights, model=model, args=args,
        train_dataset=train_ds, eval_dataset=val_ds,
        data_collator=DataCollatorWithPadding(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        compute_metrics=compute_metrics
    )
    trainer.train()
    return tokenizer, trainer

def predict_transformer_probs(tokenizer, trainer, df, text_col='text'):
    enc = tokenizer(df[text_col].tolist(), truncation=True, padding='max_length', max_length=384)
    ds = Dataset.from_dict(enc)
    out = trainer.predict(ds)
    return torch.softmax(torch.tensor(out.predictions), dim=1).numpy()

# -------------------------
# Label mapping
# -------------------------
LABEL_TO_INT = {'No':0, 'Yes':1, 'To some extent':2}
INT_TO_LABEL = {v:k for k,v in LABEL_TO_INT.items()}
def map_stageA_label(lbl):
    if lbl=='No': return 0
    if lbl in ('Yes','To some extent'): return 1
    return None
def map_stageB_label(lbl):
    if lbl=='Yes': return 1
    if lbl=='To some extent': return 0
    return None

# -------------------------
# Blending & Threshold
# -------------------------
def tune_threshold(probs, y_true):
    best=(0.5,0)
    for t in np.linspace(0.1,0.9,17):
        f1=f1_score(y_true,(probs[:,1]>t).astype(int),average='macro')
        if f1>best[1]: best=(t,f1)
    return best
def find_best_blend(probs_t, probs_e, y_true):
    best=(0,-1,0.5)
    for w in np.linspace(0.3,1,8):
        p=w*probs_t+(1-w)*probs_e
        th,f1=tune_threshold(p,y_true)
        if f1>best[1]: best=(w,f1,th)
    return best

# -------------------------
# Hierarchical pipeline
# -------------------------
def train_hierarchical_pipeline(train_df_raw):
    print("Adding features...")
    train_df=add_features(train_df_raw)
    train_df['label']=train_df['label'].fillna('None')
    print("Label counts:", train_df['label'].value_counts().to_dict())

    # --- Stage A
    print("\n--- Stage A: Mistake identified vs No ---")
    train_df['labelA']=train_df['label'].apply(map_stageA_label)
    dfA=train_df[train_df['labelA'].notnull()].copy()
    dfA['label']=dfA['labelA']
    dfA_os=oversample_minority(dfA, label_col='label', target_per_class=1000)
    feat_cols=['sim','has_error_terms','has_fix_terms','has_question','len_resp']
    tfidfA,ensembleA=train_feature_ensemble(dfA_os,feat_cols)
    trA,valA=train_test_split(dfA_os,test_size=0.15,stratify=dfA_os['label'],random_state=SEED)
    tokA,trAmodel=train_transformer(trA,valA,num_labels=2)
    valA_df=dfA.copy().reset_index(drop=True); valA_df['label_num']=valA_df['labelA']
    p_tA=predict_transformer_probs(tokA,trAmodel,valA_df)
    X_text=tfidfA.transform(valA_df['text']); X_num=valA_df[feat_cols].to_numpy()
    p_eA=ensembleA.predict_proba(np.hstack([X_text.toarray(),X_num]))
    wA,f1A,thA=find_best_blend(p_tA,p_eA,valA_df['label_num'])
    print(f"Stage A: w={wA:.2f}, thresh={thA:.2f}, F1={f1A:.4f}")

    # --- Stage B
    print("\n--- Stage B: Among identified -> Yes vs To some extent ---")
    sub=train_df[train_df['label'].isin(['Yes','To some extent'])].copy()
    sub['labelB']=sub['label'].apply(map_stageB_label)
    sub['label']=sub['labelB']
    sub_os=oversample_minority(sub, label_col='label', target_per_class=800)
    tfidfB,ensembleB=train_feature_ensemble(sub_os,feat_cols)
    trB,valB=train_test_split(sub_os,test_size=0.15,stratify=sub_os['label'],random_state=SEED)
    tokB,trBmodel=train_transformer(trB,valB,num_labels=2)
    valB_df=sub.copy().reset_index(drop=True); valB_df['label_num']=valB_df['labelB']
    p_tB=predict_transformer_probs(tokB,trBmodel,valB_df)
    X_textB=tfidfB.transform(valB_df['text']); X_numB=valB_df[feat_cols].to_numpy()
    p_eB=ensembleB.predict_proba(np.hstack([X_textB.toarray(),X_numB]))
    wB,f1B,thB=find_best_blend(p_tB,p_eB,valB_df['label_num'])
    print(f"Stage B: w={wB:.2f}, thresh={thB:.2f}, F1={f1B:.4f}")

    return {'tfidfA':tfidfA,'ensA':ensembleA,'tokA':tokA,'modA':trAmodel,'wA':wA,'thA':thA,
            'tfidfB':tfidfB,'ensB':ensembleB,'tokB':tokB,'modB':trBmodel,'wB':wB,'thB':thB,'feat_cols':feat_cols}

# -------------------------
# Inference
# -------------------------
def predict_hierarchical(art, df):
    df=add_features(df)
    feat=art['feat_cols']
    # Stage A
    p_tA=predict_transformer_probs(art['tokA'],art['modA'],df)
    X_text=art['tfidfA'].transform(df['text']); X_num=df[feat].to_numpy()
    p_eA=art['ensA'].predict_proba(np.hstack([X_text.toarray(),X_num]))
    probsA=art['wA']*p_tA+(1-art['wA'])*p_eA
    predsA=(probsA[:,1]>art['thA']).astype(int)
    out=['']*len(df)
    idx_yes=[i for i,p in enumerate(predsA) if p==1]
    for i,p in enumerate(predsA):
        if p==0: out[i]='No'
    if idx_yes:
        sub=df.iloc[idx_yes].reset_index(drop=True)
        p_tB=predict_transformer_probs(art['tokB'],art['modB'],sub)
        X_textB=art['tfidfB'].transform(sub['text']); X_numB=sub[feat].to_numpy()
        p_eB=art['ensB'].predict_proba(np.hstack([X_textB.toarray(),X_numB]))
        probsB=art['wB']*p_tB+(1-art['wB'])*p_eB
        predsB=(probsB[:,1]>art['thB']).astype(int)
        for j,idx in enumerate(idx_yes):
            out[idx]='Yes' if predsB[j]==1 else 'To some extent'
    return out

# -------------------------
# Build submission
# -------------------------
def build_submission(df, preds, out_zip="predictions_mistake_identification.json.zip", label_key="Mistake_Identification"):
    grouped={}
    for i,row in df.iterrows():
        cid=row['conversation_id']
        if cid not in grouped:
            grouped[cid]={'conversation_id':cid,'conversation_history':row['conversation_history'],'tutor_responses':{}}
        grouped[cid]['tutor_responses'][row['tutor']]={'response':row['response'],'annotation':{label_key:preds[i]}}
    out=list(grouped.values())
    with open("predictions_mistake_identification.json","w",encoding="utf-8") as f:
        json.dump(out,f,ensure_ascii=False,indent=2)
    with zipfile.ZipFile(out_zip,'w',compression=zipfile.ZIP_DEFLATED) as z:
        z.write("predictions_mistake_identification.json",arcname="predictions_mistake_identification.json")
    print(f"Wrote {out_zip} with {len(out)} conversations.")
    return out

In [None]:
# -------------------------
# MAIN
# -------------------------
if __name__ == "__main__":
    TRAIN="trainset.json"; DEV="dev_testset.json"; TEST="testset.json"
    tr=load_flat(TRAIN,has_labels=True)
    dev=load_flat(DEV,has_labels=False)
    test=load_flat(TEST,has_labels=False)
    print("Shapes:", tr.shape, dev.shape, test.shape)
    arts=train_hierarchical_pipeline(tr)
    print("Predicting dev/test...")
    dev_preds=predict_hierarchical(arts,dev)
    test_preds=predict_hierarchical(arts,test)
    print(pd.Series(test_preds).value_counts())
    build_submission(dev,dev_preds,"dev_mistake_identification.json.zip")
    build_submission(test,test_preds,"test_mistake_identification.json.zip")
    print("Done.")


Shapes: (2476, 6) (333, 6) (1214, 6)
Adding features...
Label counts: {'Yes': 1932, 'No': 370, 'To some extent': 174}

--- Stage A: Mistake identified vs No ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1,Accuracy
200,0.4036,0.331052,0.823529,0.826667
400,0.3385,0.417522,0.847145,0.85
600,0.2634,0.403678,0.868206,0.87
800,0.1851,0.602687,0.854203,0.856667
1000,0.1411,0.576121,0.851645,0.853333


Stage A: w=0.30, thresh=0.30, F1=0.9580

--- Stage B: Among identified -> Yes vs To some extent ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1,Accuracy
200,0.6928,0.666209,0.602732,0.6375
400,0.6146,0.485913,0.787319,0.7875
600,0.4395,0.540188,0.795904,0.8
800,0.3049,0.410204,0.870327,0.870833


Stage B: w=0.30, thresh=0.30, F1=0.9328
Predicting dev/test...


Yes               1046
No                 150
To some extent      18
Name: count, dtype: int64
Wrote dev_mistake_identification.json.zip with 41 conversations.
Wrote test_mistake_identification.json.zip with 150 conversations.
Done.
