# In [1]:
"""
Subtask B - New Submission
==========================

- Model: TF-IDF (char n-grams) + SGDClassifier (log-loss, linear classifier)
- Goal: Improve generalization & macro-F1 using:
    * class_weight="balanced"
    * log-loss (probabilistic) instead of hinge
    * slightly tuned n-gram range and feature size

This script:
1. Loads the SemEval-2026 Task 13 Subtask B data.
2. Builds TF-IDF features from language + code.
3. Runs a small hyperparameter search on a *subset* of train for speed.
4. Trains a final model on full train + validation using the best config.
5. Evaluates on test_sample.parquet (local only).
6. Generates a submission CSV for Kaggle.
"""

import os
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils import shuffle

import joblib


# ------------------------------------------------------------------
# 1. Paths & basic setup
# ------------------------------------------------------------------

def set_working_directory(
    task_b_path="/Users/shanuaimi/Desktop/Natural Language Processing /Task_B",
):
    """
    Change the process working directory to the Task_B data folder.

    All later file accesses in this script use relative names
    (e.g. "train.parquet", "sample_submission.csv"), so they resolve
    against the folder selected here.

    Parameters
    ----------
    task_b_path : str or path-like, optional
        Folder to switch into. Defaults to the original author's local
        Task_B folder; pass your own location to run the script on another
        machine without editing the source.

    Raises
    ------
    OSError
        Propagated from ``os.chdir`` when the folder does not exist or
        cannot be entered (e.g. FileNotFoundError, NotADirectoryError).
    """
    os.chdir(task_b_path)
    # Echo the result so a wrong folder is obvious before any data loading.
    print("Current working directory:", os.getcwd())
    print("Files in this folder:", os.listdir())


# ------------------------------------------------------------------
# 2. Data loading & text construction
# ------------------------------------------------------------------

def load_datasets():
    """
    Read the four Subtask B parquet files from the current directory.

    Files read, in order: train.parquet, validation.parquet, test.parquet
    and test_sample.parquet. The "label" column of the three labelled
    splits (train, validation, test_sample) is cast to int; test.parquet
    is returned as loaded.

    Returns
    -------
    tuple of pd.DataFrame
        (train_df, val_df, test_df, test_sample_df)
    """
    parquet_files = (
        "train.parquet",
        "validation.parquet",
        "test.parquet",
        "test_sample.parquet",
    )
    train_df, val_df, test_df, test_sample_df = (
        pd.read_parquet(path) for path in parquet_files
    )

    # Only the labelled splits get their label column normalised to int.
    for labelled in (train_df, val_df, test_sample_df):
        labelled["label"] = labelled["label"].astype(int)

    # Size summary; the headings are padded so the colons line up.
    summary = (
        ("Train size      ", train_df),
        ("Validation size ", val_df),
        ("Test size       ", test_df),
        ("Test_sample size", test_sample_df),
    )
    for heading, frame in summary:
        print(f"{heading}: {len(frame)}")

    return train_df, val_df, test_df, test_sample_df


def build_text_column(df: pd.DataFrame) -> pd.Series:
    """
    Produce the string column that is fed to the TF-IDF vectorizer.

    Missing values in "code" (and in "language", when that column exists)
    are replaced by the empty string. With a "language" column the result
    is "<language> <code>"; without it the result is the code alone. No
    other cleaning is applied to the source code.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a "code" column; "language" is optional.

    Returns
    -------
    pd.Series
        One string per input row.
    """
    source = df["code"].fillna("")

    # Guard clause: frames without a language column use the raw code only.
    if "language" not in df.columns:
        return source.astype(str)

    language_tag = df["language"].fillna("")
    combined = language_tag + " " + source
    return combined.astype(str)


# ------------------------------------------------------------------
# 3. Model building
# ------------------------------------------------------------------

def build_pipeline(ngram_range=(3, 5), max_features=150_000, alpha=5e-5):
    """
    Assemble an unfitted character-level TF-IDF + SGDClassifier pipeline.

    Parameters
    ----------
    ngram_range : tuple
        (min_n, max_n) for the character n-grams, e.g. (3, 4) or (3, 5).
    max_features : int
        Cap on the number of TF-IDF features kept by the vectorizer.
    alpha : float
        Regularization strength passed to SGDClassifier.

    Returns
    -------
    Pipeline
        Two steps named "tfidf" and "clf", in that order.
    """
    vectorizer_options = dict(
        analyzer="char",
        ngram_range=ngram_range,
        min_df=5,  # discard very rare n-grams to cut noise
        max_features=max_features,
    )
    classifier_options = dict(
        loss="log_loss",  # logistic loss -> probabilistic linear model
        penalty="l2",
        alpha=alpha,
        max_iter=20,  # moderate iteration cap, chosen for speed
        tol=1e-3,
        n_jobs=-1,
        class_weight="balanced",  # counteract label imbalance
        random_state=42,
    )

    steps = [
        ("tfidf", TfidfVectorizer(**vectorizer_options)),
        ("clf", SGDClassifier(**classifier_options)),
    ]
    return Pipeline(steps)


# ------------------------------------------------------------------
# 4. Hyperparameter search (lightweight)
# ------------------------------------------------------------------

def hyperparameter_search(
    X_train_full,
    y_train_full,
    X_val,
    y_val,
    max_train_samples=200_000,
    configs=None,
):
    """
    Fit each candidate configuration on (a subsample of) the training data
    and return the one with the highest validation macro-F1.

    Parameters
    ----------
    X_train_full : pd.Series
        Full training text data (any indexable accepted by
        ``sklearn.utils.shuffle`` works).
    y_train_full : np.ndarray
        Full training labels, aligned with ``X_train_full``.
    X_val : pd.Series
        Validation text data.
    y_val : np.ndarray
        Validation labels.
    max_train_samples : int
        Maximum number of training samples to use during the search. When
        the training set is larger, a seeded random subsample of exactly
        this size is used instead.
    configs : iterable of dict, optional
        Candidate configurations. Each dict needs the keys "name", "ngram",
        "max_features" and "alpha". When omitted, the built-in four-entry
        grid below is used.

    Returns
    -------
    best_cfg : dict
        The candidate dict with the best validation macro-F1. Ties keep the
        candidate that appears first in ``configs``.

    Raises
    ------
    ValueError
        If ``configs`` is given but empty.
    """
    print("\n================ HYPERPARAMETER SEARCH (light) ================\n")

    if configs is None:
        # Default candidates: different n-grams and feature sizes
        configs = [
            {"name": "char_3_5_120k_alpha5e-5", "ngram": (3, 5), "max_features": 120_000, "alpha": 5e-5},
            {"name": "char_3_4_120k_alpha5e-5", "ngram": (3, 4), "max_features": 120_000, "alpha": 5e-5},
            {"name": "char_3_5_100k_alpha1e-4", "ngram": (3, 5), "max_features": 100_000, "alpha": 1e-4},
            {"name": "char_3_4_100k_alpha1e-4", "ngram": (3, 4), "max_features": 100_000, "alpha": 1e-4},
        ]
    else:
        # Materialise once so generators can be iterated and checked for emptiness.
        configs = list(configs)
    if not configs:
        # Fail early with a clear message instead of an IndexError after the loop.
        raise ValueError("configs must contain at least one candidate configuration.")

    # Optionally subsample training data for speed
    if len(X_train_full) > max_train_samples:
        # n_samples lets shuffle cut X and y together, so X does not have to
        # be a pandas object and no full-size shuffled copy is kept around.
        X_train_sub, y_train_sub = shuffle(
            X_train_full,
            y_train_full,
            random_state=42,
            n_samples=max_train_samples,
        )
        print(f"Using a subsample of {max_train_samples} for hyperparameter search.")
    else:
        X_train_sub, y_train_sub = X_train_full, y_train_full
        print("Using full training set for hyperparameter search (size <= max_train_samples).")

    results = []

    for cfg in configs:
        print(f"Training config: {cfg['name']}")
        model = build_pipeline(
            ngram_range=cfg["ngram"],
            max_features=cfg["max_features"],
            alpha=cfg["alpha"]
        )
        model.fit(X_train_sub, y_train_sub)

        val_pred = model.predict(X_val)
        acc = accuracy_score(y_val, val_pred)
        macro_f1 = f1_score(y_val, val_pred, average="macro")

        print(f"  Validation Accuracy : {acc:.4f}")
        print(f"  Validation Macro F1 : {macro_f1:.4f}\n")

        results.append((cfg, macro_f1, acc))

    print("===== Validation Results (sorted by Macro F1) =====")
    # sorted() is stable, so equal scores keep their order from `configs`.
    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
    for cfg, macro_f1, acc in results_sorted:
        print(f"{cfg['name']}: Macro F1={macro_f1:.4f}, Accuracy={acc:.4f}, "
              f"ngram={cfg['ngram']}, max_features={cfg['max_features']}, alpha={cfg['alpha']}")

    best_cfg = results_sorted[0][0]
    print(f"\nBest config selected: {best_cfg['name']}\n")
    return best_cfg


# ------------------------------------------------------------------
# 5. Train final model & evaluate
# ------------------------------------------------------------------

def train_final_model(
    X_train,
    y_train,
    X_val,
    y_val,
    best_cfg,
    model_path="subtaskB_sgd_balanced_model.joblib"
):
    """
    Refit the chosen configuration on train + validation and persist it.

    Parameters
    ----------
    X_train : pd.Series
        Training text data.
    y_train : np.ndarray
        Training labels.
    X_val : pd.Series
        Validation text data (appended after the training rows).
    y_val : np.ndarray
        Validation labels (appended after the training labels).
    best_cfg : dict
        Configuration with the keys "ngram", "max_features" and "alpha",
        as returned by hyperparameter_search.
    model_path : str
        File name the fitted pipeline is written to with joblib.

    Returns
    -------
    Pipeline
        The fitted pipeline (the same object that was saved).
    """
    print("\n================ TRAINING FINAL MODEL ON TRAIN+VAL ================\n")

    # Stack the validation split underneath the training split.
    combined_text = pd.concat([X_train, X_val], axis=0)
    combined_labels = np.concatenate([y_train, y_val])

    pipeline_kwargs = {
        "ngram_range": best_cfg["ngram"],
        "max_features": best_cfg["max_features"],
        "alpha": best_cfg["alpha"],
    }
    final_pipeline = build_pipeline(**pipeline_kwargs)
    final_pipeline.fit(combined_text, combined_labels)

    # Persist the fitted pipeline (vectorizer + classifier) to disk.
    joblib.dump(final_pipeline, model_path)
    print(f"Final model saved as: {model_path}")

    return final_pipeline


def evaluate_local(model, X_test_sample, y_test_sample):
    """
    Print accuracy, macro-F1 and a per-class report for the labelled
    test_sample split. This is a local sanity check only; nothing is
    returned or written to disk.

    Parameters
    ----------
    model : Pipeline
        Trained model exposing ``predict``.
    X_test_sample : pd.Series
        Text data from test_sample.parquet.
    y_test_sample : np.ndarray
        Labels from test_sample.parquet.
    """
    print("\n================ EVALUATION ON TEST_SAMPLE (LOCAL ONLY) ================\n")

    predicted = model.predict(X_test_sample)

    # Compute both headline metrics before printing anything.
    sample_accuracy = accuracy_score(y_test_sample, predicted)
    sample_macro_f1 = f1_score(y_test_sample, predicted, average="macro")

    print(f"Test_sample Accuracy : {sample_accuracy:.4f}")
    print(f"Test_sample Macro F1 : {sample_macro_f1:.4f}\n")

    print("Classification report on test_sample:\n")
    per_class_report = classification_report(y_test_sample, predicted)
    print(per_class_report)


def generate_submission(model, X_test, submission_name="submission_sgd_balanced_new.csv"):
    """
    Predict labels for the test set and write them into a copy of
    sample_submission.csv.

    The template file "sample_submission.csv" is read from the current
    directory; its "label" column is overwritten with the predictions
    (cast to int) in row order, and the result is saved without the index.

    Parameters
    ----------
    model : Pipeline
        Trained model exposing ``predict``.
    X_test : pd.Series
        Test text data from test.parquet (no labels).
    submission_name : str
        Output CSV file name.

    Raises
    ------
    ValueError
        If the template and the predictions have different lengths.
    """
    print("\n================ GENERATING SUBMISSION FILE ================\n")

    predicted_labels = model.predict(X_test)
    template = pd.read_csv("sample_submission.csv")

    # Predictions are written positionally, so the row counts must agree.
    n_template_rows = len(template)
    n_predictions = len(predicted_labels)
    if n_template_rows != n_predictions:
        raise ValueError(
            f"sample_submission length ({n_template_rows}) "
            f"does not match test predictions ({n_predictions})!"
        )

    template["label"] = predicted_labels.astype(int)
    template.to_csv(submission_name, index=False)

    print(f"Submission file created: {submission_name}")


# ------------------------------------------------------------------
# 6. Main execution
# ------------------------------------------------------------------

if __name__ == "__main__":
    # End-to-end run: switch to the data folder, load the splits, pick a
    # configuration on validation, refit on train+validation, sanity-check
    # on test_sample, and finally write the Kaggle submission CSV.
    # The relative file names used below resolve against the folder chosen here.
    set_working_directory()

    # Load data (train / validation / unlabeled test / labeled test_sample)
    train_df, val_df, test_df, test_sample_df = load_datasets()

    # Build text inputs: one "<language> <code>" string per row for each split
    X_train = build_text_column(train_df)
    X_val = build_text_column(val_df)
    X_test = build_text_column(test_df)
    X_test_sample = build_text_column(test_sample_df)

    # Label arrays for the three labeled splits (test_df labels are never read)
    y_train = train_df["label"].values
    y_val = val_df["label"].values
    y_test_sample = test_sample_df["label"].values

    # Hyperparameter search (light, on subset for speed); candidates are
    # ranked by macro-F1 on the validation split
    best_cfg = hyperparameter_search(
        X_train_full=X_train,
        y_train_full=y_train,
        X_val=X_val,
        y_val=y_val,
        max_train_samples=200_000  # you can lower this if your Mac is slow
    )

    # Train final model on train + validation with the selected configuration;
    # the fitted pipeline is also saved to model_path
    final_model = train_final_model(
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        best_cfg=best_cfg,
        model_path="subtaskB_sgd_balanced_model.joblib"
    )

    # Local evaluation on test_sample.parquet (prints metrics only)
    evaluate_local(
        model=final_model,
        X_test_sample=X_test_sample,
        y_test_sample=y_test_sample
    )

    # Generate submission file for Kaggle from predictions on test.parquet
    generate_submission(
        model=final_model,
        X_test=X_test,
        submission_name="submission_sgd_balanced_new.csv"
    )


Current working directory: /Users/shanuaimi/Desktop/Natural Language Processing /Task_B
Files in this folder: ['Untitled1.ipynb', 'train.parquet', '.DS_Store', 'test.parquet', 'test_sample.parquet', 'Untitled.ipynb', 'subtaskB_sgd_char_3_4_150k_alpha5e-5.joblib', 'submission_sgd_char_3_4_150k_alpha5e-5.csv', 'submission.csv', 'submission1.csv', 'subtaskB_model.joblib', 'validation.parquet', 'second trial.ipynb', '.ipynb_checkpoints', 'Fourth run.ipynb', 'sample_submission.csv']
Train size      : 500000
Validation size : 100000
Test size       : 1000
Test_sample size: 1000


Using a subsample of 200000 for hyperparameter search.
Training config: char_3_5_120k_alpha5e-5
  Validation Accuracy : 0.8710
  Validation Macro F1 : 0.2725

Training config: char_3_4_120k_alpha5e-5
  Validation Accuracy : 0.8699
  Validation Macro F1 : 0.2678

Training config: char_3_5_100k_alpha1e-4
  Validation Accuracy : 0.8694
  Validation Macro F1 : 0.2435

Training config: char_3_4_100k_alpha1e-4
  Validatio