In [19]:
"""
Subtask A — TF-IDF + SGDClassifier (Full Version)
------------------------------------------------
• Binary authorship attribution (0 = human, 1 = machine)
• Uses train/validation/test/test_sample/sample_submission from Task_A folder
• Hyperparameter search on validation set
• Retrain final model on train+val
• Save model + generate submission.csv
"""

import os
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib

# ================================================================
# 1. Folder containing the files on your Desktop
# ================================================================
# Folder holding the Subtask A competition files; edit this to match your machine.
DATA_DIR = "/Users/marwa/Desktop/Task_A"   # <--   Task_A Folder


def _data_path(filename: str) -> str:
    """Return the path of `filename` inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


print("Using data folder:", DATA_DIR)
# Listing the folder first makes a wrong path or a missing file obvious right away.
print("Files in folder:", os.listdir(DATA_DIR))

# ================================================================
# 2. Load the competition files:
#   train.parquet, validation.parquet, test.parquet,
#   test_sample.parquet, sample_submission.csv
# ================================================================
train = pd.read_parquet(_data_path("train.parquet"))
val = pd.read_parquet(_data_path("validation.parquet"))
test = pd.read_parquet(_data_path("test.parquet"))
test_sample = pd.read_parquet(_data_path("test_sample.parquet"))
sample_sub = pd.read_csv(_data_path("sample_submission.csv"))

# Quick sanity check: report the (rows, columns) shape of every parquet split.
for split_label, split_frame in (
    ("Train:", train),
    ("Val:", val),
    ("Test:", test),
    ("Test_sample:", test_sample),
):
    print(split_label, split_frame.shape)

# ================================================================
# 3. Cast labels to int (binary: 0 = human, 1 = machine)
#    Only the splits that are given a "label" cast here are touched;
#    `test` is left as loaded.
# ================================================================
for labeled_frame in (train, val, test_sample):
    labeled_frame["label"] = labeled_frame["label"].astype(int)

# ================================================================
# 4. Build text (language + code)
# ================================================================
def make_text(df: pd.DataFrame) -> pd.Series:
    """Build the text the vectorizer sees for each row of ``df``.

    The result is ``"<language> <code>"`` per row when ``df`` has a
    ``language`` column, and just the code otherwise. Missing values in
    either column are replaced with empty strings before joining.

    Args:
        df: Frame with a ``code`` column and, optionally, a ``language`` column.

    Returns:
        A Series of strings sharing ``df``'s index.
    """
    source = df["code"].fillna("")
    # Guard clause: without a language column the code alone is the text.
    if "language" not in df.columns:
        return source
    return df["language"].fillna("") + " " + source

# Training split: text inputs and integer targets.
X_train = make_text(train)
y_train = train["label"].values

# Validation split: used only to compare hyperparameter configs.
X_val = make_text(val)
y_val = val["label"].values

# Labelled test sample: used for local diagnostics.
X_ts = make_text(test_sample)
y_ts = test_sample["label"].values

# Test split: no label array is built for it; it only receives predictions.
X_test = make_text(test)

# ================================================================
# 5. Pipeline builder
# ================================================================
def build_model(
    ngram=(3, 5),
    max_features=200_000,
    alpha=1e-4,
    loss="hinge",
    min_df=10,
    random_state=42,
) -> Pipeline:
    """Create an unfitted character-n-gram TF-IDF + SGD classification pipeline.

    Args:
        ngram: ``(min_n, max_n)`` character n-gram range for the vectorizer.
        max_features: Cap on the TF-IDF vocabulary size.
        alpha: Regularization strength passed to ``SGDClassifier``.
        loss: ``SGDClassifier`` loss; ``"hinge"`` gives a linear SVM.
        min_df: Minimum document frequency for an n-gram to enter the
            vocabulary (previously hard-coded to 10).
        random_state: Seed for the classifier (previously hard-coded to 42).

    Returns:
        A ``Pipeline`` with steps ``"tfidf"`` and ``"clf"``, not yet fitted.
    """
    vectorizer = TfidfVectorizer(
        analyzer="char",
        ngram_range=ngram,
        max_features=max_features,
        min_df=min_df,
    )
    classifier = SGDClassifier(
        loss=loss,              # "hinge" = linear SVM
        alpha=alpha,
        max_iter=10_000,
        tol=1e-3,
        # NOTE(review): scikit-learn documents n_jobs as applying to the
        # one-vs-all multi-class computation; it probably has no effect on
        # this binary task -- confirm before relying on it for speed.
        n_jobs=-1,
        class_weight="balanced",
        random_state=random_state,
    )
    return Pipeline([
        ("tfidf", vectorizer),
        ("clf", classifier),
    ])

# ================================================================
# 6. Hyperparameter search on validation set
# ================================================================
# Candidate settings, one tuple per config: (name, n-gram range, vocabulary cap, alpha).
configs = [
    {"name": cfg_name, "ngram": cfg_ngram, "max": cfg_max, "alpha": cfg_alpha}
    for cfg_name, cfg_ngram, cfg_max, cfg_alpha in (
        ("3_5_200k_a1e-4", (3, 5), 200_000, 1e-4),
        ("3_5_150k_a1e-4", (3, 5), 150_000, 1e-4),
        ("3_4_150k_a5e-5", (3, 4), 150_000, 5e-5),
        ("3_5_200k_a5e-4", (3, 5), 200_000, 5e-4),
    )
]

results = []

print("\n================ HYPERPARAMETER SEARCH ==================\n")
for cfg in configs:
    print("Training config:", cfg["name"])

    # Fit on the training split only; the validation split is held out for scoring.
    model = build_model(
        ngram=cfg["ngram"],
        max_features=cfg["max"],
        alpha=cfg["alpha"],
    )
    val_pred = model.fit(X_train, y_train).predict(X_val)

    acc = accuracy_score(y_val, val_pred)
    macro_f1 = f1_score(y_val, val_pred, average="macro")

    print(f"  Val Accuracy : {acc:.4f}")
    print(f"  Val Macro F1 : {macro_f1:.4f}\n")

    # The scores are stored on the config dict itself, so `configs` entries
    # and `results` entries are the same objects.
    cfg.update(val_acc=acc, val_f1=macro_f1)
    results.append(cfg)

# Rank configs by validation macro F1, best first; the top entry is the winner.
results.sort(key=lambda entry: entry["val_f1"], reverse=True)
best = results[0]

print("===== Validation Results (sorted by Macro F1) =====")
for row in results:
    summary = (
        f"{row['name']}: Macro F1={row['val_f1']:.4f}, "
        f"Accuracy={row['val_acc']:.4f}, "
        f"ngram={row['ngram']}, max_features={row['max']}, alpha={row['alpha']}"
    )
    print(summary)

print("\nBest config selected:", best)

# ================================================================
# 7. Retrain final model on (train + validation)
# ================================================================
print("\n================ FINAL TRAINING ON TRAIN+VAL ==================\n")

# Stack the training and validation splits so the final model sees both.
X_full = pd.concat((X_train, X_val), axis=0)
y_full = np.concatenate((y_train, y_val))

# Rebuild the pipeline from the winning config and fit it on the combined data.
final_model = build_model(
    ngram=best["ngram"],
    max_features=best["max"],
    alpha=best["alpha"],
).fit(X_full, y_full)

# Persist the fitted pipeline next to the data, named after the winning config.
model_filename = f"subtaskA_sgd_{best['name']}.joblib"
model_path = os.path.join(DATA_DIR, model_filename)
joblib.dump(final_model, model_path)
print(f"Final model saved as: {model_path}")

# ================================================================
# 8. Evaluate on test_sample (local diagnostics)
# ================================================================
print("\n================ TEST_SAMPLE EVALUATION ==================\n")

# Score the final (train+val) model on the labelled test sample.
ts_pred = final_model.predict(X_ts)
ts_acc, ts_macro_f1 = (
    accuracy_score(y_ts, ts_pred),
    f1_score(y_ts, ts_pred, average="macro"),
)

print(f"Test_sample Accuracy : {ts_acc:.4f}")
print(f"Test_sample Macro F1 : {ts_macro_f1:.4f}\n")

# Per-class precision / recall / F1 breakdown.
print("Classification report on test_sample:\n")
print(classification_report(y_ts, ts_pred))

# ================================================================
# 9. Predict on real test set and create submission file
# ================================================================
print("\n================ GENERATING SUBMISSION FILE ==================\n")

# The output file is named after the winning config and written into DATA_DIR.
submission_name = f"submission_subtaskA_{best['name']}.csv"
submission_path = os.path.join(DATA_DIR, submission_name)

# Predictions are written into the sample-submission frame by position.
# NOTE(review): this assumes sample_submission.csv has one row per test.parquet
# row, in the same order -- confirm against the competition files.
test_pred = final_model.predict(X_test)
sample_sub["label"] = test_pred

sample_sub.to_csv(submission_path, index=False)

print(f"Submission file created: {submission_path}")


Using data folder: /Users/marwa/Desktop/Task_A
Files in folder: ['train.parquet', 'test.parquet', 'test_sample.parquet', 'validation.parquet', 'sample_submission.csv']
Train: (500000, 4)
Val: (100000, 4)
Test: (1000, 2)
Test_sample: (1000, 4)


Training config: 3_5_200k_a1e-4
  Val Accuracy : 0.9326
  Val Macro F1 : 0.9326

Training config: 3_5_150k_a1e-4
  Val Accuracy : 0.9330
  Val Macro F1 : 0.9329

Training config: 3_4_150k_a5e-5
  Val Accuracy : 0.9409
  Val Macro F1 : 0.9408

Training config: 3_5_200k_a5e-4
  Val Accuracy : 0.8969
  Val Macro F1 : 0.8969

===== Validation Results (sorted by Macro F1) =====
3_4_150k_a5e-5: Macro F1=0.9408, Accuracy=0.9409, ngram=(3, 4), max_features=150000, alpha=5e-05
3_5_150k_a1e-4: Macro F1=0.9329, Accuracy=0.9330, ngram=(3, 5), max_features=150000, alpha=0.0001
3_5_200k_a1e-4: Macro F1=0.9326, Accuracy=0.9326, ngram=(3, 5), max_features=200000, alpha=0.0001
3_5_200k_a5e-4: Macro F1=0.8969, Accuracy=0.8969, ngram=(3, 5), max_features=200000, a