In [None]:
"""
Subtask C — Multi-class Authorship Classification

Classes:
0: Fully human-written
1: Fully machine-generated
2: Hybrid (partially human, partially LLM-generated)
3: Adversarial LLM output designed to mimic human style

Approach:
- Character-level TF-IDF
- SGDClassifier (logistic regression)
- Fast hyperparameter search on a subset of data
- Final retraining on full train + validation
- Generate submission file
"""

import os
import pandas as pd
import numpy as np
import time

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib

# 1. Set data directory
# The folder can be overridden with the TASK_C_DATA_DIR environment variable
# so the notebook is runnable on other machines; when the variable is unset
# (or empty) the original hard-coded location is used.
DATA_DIR = os.environ.get("TASK_C_DATA_DIR") or "/Users/marwa/Desktop/Task_C"

print("Using data folder:", DATA_DIR)
print("Files:", os.listdir(DATA_DIR))

# 2. Load dataset files
# train/validation carry a "label" column (read further down); the sample
# submission is loaded as the template that later receives the predictions.
train = pd.read_parquet(os.path.join(DATA_DIR, "train.parquet"))
val   = pd.read_parquet(os.path.join(DATA_DIR, "validation.parquet"))
test  = pd.read_parquet(os.path.join(DATA_DIR, "test.parquet"))
sample_sub = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

print("Train:", train.shape)
print("Validation:", val.shape)
print("Test:", test.shape)


# 3. Inspect label distribution
print("\nUnique labels:", train["label"].unique())
print("Number of classes:", train["label"].nunique())

# 4. Build input text (language + code)
def make_text(df: pd.DataFrame) -> pd.Series:
    """Return the text fed to the vectorizer for every row of *df*.

    Missing values in the ``code`` column are replaced by empty strings.
    When *df* has a ``language`` column, its value (also with missing
    entries blanked) is prepended to the code, separated by one space;
    otherwise the cleaned code is returned on its own.

    NOTE(review): because the prefix depends on the columns present, frames
    with and without ``language`` produce differently shaped texts - confirm
    that the training and inference frames agree.
    """
    source = df["code"].fillna("")
    if "language" not in df.columns:
        return source
    prefix = df["language"].fillna("")
    return prefix + " " + source

# Vectorizer inputs: one text per row, built the same way for every split.
X_train, X_val, X_test = (make_text(split) for split in (train, val, test))

# Targets are read only from the two labelled splits.
y_train, y_val = (split["label"].values for split in (train, val))

# 5. Model builder: TF-IDF + SGDClassifier
def build_model(
    ngram=(3, 4),
    max_features=200_000,
    alpha=5e-5,
):
    """Create an unfitted character TF-IDF + SGD (log-loss) pipeline.

    Args:
        ngram: ``(min_n, max_n)`` range of character n-gram lengths.
        max_features: upper bound on the TF-IDF vocabulary size.
        alpha: regularisation strength passed to ``SGDClassifier``.

    Returns:
        A ``Pipeline`` whose steps are named ``"tfidf"`` and ``"clf"``.
    """
    # Character n-grams; terms seen in fewer than 10 documents are dropped.
    vectorizer = TfidfVectorizer(
        analyzer="char",
        min_df=10,
        ngram_range=ngram,
        max_features=max_features,
    )
    # Fixed seed so repeated runs of one configuration are comparable.
    classifier = SGDClassifier(
        loss="log_loss",
        alpha=alpha,
        tol=1e-3,
        max_iter=10_000,
        random_state=42,
        n_jobs=-1,
    )
    return Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

# ================================================================
# 6. Fast hyperparameter search using subsets of data
# ================================================================

# Subset sizes for faster experimentation
N_TRAIN_HP = 200_000
N_VAL_HP   = 80_000

# The subsets are simply the leading rows of each split (no shuffling).
# NOTE(review): this presumes the rows are not ordered by label - confirm.
X_train_hp = X_train.iloc[:N_TRAIN_HP]
y_train_hp = y_train[:N_TRAIN_HP]
X_val_hp   = X_val.iloc[:N_VAL_HP]
y_val_hp   = y_val[:N_VAL_HP]

# Candidate settings; "max" is the TF-IDF max_features budget.
configs = [
    {"name": "3_4_200k_a5e-5",  "ngram": (3, 4), "max": 200_000, "alpha": 5e-5},
    {"name": "3_4_150k_a1e-4",  "ngram": (3, 4), "max": 150_000, "alpha": 1e-4},
    {"name": "3_5_150k_a5e-5",  "ngram": (3, 5), "max": 150_000, "alpha": 5e-5},
]

results = []

print("\nStarting hyperparameter search on subsets...\n")
print("Training subset size:", len(X_train_hp))
print("Validation subset size:", len(X_val_hp), "\n")

for cfg in configs:
    print("Testing configuration:", cfg["name"])
    # perf_counter is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments during a long fit.
    t0 = time.perf_counter()

    model = build_model(
        ngram=cfg["ngram"],
        max_features=cfg["max"],
        alpha=cfg["alpha"],
    )
    model.fit(X_train_hp, y_train_hp)

    fit_time = time.perf_counter() - t0
    print("Training time (seconds):", round(fit_time, 1))

    val_pred = model.predict(X_val_hp)
    acc = accuracy_score(y_val_hp, val_pred)
    macro_f1 = f1_score(y_val_hp, val_pred, average="macro")

    print("Validation Accuracy:", round(acc, 4))
    print("Validation Macro F1:", round(macro_f1, 4), "\n")

    # Record the scores in a fresh dict: the entries of `configs` stay
    # pristine, so re-running the loop never sees stale metrics.
    results.append({
        **cfg,
        "val_acc": acc,
        "val_f1": macro_f1,
        "fit_time": fit_time,
    })

# Sort configurations by Macro F1 (best first)
results = sorted(results, key=lambda d: d["val_f1"], reverse=True)
best = results[0]

print("\nHyperparameter search results (sorted):")
for r in results:
    print(
        r['name'],
        "| F1 =", round(r['val_f1'], 4),
        "| Accuracy =", round(r['val_acc'], 4),
        "| ngram =", r['ngram'],
        "| max_features =", r['max'],
        "| alpha =", r['alpha']
    )

print("\nBest configuration:", best)

# 7. Final training on full train + validation data
print("\nTraining final model on full dataset...\n")

# Validation rows are stacked below the training rows; texts and labels are
# concatenated in the same order so they stay aligned.
X_full = pd.concat([X_train, X_val])
y_full = np.concatenate((y_train, y_val))

# Fresh, unfitted pipeline configured with the winning search settings.
final_model = build_model(best["ngram"], best["max"], best["alpha"])

t0 = time.time()
final_model.fit(X_full, y_full)
full_time = (time.time() - t0) / 60
print("Final training time (minutes):", round(full_time, 1))

# Persist the fitted pipeline next to the data, tagged with the config name.
model_file = f"subtaskC_model_{best['name']}.joblib"
model_path = os.path.join(DATA_DIR, model_file)
joblib.dump(final_model, model_path)
print("Model saved:", model_path)

# 8. Generate submission file
print("\nGenerating submission file...")

sub_name = f"submission_subtaskC_{best['name']}.csv"
sub_path = os.path.join(DATA_DIR, sub_name)

# Predictions are written positionally into the template.
# NOTE(review): assumes sample_submission rows follow the test row order - confirm.
test_pred = final_model.predict(X_test)
sample_sub["label"] = test_pred
sample_sub.to_csv(sub_path, index=False)

print("Submission file created:", sub_path)


Using data folder: /Users/marwa/Desktop/Task_C
Files: ['train.parquet', 'test.parquet', 'test_sample.parquet', 'validation.parquet', 'sample_submission.csv']
Train: (900000, 4)
Validation: (200000, 4)
Test: (1000, 2)

Unique labels: [1 0 2 3]
Number of classes: 4

Starting hyperparameter search on subsets...

Training subset size: 200000
Validation subset size: 80000 

Testing configuration: 3_4_200k_a5e-5


python(79532) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Training time (seconds): 335.8
Validation Accuracy: 0.7022
Validation Macro F1: 0.4915 

Testing configuration: 3_4_150k_a1e-4
Training time (seconds): 302.4
Validation Accuracy: 0.6828
Validation Macro F1: 0.4528 

Testing configuration: 3_5_150k_a5e-5
Training time (seconds): 2616.8
Validation Accuracy: 0.7051
Validation Macro F1: 0.4968 


Hyperparameter search results (sorted):
3_5_150k_a5e-5 | F1 = 0.4968 | Accuracy = 0.7051 | ngram = (3, 5) | max_features = 150000 | alpha = 5e-05
3_4_200k_a5e-5 | F1 = 0.4915 | Accuracy = 0.7022 | ngram = (3, 4) | max_features = 200000 | alpha = 5e-05
3_4_150k_a1e-4 | F1 = 0.4528 | Accuracy = 0.6828 | ngram = (3, 4) | max_features = 150000 | alpha = 0.0001

Best configuration: {'name': '3_5_150k_a5e-5', 'ngram': (3, 5), 'max': 150000, 'alpha': 5e-05, 'val_acc': 0.7051125, 'val_f1': 0.49679401556902764, 'fit_time': 2616.839792728424}

Training final model on full dataset...



In [1]:
 import os

print(os.path.exists("model_subtaskC.joblib"))
if os.path.exists("model_subtaskC.joblib"):
    print("Size (MB):", os.path.getsize("model_subtaskC.joblib") / (1024**2))


False


In [7]:
# Hyperparameters taken from the best subset-search run (char 3-5 grams,
# alpha 5e-5), with a smaller vocabulary budget than the 150000 searched.
# NOTE(review): confirm that the training code actually reads this dict; a
# hard-coded max_features elsewhere would bypass the reduction.
BEST_CONFIG = {
    "ngram": (3, 5),
    "max_features": 120000,  # lowered to save memory
    "alpha": 5e-5,
}


In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# 1) Folder path
# Overridable through the TASK_C_DATA_DIR environment variable; falls back
# to the original hard-coded location when the variable is unset or empty.
DATA_DIR = os.environ.get("TASK_C_DATA_DIR") or "/Users/marwa/Desktop/Task_C"

print("Using data folder:", DATA_DIR)
print("Files in folder:", os.listdir(DATA_DIR))

# 2) Load train + validation
print("Loading train and validation data...")
train = pd.read_parquet(os.path.join(DATA_DIR, "train.parquet"))
val = pd.read_parquet(os.path.join(DATA_DIR, "validation.parquet"))

print("Train shape:", train.shape)
print("Validation shape:", val.shape)

# Combine train + validation into one DataFrame
full_df = pd.concat([train, val], axis=0).reset_index(drop=True)
print("Full training data shape:", full_df.shape)

# Features (code text) and labels
# Missing code entries become empty strings: the vectorizer rejects NaN
# documents, so a single null row would otherwise abort the whole fit.
X_text = full_df["code"].fillna("")
y = full_df["label"]

# 3) TF-IDF vectorizer

# Char-level TF-IDF with n-grams 3 to 5 and 150k features
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    max_features=150000,
    # Pass the NumPy type itself: the string "float32" is not recognised by
    # the vectorizer's dtype check, which then warns about float64 conversion.
    dtype=np.float32,
)

print("Fitting TF-IDF on full dataset...")
X = vectorizer.fit_transform(X_text)

# 4) Classifier (SGD with log-loss)
clf = SGDClassifier(
    loss="log_loss",
    alpha=5e-5,
    n_jobs=-1,
    random_state=42,
)

print("Training classifier on full dataset...")
clf.fit(X, y)

# 5) Save model (vectorizer + classifier)
# Both objects are stored together because inference needs the fitted
# vocabulary as well as the classifier weights.
bundle = {
    "vectorizer": vectorizer,
    "classifier": clf,
}

MODEL_PATH = os.path.join(DATA_DIR, "model_subtaskC_best.joblib")
joblib.dump(bundle, MODEL_PATH)

print("✅ Done. Model saved to:", MODEL_PATH)


Using data folder: /Users/marwa/Desktop/Task_C
Files in folder: ['train.parquet', 'test.parquet', 'test_sample.parquet', 'validation.parquet', 'sample_submission.csv']
Loading train and validation data...
Train shape: (900000, 4)
Validation shape: (200000, 4)
Full training data shape: (1100000, 4)
Fitting TF-IDF on full dataset...


