In [17]:
# ==========================================
# Train ALL models (no model selection)
# - Train on Train
# - (Use Val for early stopping when available / report val metrics)
# - Test on Test and save classification reports
# ==========================================
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# XGBoost (optional)
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False


In [18]:

# -----------------------
# Load data
# -----------------------
TRAIN_CSV = "Dataset_60_20_20/train.csv"
VAL_CSV   = "Dataset_60_20_20/validation.csv"
TEST_CSV  = "Dataset_60_20_20/test.csv"

train_df = pd.read_csv(TRAIN_CSV)
val_df   = pd.read_csv(VAL_CSV)
test_df  = pd.read_csv(TEST_CSV)

for d in (train_df, val_df, test_df):
    d["Summary"] = d["Summary"].astype(str)
    d["Genre"]   = d["Genre"].astype(str)

X_train, y_train = train_df["Summary"], train_df["Genre"]
X_val,   y_val   = val_df["Summary"],   val_df["Genre"]
X_test,  y_test  = test_df["Summary"],  test_df["Genre"]


In [19]:

# -----------------------
# Config
# -----------------------
TOKEN_PATTERN = r'[\u0980-\u09FFA-Za-z0-9]+'  # Bangla + English tokens
MAX_FEATURES = 50000
OUT_DIR = "results_all_models"
os.makedirs(OUT_DIR, exist_ok=True)


In [20]:

# Feature configs you want to run (edit as needed)
tfidf_ngram_sets = {
    #"Unigram": (1, 1),
    #"Bigram": (2, 2),
    #"Trigram": (3, 3),
    "2+3gram": (2, 3),
}

In [21]:


# Models
models_tfidf = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "LinearSVM": LinearSVC(class_weight="balanced"),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    # GradientBoosting in sklearn can do early stopping via n_iter_no_change + validation_fraction
    "GradientBoosting": GradientBoostingClassifier(random_state=42, n_iter_no_change=5, validation_fraction=0.2),
}
if HAS_XGB:
    # We'll add early_stopping via eval_set at fit() time
    models_tfidf["XGBoost"] = XGBClassifier(
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1,
        tree_method="hist"  # faster if GPU not used
    )


In [22]:

# -----------------------
# Helpers
# -----------------------
def save_report(prefix, y_true, y_pred, labels=None):
    acc = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    f1_weighted = f1_score(y_true, y_pred, average="weighted")
    rep = classification_report(y_true, y_pred, target_names=labels if labels else None, digits=4)

    out_txt = os.path.join(OUT_DIR, f"{prefix}_report.txt")
    with open(out_txt, "w", encoding="utf-8") as f:
        f.write(f"{prefix}\n")
        f.write(f"Accuracy   : {acc:.4f}\n")
        f.write(f"Macro F1   : {f1_macro:.4f}\n")
        f.write(f"Weighted F1: {f1_weighted:.4f}\n\n")
        f.write(rep)
    return acc, f1_macro, f1_weighted, out_txt


In [23]:

summary_rows = []

In [24]:

# -----------------------
# Run TF-IDF models for each TF-IDF n-gram
# -----------------------
for vname, ngr in tfidf_ngram_sets.items():
    for mname, clf in models_tfidf.items():
        name = f"{mname}-{vname}"
        print(name)
        # Build TF-IDF pipeline
        pipe = Pipeline([
            ("tfidf", TfidfVectorizer(token_pattern=TOKEN_PATTERN,
                                      ngram_range=ngr,
                                      max_features=MAX_FEATURES)),
            ("clf", clf)
        ])

        # Train on TRAIN
        if mname == "XGBoost" and HAS_XGB:
            # true early stopping via eval_set=validation
            pipe.named_steps["tfidf"].fit(X_train)  # fit vectorizer first
            Xtr_tfidf = pipe.named_steps["tfidf"].transform(X_train)
            Xval_tfidf = pipe.named_steps["tfidf"].transform(X_val)
            # Fit xgb with eval_set for ES
            pipe.named_steps["clf"].fit(
                Xtr_tfidf, y_train,
                eval_set=[(Xval_tfidf, y_val)],
                early_stopping_rounds=20,
                verbose=False
            )
        else:
            # For GradientBoosting, we already set n_iter_no_change + validation_fraction.
            # But sklearn's GB uses an internal split; here we just fit as usual.
            pipe.fit(X_train, y_train)

        # Validate
        y_val_pred = pipe.predict(X_val)
        val_acc, val_f1m, val_f1w, _ = save_report(f"{name}_VAL", y_val, y_val_pred, labels=sorted(y_train.unique()))

        # Test & save report
        y_test_pred = pipe.predict(X_test)
        test_acc, test_f1m, test_f1w, out_path = save_report(f"{name}_TEST", y_test, y_test_pred, labels=sorted(y_train.unique()))

        summary_rows.append([name, vname, "TFIDF", val_acc, val_f1m, val_f1w, test_acc, test_f1m, test_f1w, out_path])



LogisticRegression-2+3gram
LinearSVM-2+3gram




RandomForest-2+3gram
DecisionTree-2+3gram
GradientBoosting-2+3gram
XGBoost-2+3gram


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15], got ['Adventure' 'Biography and Autobiography' 'Classic Novel' 'Classic Story'
 'Contemporary Novel' 'Contemporary Story' 'Cooking, Food and Nutrition'
 'History and Tradition' 'Math' 'Mystery' 'Philosophy' 'Politics'
 'Religious' 'Sciene Fiction' 'Shishu Kishor' 'Thriller']

In [25]:
# -----------------------
# Summary table
# -----------------------
summary_df = pd.DataFrame(summary_rows, columns=[
    "Model", "FeatureSet", "Vectorizer",
    "Val_Acc", "Val_MacroF1", "Val_WeightedF1",
    "Test_Acc", "Test_MacroF1", "Test_WeightedF1",
    "ReportPath"
]).sort_values(["Test_WeightedF1", "Test_MacroF1", "Test_Acc"], ascending=False)

print("\n==== Summary (sorted by Test Weighted F1, then Macro F1, then Accuracy) ====")
print(summary_df.to_string(index=False))

# Save summary CSV
summary_csv = os.path.join(OUT_DIR, "summary_all_models_2+3gram.csv")
summary_df.to_csv(summary_csv, index=False, encoding="utf-8-sig")
print(f"\nSaved summary to: {summary_csv}")



==== Summary (sorted by Test Weighted F1, then Macro F1, then Accuracy) ====
                     Model FeatureSet Vectorizer  Val_Acc  Val_MacroF1  Val_WeightedF1  Test_Acc  Test_MacroF1  Test_WeightedF1                                                    ReportPath
         LinearSVM-2+3gram    2+3gram      TFIDF 0.512811     0.438515        0.504959  0.532755      0.445642         0.524868          results_all_models\LinearSVM-2+3gram_TEST_report.txt
LogisticRegression-2+3gram    2+3gram      TFIDF 0.494124     0.443307        0.492357  0.510790      0.445101         0.510672 results_all_models\LogisticRegression-2+3gram_TEST_report.txt
      RandomForest-2+3gram    2+3gram      TFIDF 0.444808     0.299444        0.402600  0.462042      0.301903         0.417296       results_all_models\RandomForest-2+3gram_TEST_report.txt
      DecisionTree-2+3gram    2+3gram      TFIDF 0.346947     0.257221        0.333140  0.348940      0.242507         0.333878       results_all_models\DecisionT