In [9]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.special import expit
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import (accuracy_score,confusion_matrix, f1_score, roc_auc_score, roc_curve, auc)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict

# Hide FutureWarning messages for the rest of the session (one call is enough).
warnings.filterwarnings("ignore", category=FutureWarning)

# Regexes used by basic_clean; every match is replaced with a single space.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")   # http(s):// or www. links
HTML_PATTERN = re.compile(r"<.*?>")                  # shortest <...> span
ALLOWED_CHARS = re.compile(r"[^a-z0-9\.\!\?\'\s]")   # anything but a-z, 0-9, . ! ? ' and whitespace

# pattern -> replacement, applied in insertion order by expand_contractions.
# The specific forms ("can't", "won't") must stay ahead of the generic "n't".
CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am",
}

# Tokens that open a negation scope in mark_negations.
NEG_CUES = {"not", "never", "no", "n't", "cannot"}

# Tokens dropped by remove_stopwords (compared against whole whitespace tokens).
STOPWORDS = {
    "the", "and", "a", "an", "of", "in", "on", "at", "to", "is",
    "it", "this", "that", "i", "you", "he", "she", "they", "we", "was",
    "were", "be", "been", "am", "are", "but", "if", "or", "as", "with",
    "for", "not", "no", "so", "too", "very",
}

def expand_contractions(t):
    """Rewrite contractions in ``t`` using the CONTRACTIONS table.

    Every key of CONTRACTIONS is used as a regular-expression pattern, in the
    dict's insertion order, and each match is replaced by the mapped value.
    Returns the rewritten string.
    """
    expanded = t
    for pattern in CONTRACTIONS:
        expanded = re.sub(pattern, CONTRACTIONS[pattern], expanded)
    return expanded

def basic_clean(t):
    """Normalise one text string.

    In order: lower-case, expand contractions, blank out HTML tags, URLs and
    every character matched by ALLOWED_CHARS, then collapse whitespace runs to
    a single space and trim both ends.
    """
    text = expand_contractions(t.lower())
    # Same order as always: tags first, then links, then the character whitelist.
    for pattern in (HTML_PATTERN, URL_PATTERN, ALLOWED_CHARS):
        text = pattern.sub(" ", text)
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def clean_series(s):
    """Apply basic_clean to every element of a pandas Series.

    Missing values become the empty string. The previous implementation cast
    to ``str`` before calling ``fillna``, which turned NaN into the literal
    text "nan" and left nothing for ``fillna`` to fill.

    Returns a Series with the same index as ``s``.
    """
    def _clean(value):
        # Map missing entries to "" before stringifying so no "nan" token leaks through.
        return basic_clean("" if pd.isna(value) else str(value))

    return s.map(_clean)

def mark_negations(s):
    """Append ``_NEG`` to negation cues and to the words inside their scope.

    A scope opens at any token found in NEG_CUES and closes at the first token
    that ends in ``.``, ``!`` or ``?``. Trailing sentence punctuation is split
    off before a token is classified, so the last word of a negated clause
    ("not good." -> "not_NEG good_NEG.") and a cue carrying punctuation
    ("not.") are tagged too; previously both were left unmarked because the
    attached punctuation made them fail the word test.

    Parameters
    ----------
    s : pandas.Series of whitespace-tokenisable strings.

    Returns
    -------
    pandas.Series of the rewritten strings, sharing ``s``'s index.
    """
    word_re = re.compile(r"[a-z0-9']+")
    marked_texts = []
    for txt in s:
        in_scope = False
        buf = []
        for tok in txt.split():
            core = tok.rstrip(".!?")      # token without trailing sentence punctuation
            tail = tok[len(core):]        # the stripped punctuation, possibly empty
            if core in NEG_CUES:
                buf.append(f"{core}_NEG{tail}")
                in_scope = True
            elif in_scope and word_re.fullmatch(core):
                buf.append(f"{core}_NEG{tail}")
            else:
                buf.append(tok)
            if tail:
                # Sentence-final punctuation always ends the current scope.
                in_scope = False
        marked_texts.append(" ".join(buf))
    return pd.Series(marked_texts, index=s.index)

def remove_stopwords(s):
    """Drop one-character tokens and STOPWORDS members from each string in ``s``.

    Tokens are produced by whitespace splitting and re-joined with single spaces.
    """
    def _keep(word):
        return len(word) > 1 and word not in STOPWORDS

    def _filter(text):
        return " ".join(filter(_keep, text.split()))

    return s.map(_filter)

def show_cv_table(res, cols):
    """Display the tuned parameters with mean/std test score, best score first.

    ``res`` is a ``cv_results_``-style mapping; ``cols`` lists the parameter
    names whose ``param_<name>`` columns should be shown (prefix stripped).
    """
    param_cols = [f"param_{name}" for name in cols]
    wanted = [*param_cols, "mean_test_score", "std_test_score"]
    table = pd.DataFrame(res).loc[:, wanted]
    table = table.rename(columns={src: dst for src, dst in zip(param_cols, cols)})
    display(table.sort_values(by="mean_test_score", ascending=False))


def evaluate_model(y_true,y_pred,y_prob,name):
    """Print accuracy and macro-F1, then plot a 2x2 confusion matrix and a ROC curve.

    ``y_prob`` is the score passed to ``roc_curve``; the annotation loop is
    hard-wired to two classes.
    """
    print(name)
    print("accuracy", accuracy_score(y_true, y_pred))
    print("f1_macro", f1_score(y_true, y_pred, average="macro"))

    # Confusion matrix heat-map with the count written in every cell.
    cm = confusion_matrix(y_true, y_pred)
    half_max = cm.max() / 2
    plt.figure(figsize=(4, 4))
    plt.imshow(cm, cmap="Blues")
    plt.xticks([0, 1])
    plt.yticks([0, 1])
    for row, col in ((r, c) for r in range(2) for c in range(2)):
        count = cm[row, col]
        # White text on dark cells, black on light ones.
        plt.text(col, row, str(count), ha="center", va="center",
                 color="white" if count > half_max else "black")
    plt.show()

    # ROC curve with its area in the legend.
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    plt.figure()
    plt.plot(fpr, tpr, label="AUC=" + str(auc(fpr, tpr)))
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.show()

def run_cv(build_fn,X,y):
    """Grid-search every pipeline from ``build_fn`` and return the best one.

    For each (name, pipeline) pair the grid in ``grids[name]`` is searched with
    5-fold stratified CV on macro-F1, the results table is displayed, and the
    refit ``best_estimator_`` is re-scored with out-of-fold predictions.

    Out-of-fold predictions are computed in a single cross_val_predict pass:
    models exposing ``predict_proba`` get their labels from the arg-max of the
    probabilities (instead of a second, separate ``predict`` pass); models
    without it fall back to ``decision_function`` thresholded at 0, with a
    logistic squash used as the ROC score.

    Parameters
    ----------
    build_fn : callable returning ``(pipes, grids)``, two dicts keyed by model name.
    X, y : training inputs and binary 0/1 labels (the score branch and
        ``evaluate_model`` assume two classes).

    Returns
    -------
    The fitted best estimator of the model with the highest out-of-fold macro-F1.
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    pipes, grids = build_fn()
    class_labels = np.unique(y)  # column order of predict_proba output
    best_models = {}
    scores = {}
    for name, pipe in pipes.items():
        gs = GridSearchCV(pipe, grids[name], cv=cv, scoring="f1_macro", n_jobs=-1, verbose=1)
        gs.fit(X, y)
        show_cv_table(gs.cv_results_, list(grids[name].keys()))
        model = gs.best_estimator_
        best_models[name] = model
        if hasattr(model, "predict_proba"):
            proba = cross_val_predict(model, X, y, cv=cv, method="predict_proba")
            preds = class_labels[proba.argmax(axis=1)]
            probs = proba[:, 1]
        else:
            # Margin classifiers (e.g. LinearSVC): sign of the margin is the label.
            scr = cross_val_predict(model, X, y, cv=cv, method="decision_function")
            preds = (scr >= 0).astype(int)
            probs = expit(scr)
        evaluate_model(y, preds, probs, name)
        scores[name] = f1_score(y, preds, average="macro")
    best_name = max(scores, key=scores.get)
    return best_models[best_name]

In [None]:
def load_data_fp():
    """Load train1/test1, build the combined review+summary text and binary labels.

    Training rows whose combined text is 10 characters or shorter are dropped.
    Returns ``(train_texts, test_texts, y, test_ids)`` where the texts are
    lists of cleaned, negation-marked, stopword-filtered strings and ``y`` is
    1 when ``overall`` > 1, else 0.
    """
    def _read(path):
        frame = pd.read_csv(path).drop_duplicates()
        frame["combined"] = frame["reviewText"].fillna("") + " " + frame["summary"].fillna("")
        return frame

    def _prepare(texts):
        return remove_stopwords(mark_negations(clean_series(texts))).tolist()

    tr = _read("train1.csv")
    te = _read("test1.csv")
    long_enough = tr["combined"].str.len() > 10
    tr = tr[long_enough].reset_index(drop=True)

    train_texts = _prepare(tr["combined"])
    test_texts = _prepare(te["combined"])
    y = (tr["overall"] > 1).astype(int).values
    ids = te["id"].tolist()
    return train_texts, test_texts, y, ids

def build_pipelines_fp():
    """Return ``(pipelines, grids)`` for the NB / LR / SVM text baselines.

    Both dicts are keyed "NB", "LR", "SVM"; each pipeline is a vectoriser
    ("vect") followed by a classifier ("clf").
    """
    # Settings shared by the NB count features and the LR tf-idf features.
    shared = dict(max_features=10000, ngram_range=(1, 2), stop_words="english", min_df=3, max_df=0.7)
    pipes = {
        "NB": Pipeline([
            ("vect", CountVectorizer(binary=True, **shared)),
            ("clf", MultinomialNB()),
        ]),
        "LR": Pipeline([
            ("vect", TfidfVectorizer(sublinear_tf=True, **shared)),
            ("clf", LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=300)),
        ]),
        "SVM": Pipeline([
            ("vect", TfidfVectorizer(stop_words="english", lowercase=True, max_df=0.8, min_df=3,
                                     sublinear_tf=True, ngram_range=(1, 2), max_features=10000)),
            ("clf", LinearSVC(class_weight="balanced", max_iter=3000)),
        ]),
    }
    grids = {
        "NB": {"clf__alpha": [0.1, 1]},
        "LR": {"clf__C": [0.5, 1]},
        "SVM": {"clf__C": [0.5, 1]},
    }
    return pipes, grids

if __name__ == "__main__":
    # Text-only binary run: load data, cross-validate every pipeline, keep the winner.
    X_tr,X_te,y,ids = load_data_fp()
    # run_cv returns the refit best estimator of the model with the highest macro-F1.
    best_model = run_cv(build_pipelines_fp,X_tr,y)
    preds = best_model.predict(X_te)
    # NOTE(review): sibling cells write "test1_output2.csv" / "test1_output3.csv";
    # confirm the "text1" prefix here is intentional.
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("text1_output1.csv",index=False)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [None]:
def load_data_mid():
    """Load train1/test1 with text plus metadata features for the "mid" model.

    For both frames: ``reviewText`` and ``summary`` are cleaned and
    negation-marked, ``category`` becomes a string column, ``verified`` an
    int flag (missing -> 0) and ``vote`` a float with thousands separators
    removed (unparseable or missing -> 0.0).

    Missing categories become "" rather than the literal text "nan": the fill
    now happens before the cast to ``str`` (casting first made ``fillna`` a no-op).

    Returns ``(train_frame_without_overall, test_frame, y, test_ids)`` with
    ``y`` = 1 when ``overall`` > 1, else 0.
    """
    tr = pd.read_csv("train1.csv").drop_duplicates()
    te = pd.read_csv("test1.csv").drop_duplicates()
    for df in (tr, te):
        df["reviewText"] = mark_negations(clean_series(df["reviewText"]))
        df["summary"] = mark_negations(clean_series(df["summary"]))
        # Fill first, then cast: astype(str) would stringify NaN to "nan".
        df["category"] = df["category"].fillna("").astype(str)
        df["verified"] = df["verified"].fillna(False).astype(int)
        # Votes may be formatted like "1,234"; strip commas before parsing.
        vote_text = df["vote"].astype(str).str.replace(",", "")
        df["vote"] = pd.to_numeric(vote_text, errors="coerce").fillna(0.0)
    y = (tr["overall"] > 1).astype(int).values
    ids = te["id"].tolist()
    return tr.drop(columns=["overall"]), te, y, ids

def preprocessor_mid():
    """Build the ColumnTransformer combining text and metadata features.

    Word tf-idf on ``reviewText`` and ``summary``, character tf-idf on
    ``category``, one-hot on ``verified`` and ``log1p`` on ``vote``; all other
    columns are dropped.
    """
    transformers = [
        ("review",
         TfidfVectorizer(max_features=60000, ngram_range=(1, 2), stop_words="english", min_df=3, sublinear_tf=True),
         "reviewText"),
        ("summary",
         TfidfVectorizer(max_features=15000, ngram_range=(1, 1), stop_words="english", min_df=2, sublinear_tf=True),
         "summary"),
        ("category",
         TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=1),
         "category"),
        ("verified",
         OneHotEncoder(handle_unknown="ignore"),
         ["verified"]),
        ("vote",
         FunctionTransformer(func=np.log1p, validate=False, feature_names_out="one-to-one"),
         ["vote"]),
    ]
    return ColumnTransformer(transformers, remainder="drop", sparse_threshold=0.3)

def build_pipelines_mid():
    """Return ``(pipelines, grids)`` for NB / LR / SVM on the "mid" feature set.

    Each pipeline gets its *own* preprocessor from ``preprocessor_mid()``.
    Previously all three shared a single ColumnTransformer instance, so fitting
    any one of the returned pipelines directly would silently refit the "feat"
    step seen by the other two.
    """
    classifiers = {
        "NB": MultinomialNB(),
        "LR": LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=2000),
        "SVM": LinearSVC(class_weight="balanced", max_iter=3000),
    }
    pipes = {
        name: Pipeline([("feat", preprocessor_mid()), ("clf", clf)])
        for name, clf in classifiers.items()
    }
    grids = {
        "NB": {"clf__alpha": [0.1, 0.5, 1.0]},
        "LR": {"clf__C": [0.5, 1.0, 2.0]},
        "SVM": {"clf__C": [0.5, 1.0, 2.0]},
    }
    return pipes, grids

if __name__ == "__main__":
    # Text + metadata binary run; X_tr / X_te are DataFrames consumed by the ColumnTransformer.
    X_tr,X_te,y,ids = load_data_mid()
    # run_cv returns the refit best estimator of the model with the highest macro-F1.
    best_model = run_cv(build_pipelines_mid,X_tr,y)
    preds = best_model.predict(X_te)
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("test1_output2.csv",index=False)

In [None]:
def load_data_last():
    """Load train1/test1 keeping only reviewText, summary and category.

    Text columns are cleaned and negation-marked. ``category`` is filled with
    "" *before* being cast to ``str`` so that missing categories do not turn
    into the literal text "nan" (casting first made ``fillna`` a no-op).

    Returns ``(X_tr, X_te, y, test_ids)``; the X frames hold the three feature
    columns and ``y`` is 1 when ``overall`` > 1, else 0.
    """
    feature_cols = ["reviewText", "summary", "category"]
    tr = pd.read_csv("train1.csv").drop_duplicates()
    te = pd.read_csv("test1.csv").drop_duplicates()
    for df in (tr, te):
        df["reviewText"] = mark_negations(clean_series(df["reviewText"]))
        df["summary"] = mark_negations(clean_series(df["summary"]))
        # Fill first, then cast: astype(str) would stringify NaN to "nan".
        df["category"] = df["category"].fillna("").astype(str)
    y = (tr["overall"] > 1).astype(int).values
    ids = te["id"].tolist()
    X_tr = tr[feature_cols].copy()
    X_te = te[feature_cols].copy()
    return X_tr, X_te, y, ids

def vectoriser_last():
    """Build the three-column text ColumnTransformer (review, summary, category)."""
    review_vec = TfidfVectorizer(max_features=60000, ngram_range=(1, 2), stop_words="english", min_df=3, sublinear_tf=True)
    summary_vec = TfidfVectorizer(max_features=15000, ngram_range=(1, 1), stop_words="english", min_df=2, sublinear_tf=True)
    # Category strings are short, so they are encoded with character 3-5 grams.
    category_vec = TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=1)
    column_specs = [
        ("review", review_vec, "reviewText"),
        ("summary", summary_vec, "summary"),
        ("category", category_vec, "category"),
    ]
    return ColumnTransformer(transformers=column_specs, remainder="drop", sparse_threshold=0.3)

def build_pipelines_last():
    """Return ``(pipelines, grids)`` for NB / LR / SVM on the text-only columns.

    Each pipeline gets its *own* transformer from ``vectoriser_last()``.
    Previously one ColumnTransformer instance was shared by all three, so
    fitting any returned pipeline directly would refit the "vec" step seen by
    the others.
    """
    classifiers = {
        "NB": MultinomialNB(),
        "LR": LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=2000),
        "SVM": LinearSVC(class_weight="balanced", max_iter=3000),
    }
    pipes = {
        name: Pipeline([("vec", vectoriser_last()), ("clf", clf)])
        for name, clf in classifiers.items()
    }
    grids = {
        "NB": {"clf__alpha": [0.1, 0.5, 1.0]},
        "LR": {"clf__C": [0.5, 1.0, 2.0]},
        "SVM": {"clf__C": [0.5, 1.0, 2.0]}
    }
    return pipes, grids

if __name__ == "__main__":
    # Three-text-column binary run (reviewText, summary, category).
    X_tr, X_te, y, ids = load_data_last()
    # run_cv returns the refit best estimator of the model with the highest macro-F1.
    best_model = run_cv(build_pipelines_last, X_tr, y)
    preds = best_model.predict(X_te)
    pd.DataFrame({"id": ids, "overall": preds}).to_csv("test1_output3.csv", index=False)

In [3]:
import warnings
import pandas as pd
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, accuracy_score, classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize


# Hide FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

def load_data_mc():
    """Load train5/test5 for the multi-class task.

    Builds the combined review+summary text, drops training rows whose
    combined text is 10 characters or shorter, and returns
    ``(train_texts, test_texts, y, test_ids)`` with ``y`` the integer
    ``overall`` rating.

    Relies on clean_series / mark_negations / remove_stopwords already being
    defined in the session.
    """
    def _combine(frame):
        return frame["reviewText"].fillna("") + " " + frame["summary"].fillna("")

    def _prepare(texts):
        cleaned = clean_series(texts)
        return remove_stopwords(mark_negations(cleaned)).tolist()

    tr = pd.read_csv("train5.csv").drop_duplicates().reset_index(drop=True)
    te = pd.read_csv("test5.csv").drop_duplicates()
    tr["combined"] = _combine(tr)
    te["combined"] = _combine(te)
    long_enough = tr["combined"].str.len() > 10
    tr = tr[long_enough].reset_index(drop=True)

    train_texts = _prepare(tr["combined"])
    test_texts = _prepare(te["combined"])
    y = tr["overall"].astype(int).values
    ids = te["id"].tolist()
    return train_texts, test_texts, y, ids

def vectorisers_mc():
    """Return ``(count_vectoriser, tfidf_vectoriser, svm_tfidf_vectoriser)``."""
    # Settings shared by the binary count features and the first tf-idf.
    shared = dict(max_features=10000, ngram_range=(1, 2), stop_words="english", min_df=3, max_df=0.7)
    return (
        CountVectorizer(binary=True, **shared),
        TfidfVectorizer(sublinear_tf=True, **shared),
        TfidfVectorizer(stop_words="english", lowercase=True, max_df=0.8, min_df=3,
                        sublinear_tf=True, ngram_range=(1, 2), max_features=10000),
    )

def build_pipelines_mc():
    """Return ``(pipelines, grids)`` for the multi-class NB / LR / calibrated-SVM models.

    The LR step no longer passes ``multi_class="multinomial"``: the parameter
    is deprecated in recent scikit-learn releases, and with the ``lbfgs``
    solver the default already fits a multinomial model when there are more
    than two classes.
    NOTE(review): assumes ``y`` has more than two classes, as the "mc" naming
    suggests — confirm.

    The SVM pipeline is tf-idf -> chi2 feature selection -> sigmoid-calibrated
    LinearSVC, so it exposes ``predict_proba`` like the other two.
    """
    cnt, tfd, tfidf_svm = vectorisers_mc()
    pipe_nb = Pipeline([("vect", cnt), ("clf", MultinomialNB())])
    pipe_lr = Pipeline([
        ("vect", tfd),
        ("clf", LogisticRegression(class_weight="balanced", solver="lbfgs", max_iter=500)),
    ])
    pipe_svm = Pipeline([
        ("tfidf", tfidf_svm),
        ("select", SelectKBest(chi2, k=6000)),
        ("calib", CalibratedClassifierCV(estimator=LinearSVC(class_weight="balanced", max_iter=5000), method="sigmoid", cv=5, n_jobs=-1))
    ])
    grids = {
        "NB": {"clf__alpha": [0.1, 1.0]},
        "LR": {"clf__C": [0.5, 1.0]},
        # C belongs to the LinearSVC wrapped inside the calibrator.
        "SVM": {"calib__estimator__C": [0.5, 1.0]},
    }
    return {"NB": pipe_nb, "LR": pipe_lr, "SVM": pipe_svm}, grids

def evaluate_model_mc(y_true, y_pred, y_score, name):
    """Plot a confusion matrix and a ROC curve, print the metrics, return macro-F1.

    The plotted curve pools all one-vs-rest decisions (binarised labels and
    scores are flattened), while the legend shows the macro-averaged
    one-vs-rest AUC from ``roc_auc_score``.
    """
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="macro")

    classes = np.unique(y_true)
    ticks = range(len(classes))
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    half_max = cm.max() / 2
    plt.figure(figsize=(4, 4))
    plt.imshow(cm, cmap="Blues")
    plt.xticks(ticks, classes)
    plt.yticks(ticks, classes)
    for i in ticks:
        for j in ticks:
            count = cm[i, j]
            plt.text(j, i, str(count), ha="center", va="center",
                     color="white" if count > half_max else "black")
    plt.title(f"{name} Confusion Matrix")
    plt.show()

    y_bin = label_binarize(y_true, classes=classes)
    fpr, tpr, _ = roc_curve(y_bin.ravel(), y_score.ravel())
    auc_val = roc_auc_score(y_true, y_score, multi_class="ovr", average="macro")
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC={auc_val:.4f}")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.title(f"{name} ROC")
    plt.show()

    print(name, "accuracy", acc, "f1_macro", f1, "roc_auc_macro", auc_val)
    return f1

def run_holdout(build_fn, X, y):
    """Tune each pipeline on an 80% split and pick the best on the 20% hold-out.

    Every pipeline from ``build_fn`` is grid-searched (5-fold stratified CV,
    macro-F1) on the training part, then scored on the stratified hold-out
    with ``evaluate_model_mc``. Returns the best estimator of the model with
    the highest hold-out macro-F1 (fitted on the 80% split only).
    """
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    pipes, grids = build_fn()
    # A fixed random_state yields the same folds for every model.
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    best_models = {}
    scores = {}
    for name, pipe in pipes.items():
        gs = GridSearchCV(pipe, grids[name], cv=inner_cv, scoring="f1_macro", n_jobs=-1, verbose=1)
        gs.fit(X_tr, y_tr)
        show_cv_table(gs.cv_results_, list(grids[name].keys()))
        fitted = gs.best_estimator_
        best_models[name] = fitted
        y_pred = fitted.predict(X_val)
        y_proba = fitted.predict_proba(X_val)
        scores[name] = evaluate_model_mc(y_val, y_pred, y_proba, name)
    winner = max(scores, key=scores.get)
    return best_models[winner]

if __name__ == "__main__":
    # Multi-class run. load_data_mc needs the text helpers (clean_series, mark_negations,
    # remove_stopwords) to be defined in the session already; the recorded NameError
    # shows this cell was executed without them.
    X_tr, X_te, y, ids = load_data_mc()
    best_model = run_holdout(build_pipelines_mc, X_tr, y)
    # run_holdout fitted the winner on the 80% split only; refit on all training data.
    best_model.fit(X_tr, y)
    preds = best_model.predict(X_te)
    # NOTE(review): the data comes from train5/test5 but the file is named "text1_..."; confirm.
    pd.DataFrame({"id": ids, "overall": preds}).to_csv("text1_output4_mc.csv", index=False)


NameError: name 'remove_stopwords' is not defined

In [None]:


# Load the multi-class data; only the training set is de-duplicated.
train = pd.read_csv("train5.csv").drop_duplicates().reset_index(drop=True)
test = pd.read_csv("test5.csv").reset_index(drop=True)

# Fill missing text with "" before the str cast, then join review and summary.
for df in (train, test):
    df["reviewText"] = df["reviewText"].fillna("").astype(str)
    df["summary"] = df["summary"].fillna("").astype(str)
    df["combined"] = df["reviewText"] + " " + df["summary"]

# Drop training rows whose combined text is 10 characters or shorter.
train = train[train["combined"].str.len() > 10].reset_index(drop=True)

def preprocess_text(s):
    """Clean, negation-mark and stopword-filter a Series; return a list of strings."""
    cleaned = clean_series(s)
    marked = mark_negations(cleaned)
    filtered = remove_stopwords(marked)
    return filtered.tolist()

# Preprocessed text, integer labels and submission ids.
train_texts = preprocess_text(train["combined"])
test_texts = preprocess_text(test["combined"])
y_train = train["overall"].astype(int).values
test_ids = test["id"].tolist()

# Binary bag-of-words features, used by the Naive Bayes model.
# NOTE(review): count_vect, word_tfidf and char_tfidf are fitted on ALL of train_texts
# here, and the resulting matrices are later handed to GridSearchCV / cross_val_predict.
# Each validation fold has therefore already influenced the vocabulary and IDF weights,
# whereas the SVM pipeline further down vectorises inside the pipeline. Consider
# wrapping these vectorisers in a Pipeline so all three models are scored the same way.
count_vect = CountVectorizer(
    max_features=10000, ngram_range=(1,2),
    stop_words="english", min_df=3, max_df=0.7, binary=True
)
X_count = count_vect.fit_transform(train_texts)
Xc_test = count_vect.transform(test_texts)

# Word-level and character-level tf-idf, stacked side by side for logistic regression.
word_tfidf = TfidfVectorizer(
    analyzer="word", ngram_range=(1,2),
    stop_words="english", min_df=3, max_df=0.7,
    max_features=20000, sublinear_tf=True
)
char_tfidf = TfidfVectorizer(
    analyzer="char", ngram_range=(3,5),
    min_df=3, max_df=0.7, max_features=30000, sublinear_tf=True
)
X_train = sp.hstack([
    word_tfidf.fit_transform(train_texts),
    char_tfidf.fit_transform(train_texts)
]).tocsr()
# Test features reuse the vocabularies fitted on the training texts.
X_test = sp.hstack([
    word_tfidf.transform(test_texts),
    char_tfidf.transform(test_texts)
]).tocsr()

# One fixed fold definition shared by every grid search and cross_val_predict call.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# model name -> metrics dict returned by evaluate_metrics
results = {}

def evaluate_metrics(name, y_true, y_pred, y_score):
    """Report multi-class metrics and plots for one model.

    Prints accuracy, macro-F1, macro one-vs-rest AUC and the classification
    report, draws the confusion matrix and one ROC curve per column of
    ``y_score``, and returns the three headline metrics in a dict.
    """
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "roc_auc": roc_auc_score(y_true, y_score, multi_class="ovr", average="macro"),
    }
    acc, f1m, roc = metrics["accuracy"], metrics["f1_macro"], metrics["roc_auc"]
    print(f"\n{name}  ACC:{acc:.4f}  F1:{f1m:.4f}  AUC:{roc:.4f}")
    print(classification_report(y_true, y_pred, digits=4))

    # Confusion matrix with the count written in each cell.
    cm = confusion_matrix(y_true, y_pred)
    half_max = cm.max() / 2
    n_rows, n_cols = cm.shape
    plt.figure(figsize=(4, 4))
    plt.imshow(cm, cmap="Blues")
    for i in range(n_rows):
        for j in range(n_cols):
            text_colour = "white" if cm[i, j] > half_max else "black"
            plt.text(j, i, cm[i, j], ha="center", va="center", color=text_colour)
    plt.xlabel("Pred")
    plt.ylabel("True")
    plt.show()

    # One-vs-rest ROC curve per class column; "C{i}" is the column index.
    yb = pd.get_dummies(y_true).values
    plt.figure()
    for i in range(yb.shape[1]):
        fpr, tpr, _ = roc_curve(yb[:, i], y_score[:, i])
        plt.plot(fpr, tpr, label=f"{name} C{i}(AUC={auc(fpr,tpr):.2f})")
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.show()
    return metrics

# --- Naive Bayes on the binary count features ---
nb = MultinomialNB()
g_nb = GridSearchCV(nb, {"alpha":[0.1,1.0]}, cv=cv, scoring="f1_macro", n_jobs=-1, verbose=1)
g_nb.fit(X_count, y_train)
b_nb = g_nb.best_estimator_
# Out-of-fold labels and probabilities for the tuned model (two separate CV passes).
p_nb = cross_val_predict(b_nb, X_count, y_train, cv=cv)
pr_nb = cross_val_predict(b_nb, X_count, y_train, cv=cv, method="predict_proba")
results["NB"] = evaluate_metrics("NB", y_train, p_nb, pr_nb)

# --- Logistic regression on the stacked word + char tf-idf features ---
lr = LogisticRegression(
    multi_class="multinomial", solver="lbfgs",
    class_weight="balanced", max_iter=500
)
g_lr = GridSearchCV(lr, {"C":[0.5,1.0]}, cv=cv, scoring="f1_macro", n_jobs=-1, verbose=1)
g_lr.fit(X_train, y_train)
b_lr = g_lr.best_estimator_
p_lr = cross_val_predict(b_lr, X_train, y_train, cv=cv)
pr_lr = cross_val_predict(b_lr, X_train, y_train, cv=cv, method="predict_proba")
results["LR"] = evaluate_metrics("LR", y_train, p_lr, pr_lr)

# --- Calibrated linear SVM; this pipeline vectorises the raw text itself ---
svm_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english", lowercase=True,
        max_df=0.8, min_df=3, sublinear_tf=True,
        ngram_range=(1,2), max_features=10000
    )),
    ("select", SelectKBest(chi2, k=6000)),
    ("calib", CalibratedClassifierCV(
        estimator=LinearSVC(class_weight="balanced", max_iter=5000),
        method="sigmoid", cv=5, n_jobs=-1
    ))
])
# The tuned C belongs to the LinearSVC wrapped inside the calibrator.
g_svm = GridSearchCV(svm_pipe, {"calib__estimator__C":[0.5,1.0]}, cv=cv, scoring="f1_macro", n_jobs=-1, verbose=1)
g_svm.fit(train_texts, y_train)
b_svm = g_svm.best_estimator_
p_svm = cross_val_predict(b_svm, train_texts, y_train, cv=cv, method="predict")
pr_svm = cross_val_predict(b_svm, train_texts, y_train, cv=cv, method="predict_proba")
results["SVM"] = evaluate_metrics("SVM", y_train, p_svm, pr_svm)

# Pick the model with the highest out-of-fold macro-F1.
best = max(results, key=lambda k: results[k]["f1_macro"])
print(f"Best:{best} F1={results[best]['f1_macro']:.4f}")

# Each model expects its own feature representation at prediction time.
if best == "NB":
    final = b_nb.predict(Xc_test)
elif best == "LR":
    final = b_lr.predict(X_test)
else:
    final = b_svm.predict(test_texts)

pd.DataFrame({"id": test_ids, "overall": final}).to_csv("submission_multiclass_full_metrics.csv", index=False)


In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# --- Configuration for the clustering experiment ---
TEXT_COLS = ["reviewText", "summary"]   # columns joined into one document per row
LABEL_COL = "category"                  # reference labels; also sets the cluster count
N_COMPONENTS = 120                      # TruncatedSVD size; a falsy value skips the SVD step
N_INIT = 20                             # KMeans restarts
RANDOM_STATE = 42
MIN_DF = 3
MAX_FEATURES = 65000
NGRAM_RANGE = (1, 2)
BASELINE_SIL = 0.050                    # silhouette score to reach for the "passed" message

# Rows are shuffled with a fixed seed, then re-indexed.
train_df = shuffle(pd.read_csv("train5.csv"), random_state=RANDOM_STATE).reset_index(drop=True)
test_df = shuffle(pd.read_csv("test5.csv"), random_state=RANDOM_STATE).reset_index(drop=True)

# Join the text columns with a space; missing values count as "".
train_combined = train_df[TEXT_COLS].fillna("").agg(" ".join, axis=1)
test_combined = test_df[TEXT_COLS].fillna("").agg(" ".join, axis=1)

# The text helpers are not defined in this cell; they must already exist in the session.
train_texts = remove_stopwords(mark_negations(clean_series(train_combined)))
test_texts = remove_stopwords(mark_negations(clean_series(test_combined)))

steps = [
    ("tfidf", TfidfVectorizer(
        lowercase=False,
        stop_words="english",
        min_df=MIN_DF,
        max_features=MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        sublinear_tf=True
    ))
]

# Optional dimensionality reduction after tf-idf.
if N_COMPONENTS:
    steps.append(("svd", TruncatedSVD(n_components=N_COMPONENTS, random_state=RANDOM_STATE)))

vector_pipe = Pipeline(steps)

# The vectoriser (and SVD) are fitted on the training texts only; X_train itself is
# not used again in this cell.
X_train = vector_pipe.fit_transform(train_texts)
X_test = vector_pipe.transform(test_texts)

# Integer-encode the test-set categories for the external clustering metric.
le = LabelEncoder()
y_test = le.fit_transform(test_df[LABEL_COL])

# One cluster per distinct category in the test set.
n_clusters = len(le.classes_)
print(f"Clustering into {n_clusters} clusters.")

km = KMeans(
    n_clusters=n_clusters,
    n_init=N_INIT,
    max_iter=300,
    random_state=RANDOM_STATE,
    verbose=0
)

# KMeans is fitted directly on the test features.
pred_clusters = km.fit_predict(X_test)

# Internal quality (cosine silhouette) and agreement with the true categories (ARI).
sil = silhouette_score(X_test, pred_clusters, metric="cosine")
ari = adjusted_rand_score(y_test, pred_clusters)

print("Results:")
print(f"Silhouette score (cosine): {sil:.4f}")
print(f"Adjusted Rand Index:       {ari:.4f}")
print("Passed the baseline silhouette threshold!" if sil >= BASELINE_SIL else "Below the baseline silhouette threshold.")

if __name__ == "__main__":
    # Contingency table of cluster id vs encoded true label (first 10 cluster rows).
    crosstab = pd.crosstab(pred_clusters, y_test, rownames=["Cluster"], colnames=["True Label"])
    print("Cluster ↔ True-label contingency:")
    print(crosstab.head(10))


In [6]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.special import expit
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import (accuracy_score,confusion_matrix, f1_score, roc_auc_score, roc_curve, auc)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict

# Hide FutureWarning messages for the rest of the session (one call is enough).
warnings.filterwarnings("ignore", category=FutureWarning)

# Regexes used by basic_clean; every match is replaced with a single space.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")   # http(s):// or www. links
HTML_PATTERN = re.compile(r"<.*?>")                  # shortest <...> span
ALLOWED_CHARS = re.compile(r"[^a-z0-9\.\!\?\'\s]")   # anything but a-z, 0-9, . ! ? ' and whitespace

# pattern -> replacement, applied in insertion order by expand_contractions.
# The specific forms ("can't", "won't") must stay ahead of the generic "n't".
CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am",
}

# Tokens that open a negation scope in mark_negations.
NEG_CUES = {"not", "never", "no", "n't", "cannot"}

# Tokens dropped by remove_stopwords (compared against whole whitespace tokens).
STOPWORDS = {
    "the", "and", "a", "an", "of", "in", "on", "at", "to", "is",
    "it", "this", "that", "i", "you", "he", "she", "they", "we", "was",
    "were", "be", "been", "am", "are", "but", "if", "or", "as", "with",
    "for", "not", "no", "so", "too", "very",
}

def expand_contractions(t):
    """Rewrite contractions in ``t`` using the CONTRACTIONS table.

    Every key of CONTRACTIONS is used as a regular-expression pattern, in the
    dict's insertion order, and each match is replaced by the mapped value.
    Returns the rewritten string.
    """
    expanded = t
    for pattern in CONTRACTIONS:
        expanded = re.sub(pattern, CONTRACTIONS[pattern], expanded)
    return expanded

def basic_clean(t):
    """Normalise one text string.

    In order: lower-case, expand contractions, blank out HTML tags, URLs and
    every character matched by ALLOWED_CHARS, then collapse whitespace runs to
    a single space and trim both ends.
    """
    text = expand_contractions(t.lower())
    # Same order as always: tags first, then links, then the character whitelist.
    for pattern in (HTML_PATTERN, URL_PATTERN, ALLOWED_CHARS):
        text = pattern.sub(" ", text)
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def clean_series(s):
    """Apply basic_clean to every element of a pandas Series.

    Missing values become the empty string. The previous implementation cast
    to ``str`` before calling ``fillna``, which turned NaN into the literal
    text "nan" and left nothing for ``fillna`` to fill.

    Returns a Series with the same index as ``s``.
    """
    def _clean(value):
        # Map missing entries to "" before stringifying so no "nan" token leaks through.
        return basic_clean("" if pd.isna(value) else str(value))

    return s.map(_clean)

def mark_negations(s):
    """Append ``_NEG`` to negation cues and to the words inside their scope.

    A scope opens at any token found in NEG_CUES and closes at the first token
    that ends in ``.``, ``!`` or ``?``. Trailing sentence punctuation is split
    off before a token is classified, so the last word of a negated clause
    ("not good." -> "not_NEG good_NEG.") and a cue carrying punctuation
    ("not.") are tagged too; previously both were left unmarked because the
    attached punctuation made them fail the word test.

    Parameters
    ----------
    s : pandas.Series of whitespace-tokenisable strings.

    Returns
    -------
    pandas.Series of the rewritten strings, sharing ``s``'s index.
    """
    word_re = re.compile(r"[a-z0-9']+")
    marked_texts = []
    for txt in s:
        in_scope = False
        buf = []
        for tok in txt.split():
            core = tok.rstrip(".!?")      # token without trailing sentence punctuation
            tail = tok[len(core):]        # the stripped punctuation, possibly empty
            if core in NEG_CUES:
                buf.append(f"{core}_NEG{tail}")
                in_scope = True
            elif in_scope and word_re.fullmatch(core):
                buf.append(f"{core}_NEG{tail}")
            else:
                buf.append(tok)
            if tail:
                # Sentence-final punctuation always ends the current scope.
                in_scope = False
        marked_texts.append(" ".join(buf))
    return pd.Series(marked_texts, index=s.index)

def remove_stopwords(s):
    """Drop one-character tokens and STOPWORDS members from each string in ``s``.

    Tokens are produced by whitespace splitting and re-joined with single spaces.
    """
    def _keep(word):
        return len(word) > 1 and word not in STOPWORDS

    def _filter(text):
        return " ".join(filter(_keep, text.split()))

    return s.map(_filter)

def show_cv_table(res, cols):
    """Display the tuned parameters with mean/std test score, best score first.

    ``res`` is a ``cv_results_``-style mapping; ``cols`` lists the parameter
    names whose ``param_<name>`` columns should be shown (prefix stripped).
    """
    param_cols = [f"param_{name}" for name in cols]
    wanted = [*param_cols, "mean_test_score", "std_test_score"]
    table = pd.DataFrame(res).loc[:, wanted]
    table = table.rename(columns={src: dst for src, dst in zip(param_cols, cols)})
    display(table.sort_values(by="mean_test_score", ascending=False))


def evaluate_model(y_true,y_pred,y_prob,name):
    """Print accuracy and macro-F1, then plot a 2x2 confusion matrix and a ROC curve.

    ``y_prob`` is the score passed to ``roc_curve``; the annotation loop is
    hard-wired to two classes.
    """
    print(name)
    print("accuracy", accuracy_score(y_true, y_pred))
    print("f1_macro", f1_score(y_true, y_pred, average="macro"))

    # Confusion matrix heat-map with the count written in every cell.
    cm = confusion_matrix(y_true, y_pred)
    half_max = cm.max() / 2
    plt.figure(figsize=(4, 4))
    plt.imshow(cm, cmap="Blues")
    plt.xticks([0, 1])
    plt.yticks([0, 1])
    for row, col in ((r, c) for r in range(2) for c in range(2)):
        count = cm[row, col]
        # White text on dark cells, black on light ones.
        plt.text(col, row, str(count), ha="center", va="center",
                 color="white" if count > half_max else "black")
    plt.show()

    # ROC curve with its area in the legend.
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    plt.figure()
    plt.plot(fpr, tpr, label="AUC=" + str(auc(fpr, tpr)))
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.show()

def run_cv(build_fn,X,y):
    """Grid-search every pipeline from ``build_fn`` and return the best one.

    For each (name, pipeline) pair the grid in ``grids[name]`` is searched with
    5-fold stratified CV on macro-F1, the results table is displayed, and the
    refit ``best_estimator_`` is re-scored with out-of-fold predictions.

    Out-of-fold predictions are computed in a single cross_val_predict pass:
    models exposing ``predict_proba`` get their labels from the arg-max of the
    probabilities (instead of a second, separate ``predict`` pass); models
    without it fall back to ``decision_function`` thresholded at 0, with a
    logistic squash used as the ROC score.

    Parameters
    ----------
    build_fn : callable returning ``(pipes, grids)``, two dicts keyed by model name.
    X, y : training inputs and binary 0/1 labels (the score branch and
        ``evaluate_model`` assume two classes).

    Returns
    -------
    The fitted best estimator of the model with the highest out-of-fold macro-F1.
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    pipes, grids = build_fn()
    class_labels = np.unique(y)  # column order of predict_proba output
    best_models = {}
    scores = {}
    for name, pipe in pipes.items():
        gs = GridSearchCV(pipe, grids[name], cv=cv, scoring="f1_macro", n_jobs=-1, verbose=1)
        gs.fit(X, y)
        show_cv_table(gs.cv_results_, list(grids[name].keys()))
        model = gs.best_estimator_
        best_models[name] = model
        if hasattr(model, "predict_proba"):
            proba = cross_val_predict(model, X, y, cv=cv, method="predict_proba")
            preds = class_labels[proba.argmax(axis=1)]
            probs = proba[:, 1]
        else:
            # Margin classifiers (e.g. LinearSVC): sign of the margin is the label.
            scr = cross_val_predict(model, X, y, cv=cv, method="decision_function")
            preds = (scr >= 0).astype(int)
            probs = expit(scr)
        evaluate_model(y, preds, probs, name)
        scores[name] = f1_score(y, preds, average="macro")
    best_name = max(scores, key=scores.get)
    return best_models[best_name]

def load_data_fp():
    """Load train2/test2, build the combined review+summary text and binary labels.

    Training rows whose combined text is 10 characters or shorter are dropped.
    Returns ``(train_texts, test_texts, y, test_ids)`` where the texts are
    lists of cleaned, negation-marked, stopword-filtered strings and ``y`` is
    1 when ``overall`` > 2, else 0.
    """
    def _read(path):
        frame = pd.read_csv(path).drop_duplicates()
        frame["combined"] = frame["reviewText"].fillna("") + " " + frame["summary"].fillna("")
        return frame

    def _prepare(texts):
        return remove_stopwords(mark_negations(clean_series(texts))).tolist()

    tr = _read("train2.csv")
    te = _read("test2.csv")
    long_enough = tr["combined"].str.len() > 10
    tr = tr[long_enough].reset_index(drop=True)

    train_texts = _prepare(tr["combined"])
    test_texts = _prepare(te["combined"])
    y = (tr["overall"] > 2).astype(int).values
    ids = te["id"].tolist()
    return train_texts, test_texts, y, ids

def build_pipelines_fp():
    """Return ``(pipelines, grids)`` for the NB / LR / SVM text baselines.

    Both dicts are keyed "NB", "LR", "SVM"; each pipeline is a vectoriser
    ("vect") followed by a classifier ("clf").
    """
    # Settings shared by the NB count features and the LR tf-idf features.
    shared = dict(max_features=10000, ngram_range=(1, 2), stop_words="english", min_df=3, max_df=0.7)
    pipes = {
        "NB": Pipeline([
            ("vect", CountVectorizer(binary=True, **shared)),
            ("clf", MultinomialNB()),
        ]),
        "LR": Pipeline([
            ("vect", TfidfVectorizer(sublinear_tf=True, **shared)),
            ("clf", LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=300)),
        ]),
        "SVM": Pipeline([
            ("vect", TfidfVectorizer(stop_words="english", lowercase=True, max_df=0.8, min_df=3,
                                     sublinear_tf=True, ngram_range=(1, 2), max_features=10000)),
            ("clf", LinearSVC(class_weight="balanced", max_iter=3000)),
        ]),
    }
    grids = {
        "NB": {"clf__alpha": [0.1, 1]},
        "LR": {"clf__C": [0.5, 1]},
        "SVM": {"clf__C": [0.5, 1]},
    }
    return pipes, grids

if __name__ == "__main__":
    # Text-only binary run on the train2/test2 data.
    X_tr,X_te,y,ids = load_data_fp()
    # run_cv returns the refit best estimator of the model with the highest macro-F1.
    best_model = run_cv(build_pipelines_fp,X_tr,y)
    preds = best_model.predict(X_te)
    # NOTE(review): the next driver in this cell writes "test2_output2.csv"; confirm the
    # "text2" prefix here is intentional.
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("text2_output1.csv",index=False)

def load_data_mid():
    """Load train2/test2 with text plus metadata features for the "mid" model.

    For both frames: ``reviewText`` and ``summary`` are cleaned and
    negation-marked, ``category`` becomes a string column, ``verified`` an
    int flag (missing -> 0) and ``vote`` a float with thousands separators
    removed (unparseable or missing -> 0.0).

    Missing categories become "" rather than the literal text "nan": the fill
    now happens before the cast to ``str`` (casting first made ``fillna`` a no-op).

    Returns ``(train_frame_without_overall, test_frame, y, test_ids)`` with
    ``y`` = 1 when ``overall`` > 2, else 0.
    """
    tr = pd.read_csv("train2.csv").drop_duplicates()
    te = pd.read_csv("test2.csv").drop_duplicates()
    for df in (tr, te):
        df["reviewText"] = mark_negations(clean_series(df["reviewText"]))
        df["summary"] = mark_negations(clean_series(df["summary"]))
        # Fill first, then cast: astype(str) would stringify NaN to "nan".
        df["category"] = df["category"].fillna("").astype(str)
        df["verified"] = df["verified"].fillna(False).astype(int)
        # Votes may be formatted like "1,234"; strip commas before parsing.
        vote_text = df["vote"].astype(str).str.replace(",", "")
        df["vote"] = pd.to_numeric(vote_text, errors="coerce").fillna(0.0)
    y = (tr["overall"] > 2).astype(int).values
    ids = te["id"].tolist()
    return tr.drop(columns=["overall"]), te, y, ids

def preprocessor_mid():
    """Build the ColumnTransformer combining text and metadata features.

    Word tf-idf on ``reviewText`` and ``summary``, character tf-idf on
    ``category``, one-hot on ``verified`` and ``log1p`` on ``vote``; all other
    columns are dropped.
    """
    transformers = [
        ("review",
         TfidfVectorizer(max_features=60000, ngram_range=(1, 2), stop_words="english", min_df=3, sublinear_tf=True),
         "reviewText"),
        ("summary",
         TfidfVectorizer(max_features=15000, ngram_range=(1, 1), stop_words="english", min_df=2, sublinear_tf=True),
         "summary"),
        ("category",
         TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=1),
         "category"),
        ("verified",
         OneHotEncoder(handle_unknown="ignore"),
         ["verified"]),
        ("vote",
         FunctionTransformer(func=np.log1p, validate=False, feature_names_out="one-to-one"),
         ["vote"]),
    ]
    return ColumnTransformer(transformers, remainder="drop", sparse_threshold=0.3)

def build_pipelines_mid():
    """Return ``(pipelines, grids)`` for NB / LR / SVM on the "mid" feature set.

    Each pipeline gets its *own* preprocessor from ``preprocessor_mid()``.
    Previously all three shared a single ColumnTransformer instance, so fitting
    any one of the returned pipelines directly would silently refit the "feat"
    step seen by the other two.
    """
    classifiers = {
        "NB": MultinomialNB(),
        "LR": LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=2000),
        "SVM": LinearSVC(class_weight="balanced", max_iter=3000),
    }
    pipes = {
        name: Pipeline([("feat", preprocessor_mid()), ("clf", clf)])
        for name, clf in classifiers.items()
    }
    grids = {
        "NB": {"clf__alpha": [0.1, 0.5, 1.0]},
        "LR": {"clf__C": [0.5, 1.0, 2.0]},
        "SVM": {"clf__C": [0.5, 1.0, 2.0]},
    }
    return pipes, grids

if __name__ == "__main__":
    # Text + metadata binary run on train2/test2; X_tr / X_te are DataFrames.
    X_tr,X_te,y,ids = load_data_mid()
    # run_cv returns the refit best estimator of the model with the highest macro-F1.
    best_model = run_cv(build_pipelines_mid,X_tr,y)
    preds = best_model.predict(X_te)
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("test2_output2.csv",index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'train2.csv'

In [None]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.special import expit
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import (accuracy_score,confusion_matrix, f1_score, roc_auc_score, roc_curve, auc)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict

# Hide FutureWarning messages for the rest of the session (one call is enough).
warnings.filterwarnings("ignore", category=FutureWarning)

# Regexes used by basic_clean; every match is replaced with a single space.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")   # http(s):// or www. links
HTML_PATTERN = re.compile(r"<.*?>")                  # shortest <...> span
ALLOWED_CHARS = re.compile(r"[^a-z0-9\.\!\?\'\s]")   # anything but a-z, 0-9, . ! ? ' and whitespace

# pattern -> replacement, applied in insertion order by expand_contractions.
# The specific forms ("can't", "won't") must stay ahead of the generic "n't".
CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am",
}

# Tokens that open a negation scope in mark_negations.
NEG_CUES = {"not", "never", "no", "n't", "cannot"}

# Tokens dropped by remove_stopwords (compared against whole whitespace tokens).
STOPWORDS = {
    "the", "and", "a", "an", "of", "in", "on", "at", "to", "is",
    "it", "this", "that", "i", "you", "he", "she", "they", "we", "was",
    "were", "be", "been", "am", "are", "but", "if", "or", "as", "with",
    "for", "not", "no", "so", "too", "very",
}

def expand_contractions(t):
    """Expand English contractions in ``t`` using the ``CONTRACTIONS`` map.

    The typographic apostrophe (U+2019) is first normalised to a plain ``'``
    so that text such as "don\u2019t" is expanded too, instead of being left
    untouched by the lookup.

    Parameters
    ----------
    t : str
        Text to process; callers pass it already lower-cased.

    Returns
    -------
    str
        ``t`` with every ``CONTRACTIONS`` key replaced by its expansion,
        applied in the dictionary's insertion order.
    """
    t = t.replace("\u2019", "'")
    for p, r in CONTRACTIONS.items():
        # Keys are literal substrings, so a plain replace is sufficient and
        # cannot be tripped up by regex metacharacters.
        t = t.replace(p, r)
    return t

def basic_clean(t):
    """Normalise one raw text string.

    Steps, in order: lower-case, expand contractions, blank out HTML tags,
    blank out URLs, blank out every character matched by ``ALLOWED_CHARS``,
    then collapse whitespace runs to single spaces and trim both ends.

    Parameters
    ----------
    t : str
        Raw text.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    text = expand_contractions(t.lower())
    # Each pattern replaces its matches with a single space, in this order.
    for pattern in (HTML_PATTERN, URL_PATTERN, ALLOWED_CHARS):
        text = pattern.sub(" ", text)
    # split()/join collapses whitespace runs and drops leading/trailing blanks.
    return " ".join(text.split())

def clean_series(s):
    """Apply ``basic_clean`` to every element of a pandas Series.

    Missing values are replaced by the empty string *before* the cast to
    ``str``; casting first would turn NaN into the literal text "nan" and
    make the fill a no-op.

    Parameters
    ----------
    s : pandas.Series
        Column of raw text, possibly containing missing values.

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as ``s``.
    """
    return s.fillna("").astype(str).map(basic_clean)

def mark_negations(s):
    """Append ``_NEG`` to negation cues and to the words inside their scope.

    A scope opens at any token found in ``NEG_CUES`` and stays open until a
    token ending in ``.``, ``!`` or ``?`` is seen. While it is open, tokens
    made only of ``a-z``, ``0-9`` and apostrophes receive the suffix; other
    tokens are copied unchanged.

    Parameters
    ----------
    s : pandas.Series
        Whitespace-tokenisable strings.

    Returns
    -------
    pandas.Series
        Tagged strings with the same index as ``s``.
    """
    def _tag(text):
        in_scope = False
        tagged = []
        for word in text.split():
            if word in NEG_CUES or (in_scope and re.fullmatch(r"[a-z0-9']+", word)):
                tagged.append(word + "_NEG")
                in_scope = True
                continue
            tagged.append(word)
            # Sentence-final punctuation closes the negation scope.
            if word.endswith((".", "!", "?")):
                in_scope = False
        return " ".join(tagged)

    return pd.Series([_tag(text) for text in s], index=s.index)

def remove_stopwords(s):
    """Drop stopwords and one-character tokens from every string in ``s``.

    Tokens are produced by whitespace splitting and compared verbatim with
    ``STOPWORDS``, so a tagged token such as ``not_NEG`` is kept.

    Parameters
    ----------
    s : pandas.Series
        Strings to filter.

    Returns
    -------
    pandas.Series
        Filtered strings, tokens re-joined with single spaces.
    """
    def _keep(word):
        return len(word) > 1 and word not in STOPWORDS

    def _filter(text):
        return " ".join(filter(_keep, text.split()))

    return s.map(_filter)

def show_cv_table(res, cols):
    """Display grid-search results sorted from best to worst mean score.

    Parameters
    ----------
    res : mapping
        Data accepted by ``pd.DataFrame`` that contains ``mean_test_score``,
        ``std_test_score`` and one ``param_<name>`` column per entry of
        ``cols`` (callers pass ``GridSearchCV.cv_results_``).
    cols : list of str
        Parameter names without the ``param_`` prefix.
    """
    prefixed = ["param_" + c for c in cols]
    wanted = prefixed + ["mean_test_score", "std_test_score"]
    table = (
        pd.DataFrame(res)
        .loc[:, wanted]
        # Strip the "param_" prefix so headers read like the grid keys.
        .rename(columns=dict(zip(prefixed, cols)))
        .sort_values("mean_test_score", ascending=False)
    )
    display(table)


def evaluate_model(y_true,y_pred,y_prob,name):
    """Print accuracy / macro-F1 and plot the confusion matrix and ROC curve.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0 or 1).
    y_pred : array-like
        Predicted binary labels (0 or 1).
    y_prob : array-like
        Scores for the positive class; only their ordering affects the ROC.
    name : str
        Model name, printed first and used in the figure titles.
    """
    print(name)
    print("accuracy",accuracy_score(y_true,y_pred))
    print("f1_macro",f1_score(y_true,y_pred,average="macro"))

    # labels=[0,1] pins the matrix to 2x2 even when one class is absent, so
    # the annotation loop below can never index out of range.
    cm = confusion_matrix(y_true,y_pred,labels=[0,1])
    fig, ax = plt.subplots(figsize=(4,4))
    ax.imshow(cm,cmap="Blues")
    ax.set_xticks([0,1])
    ax.set_yticks([0,1])
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(f"{name}: confusion matrix")
    half = cm.max()/2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # White text on dark cells, black on light ones.
            ax.text(j,i,str(cm[i,j]),ha="center",va="center",color="white" if cm[i,j]>half else "black")
    plt.show()

    fpr,tpr,_ = roc_curve(y_true,y_prob)
    fig, ax = plt.subplots()
    ax.plot(fpr,tpr,label="AUC="+str(auc(fpr,tpr)))
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.set_title(f"{name}: ROC curve")
    ax.legend()
    plt.show()

def run_cv(build_fn,X,y):
    """Grid-search every pipeline from ``build_fn`` and return the best one.

    For each pipeline: run a 5-fold stratified grid search scored by macro-F1,
    display the result table, then compute out-of-fold predictions with the
    winning estimator and pass them to ``evaluate_model``.

    Parameters
    ----------
    build_fn : callable
        Returns ``(pipelines, grids)``, two dicts keyed by model name.
    X : features accepted by the pipelines
    y : array-like of 0/1 labels

    Returns
    -------
    estimator
        The ``best_estimator_`` of the model whose out-of-fold macro-F1 is
        highest.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    pipelines, param_grids = build_fn()
    tuned = {}
    macro_f1 = {}
    for label, pipeline in pipelines.items():
        grid = param_grids[label]
        search = GridSearchCV(pipeline, grid, cv=folds, scoring="f1_macro", n_jobs=-1, verbose=1)
        search.fit(X, y)
        show_cv_table(search.cv_results_, list(grid.keys()))
        winner = search.best_estimator_
        tuned[label] = winner
        if label != "SVM":
            hard = cross_val_predict(winner, X, y, cv=folds, method="predict")
            soft = cross_val_predict(winner, X, y, cv=folds, method="predict_proba")[:, 1]
        else:
            # The SVM branch has no predict_proba call: threshold the signed
            # margin at 0 and squash it with a sigmoid for the ROC plot.
            margins = cross_val_predict(winner, X, y, cv=folds, method="decision_function")
            hard = (margins >= 0).astype(int)
            soft = expit(margins)
        evaluate_model(y, hard, soft, label)
        macro_f1[label] = f1_score(y, hard, average="macro")
    return tuned[max(macro_f1, key=macro_f1.get)]

def load_data_fp(train_path="train3.csv", test_path="test3.csv", threshold=3):
    """Load train/test CSVs and build one cleaned text field per review.

    ``reviewText`` and ``summary`` are joined with a space (missing values
    count as empty), cleaned, negation-tagged and stripped of stopwords.
    Training rows whose joined raw text is 10 characters or shorter are
    dropped; test rows are never filtered on length.

    Parameters
    ----------
    train_path, test_path : str
        CSV locations. The defaults reproduce the previously hard-coded files.
    threshold : number
        A training row is labelled 1 when ``overall > threshold``, else 0.

    Returns
    -------
    tuple
        ``(X_train, X_test, y, ids)``: two lists of strings, a 0/1 integer
        array, and the list of test ``id`` values.
    """
    def _joined_text(frame):
        return frame["reviewText"].fillna("")+" "+frame["summary"].fillna("")

    def _prepare(texts):
        return remove_stopwords(mark_negations(clean_series(texts)))

    tr = pd.read_csv(train_path).drop_duplicates()
    te = pd.read_csv(test_path).drop_duplicates()
    tr["combined"] = _joined_text(tr)
    te["combined"] = _joined_text(te)
    # Length filter applies to the raw (uncleaned) joined text, train only.
    tr = tr[tr["combined"].str.len()>10].reset_index(drop=True)
    X_tr = _prepare(tr["combined"])
    X_te = _prepare(te["combined"])
    y = (tr["overall"]>threshold).astype(int).values
    ids = te["id"].tolist()
    return X_tr.tolist(), X_te.tolist(), y, ids

def build_pipelines_fp():
    """Build the NB / LR / SVM text pipelines and their parameter grids.

    Returns
    -------
    tuple
        ``(pipelines, grids)``: two dicts keyed ``"NB"``, ``"LR"``, ``"SVM"``.
        Every pipeline has a ``"vect"`` step followed by a ``"clf"`` step.
    """
    # Vocabulary settings common to all three vectorizers.
    vocab_opts = dict(max_features=10000, ngram_range=(1, 2), stop_words="english", min_df=3)

    nb_pipeline = Pipeline([
        ("vect", CountVectorizer(max_df=0.7, binary=True, **vocab_opts)),
        ("clf", MultinomialNB()),
    ])
    lr_pipeline = Pipeline([
        ("vect", TfidfVectorizer(max_df=0.7, sublinear_tf=True, **vocab_opts)),
        ("clf", LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=300)),
    ])
    svm_pipeline = Pipeline([
        ("vect", TfidfVectorizer(lowercase=True, max_df=0.8, sublinear_tf=True, **vocab_opts)),
        ("clf", LinearSVC(class_weight="balanced", max_iter=3000)),
    ])

    pipelines = {"NB": nb_pipeline, "LR": lr_pipeline, "SVM": svm_pipeline}
    param_grids = {
        "NB": {"clf__alpha": [0.1, 1]},
        "LR": {"clf__C": [0.5, 1]},
        "SVM": {"clf__C": [0.5, 1]},
    }
    return pipelines, param_grids

# Driver for the combined-text experiment: load, select a model, predict, save.
# In a notebook kernel __name__ is "__main__", so this guard always passes.
if __name__ == "__main__":
    # (train texts, test texts, 0/1 labels, test ids)
    X_tr,X_te,y,ids = load_data_fp()
    # run_cv returns the tuned estimator with the highest out-of-fold macro-F1.
    best_model = run_cv(build_pipelines_fp,X_tr,y)
    preds = best_model.predict(X_te)
    # NOTE(review): this filename starts with "text" while the other output
    # written in this cell starts with "test" -- confirm which is intended.
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("text3_output1.csv",index=False)

def load_data_mid(train_path="train3.csv", test_path="test3.csv", threshold=3):
    """Load train/test CSVs and prepare the per-column features.

    Both frames get the same in-place treatment: ``reviewText`` and
    ``summary`` are cleaned and negation-tagged, ``category`` becomes a
    string with missing values as ``""``, ``verified`` becomes 0/1 (missing
    counts as 0) and ``vote`` becomes a float with thousands separators
    removed and unparsable or missing values set to 0.0.

    Parameters
    ----------
    train_path, test_path : str
        CSV locations. The defaults reproduce the previously hard-coded files.
    threshold : number
        A training row is labelled 1 when ``overall > threshold``, else 0.

    Returns
    -------
    tuple
        ``(train_frame, test_frame, y, ids)``: the training frame without
        ``overall``, the test frame, a 0/1 integer array and the list of test
        ``id`` values.
    """
    tr = pd.read_csv(train_path).drop_duplicates()
    te = pd.read_csv(test_path).drop_duplicates()
    for df in (tr,te):
        df["reviewText"] = mark_negations(clean_series(df["reviewText"]))
        df["summary"] = mark_negations(clean_series(df["summary"]))
        # Fill before casting: astype(str) first would turn NaN into "nan".
        df["category"] = df["category"].fillna("").astype(str)
        df["verified"] = df["verified"].fillna(False).astype(int)
        df["vote"] = pd.to_numeric(df["vote"].astype(str).str.replace(",",""),errors="coerce").fillna(0.0)
    y = (tr["overall"]>threshold).astype(int).values
    ids = te["id"].tolist()
    return tr.drop(columns=["overall"]), te, y, ids

def preprocessor_mid():
    """Build the ColumnTransformer that turns the review frame into features.

    Returns
    -------
    ColumnTransformer
        Word TF-IDF for ``reviewText`` and ``summary``, character TF-IDF for
        ``category``, one-hot for ``verified`` and ``log1p`` for ``vote``.
        Every other column is dropped.
    """
    word_opts = dict(stop_words="english", sublinear_tf=True)
    log_vote = FunctionTransformer(func=np.log1p, validate=False, feature_names_out="one-to-one")
    # Text columns are selected by a bare name, tabular ones by a one-item list.
    column_specs = [
        ("review", TfidfVectorizer(max_features=60000, ngram_range=(1, 2), min_df=3, **word_opts), "reviewText"),
        ("summary", TfidfVectorizer(max_features=15000, ngram_range=(1, 1), min_df=2, **word_opts), "summary"),
        ("category", TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=1), "category"),
        ("verified", OneHotEncoder(handle_unknown="ignore"), ["verified"]),
        ("vote", log_vote, ["vote"]),
    ]
    return ColumnTransformer(column_specs, remainder="drop", sparse_threshold=0.3)

def build_pipelines_mid():
    """Build the NB / LR / SVM pipelines over the multi-column preprocessor.

    Each pipeline gets its own ``preprocessor_mid()`` instance, so fitting
    one pipeline directly cannot overwrite the fitted transformer state of
    another.

    Returns
    -------
    tuple
        ``(pipelines, grids)``: two dicts keyed ``"NB"``, ``"LR"``, ``"SVM"``.
        Every pipeline has a ``"feat"`` step followed by a ``"clf"`` step.
    """
    pipe_nb = Pipeline([("feat",preprocessor_mid()),("clf",MultinomialNB())])
    pipe_lr = Pipeline([("feat",preprocessor_mid()),("clf",LogisticRegression(class_weight="balanced",solver="liblinear",max_iter=2000))])
    pipe_svm = Pipeline([("feat",preprocessor_mid()),("clf",LinearSVC(class_weight="balanced",max_iter=3000))])
    grids = {"NB":{"clf__alpha":[0.1,0.5,1.0]},"LR":{"clf__C":[0.5,1.0,2.0]},"SVM":{"clf__C":[0.5,1.0,2.0]}}
    return {"NB":pipe_nb,"LR":pipe_lr,"SVM":pipe_svm},grids

# Driver for the multi-column experiment: load, select a model, predict, save.
# In a notebook kernel __name__ is "__main__", so this guard always passes.
if __name__ == "__main__":
    # (train frame without "overall", test frame, 0/1 labels, test ids)
    X_tr,X_te,y,ids = load_data_mid()
    # run_cv returns the tuned estimator with the highest out-of-fold macro-F1.
    best_model = run_cv(build_pipelines_mid,X_tr,y)
    preds = best_model.predict(X_te)
    # Two columns (id, overall); index=False keeps the row index out of the file.
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("test3_output3.csv",index=False)

In [None]:
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.special import expit
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import (accuracy_score,confusion_matrix, f1_score, roc_auc_score, roc_curve, auc)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict

warnings.filterwarnings("ignore", category=FutureWarning)

warnings.filterwarnings("ignore", category=FutureWarning)


URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
HTML_PATTERN = re.compile(r"<.*?>")
ALLOWED_CHARS = re.compile(r"[^a-z0-9\.\!\?\'\s]")
CONTRACTIONS = {"can't":"cannot","won't":"will not","n't":" not","'re":" are","'s":" is","'d":" would","'ll":" will","'ve":" have","'m":" am"}
NEG_CUES = {"not","never","no","n't","cannot"}
STOPWORDS = {"the","and","a","an","of","in","on","at","to","is","it","this","that","i","you","he","she","they","we","was","were","be","been","am","are","but","if","or","as","with","for","not","no","so","too","very"}

def expand_contractions(t):
    """Expand English contractions in ``t`` using the ``CONTRACTIONS`` map.

    The typographic apostrophe (U+2019) is first normalised to a plain ``'``
    so that text such as "don\u2019t" is expanded too, instead of being left
    untouched by the lookup.

    Parameters
    ----------
    t : str
        Text to process; callers pass it already lower-cased.

    Returns
    -------
    str
        ``t`` with every ``CONTRACTIONS`` key replaced by its expansion,
        applied in the dictionary's insertion order.
    """
    t = t.replace("\u2019", "'")
    for p, r in CONTRACTIONS.items():
        # Keys are literal substrings, so a plain replace is sufficient and
        # cannot be tripped up by regex metacharacters.
        t = t.replace(p, r)
    return t

def basic_clean(t):
    """Normalise one raw text string.

    Steps, in order: lower-case, expand contractions, blank out HTML tags,
    blank out URLs, blank out every character matched by ``ALLOWED_CHARS``,
    then collapse whitespace runs to single spaces and trim both ends.

    Parameters
    ----------
    t : str
        Raw text.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    text = expand_contractions(t.lower())
    # Each pattern replaces its matches with a single space, in this order.
    for pattern in (HTML_PATTERN, URL_PATTERN, ALLOWED_CHARS):
        text = pattern.sub(" ", text)
    # split()/join collapses whitespace runs and drops leading/trailing blanks.
    return " ".join(text.split())

def clean_series(s):
    """Apply ``basic_clean`` to every element of a pandas Series.

    Missing values are replaced by the empty string *before* the cast to
    ``str``; casting first would turn NaN into the literal text "nan" and
    make the fill a no-op.

    Parameters
    ----------
    s : pandas.Series
        Column of raw text, possibly containing missing values.

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as ``s``.
    """
    return s.fillna("").astype(str).map(basic_clean)

def mark_negations(s):
    """Append ``_NEG`` to negation cues and to the words inside their scope.

    A scope opens at any token found in ``NEG_CUES`` and stays open until a
    token ending in ``.``, ``!`` or ``?`` is seen. While it is open, tokens
    made only of ``a-z``, ``0-9`` and apostrophes receive the suffix; other
    tokens are copied unchanged.

    Parameters
    ----------
    s : pandas.Series
        Whitespace-tokenisable strings.

    Returns
    -------
    pandas.Series
        Tagged strings with the same index as ``s``.
    """
    def _tag(text):
        in_scope = False
        tagged = []
        for word in text.split():
            if word in NEG_CUES or (in_scope and re.fullmatch(r"[a-z0-9']+", word)):
                tagged.append(word + "_NEG")
                in_scope = True
                continue
            tagged.append(word)
            # Sentence-final punctuation closes the negation scope.
            if word.endswith((".", "!", "?")):
                in_scope = False
        return " ".join(tagged)

    return pd.Series([_tag(text) for text in s], index=s.index)

def remove_stopwords(s):
    """Drop stopwords and one-character tokens from every string in ``s``.

    Tokens are produced by whitespace splitting and compared verbatim with
    ``STOPWORDS``, so a tagged token such as ``not_NEG`` is kept.

    Parameters
    ----------
    s : pandas.Series
        Strings to filter.

    Returns
    -------
    pandas.Series
        Filtered strings, tokens re-joined with single spaces.
    """
    def _keep(word):
        return len(word) > 1 and word not in STOPWORDS

    def _filter(text):
        return " ".join(filter(_keep, text.split()))

    return s.map(_filter)

def show_cv_table(res, cols):
    """Display grid-search results sorted from best to worst mean score.

    Parameters
    ----------
    res : mapping
        Data accepted by ``pd.DataFrame`` that contains ``mean_test_score``,
        ``std_test_score`` and one ``param_<name>`` column per entry of
        ``cols`` (callers pass ``GridSearchCV.cv_results_``).
    cols : list of str
        Parameter names without the ``param_`` prefix.
    """
    prefixed = ["param_" + c for c in cols]
    wanted = prefixed + ["mean_test_score", "std_test_score"]
    table = (
        pd.DataFrame(res)
        .loc[:, wanted]
        # Strip the "param_" prefix so headers read like the grid keys.
        .rename(columns=dict(zip(prefixed, cols)))
        .sort_values("mean_test_score", ascending=False)
    )
    display(table)


def evaluate_model(y_true,y_pred,y_prob,name):
    """Print accuracy / macro-F1 and plot the confusion matrix and ROC curve.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0 or 1).
    y_pred : array-like
        Predicted binary labels (0 or 1).
    y_prob : array-like
        Scores for the positive class; only their ordering affects the ROC.
    name : str
        Model name, printed first and used in the figure titles.
    """
    print(name)
    print("accuracy",accuracy_score(y_true,y_pred))
    print("f1_macro",f1_score(y_true,y_pred,average="macro"))

    # labels=[0,1] pins the matrix to 2x2 even when one class is absent, so
    # the annotation loop below can never index out of range.
    cm = confusion_matrix(y_true,y_pred,labels=[0,1])
    fig, ax = plt.subplots(figsize=(4,4))
    ax.imshow(cm,cmap="Blues")
    ax.set_xticks([0,1])
    ax.set_yticks([0,1])
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(f"{name}: confusion matrix")
    half = cm.max()/2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # White text on dark cells, black on light ones.
            ax.text(j,i,str(cm[i,j]),ha="center",va="center",color="white" if cm[i,j]>half else "black")
    plt.show()

    fpr,tpr,_ = roc_curve(y_true,y_prob)
    fig, ax = plt.subplots()
    ax.plot(fpr,tpr,label="AUC="+str(auc(fpr,tpr)))
    ax.set_xlabel("FPR")
    ax.set_ylabel("TPR")
    ax.set_title(f"{name}: ROC curve")
    ax.legend()
    plt.show()

def run_cv(build_fn,X,y):
    """Grid-search every pipeline from ``build_fn`` and return the best one.

    For each pipeline: run a 5-fold stratified grid search scored by macro-F1,
    display the result table, then compute out-of-fold predictions with the
    winning estimator and pass them to ``evaluate_model``.

    Parameters
    ----------
    build_fn : callable
        Returns ``(pipelines, grids)``, two dicts keyed by model name.
    X : features accepted by the pipelines
    y : array-like of 0/1 labels

    Returns
    -------
    estimator
        The ``best_estimator_`` of the model whose out-of-fold macro-F1 is
        highest.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    pipelines, param_grids = build_fn()
    tuned = {}
    macro_f1 = {}
    for label, pipeline in pipelines.items():
        grid = param_grids[label]
        search = GridSearchCV(pipeline, grid, cv=folds, scoring="f1_macro", n_jobs=-1, verbose=1)
        search.fit(X, y)
        show_cv_table(search.cv_results_, list(grid.keys()))
        winner = search.best_estimator_
        tuned[label] = winner
        if label != "SVM":
            hard = cross_val_predict(winner, X, y, cv=folds, method="predict")
            soft = cross_val_predict(winner, X, y, cv=folds, method="predict_proba")[:, 1]
        else:
            # The SVM branch has no predict_proba call: threshold the signed
            # margin at 0 and squash it with a sigmoid for the ROC plot.
            margins = cross_val_predict(winner, X, y, cv=folds, method="decision_function")
            hard = (margins >= 0).astype(int)
            soft = expit(margins)
        evaluate_model(y, hard, soft, label)
        macro_f1[label] = f1_score(y, hard, average="macro")
    return tuned[max(macro_f1, key=macro_f1.get)]

def load_data_fp(train_path="train4.csv", test_path="test4.csv", threshold=4):
    """Load train/test CSVs and build one cleaned text field per review.

    ``reviewText`` and ``summary`` are joined with a space (missing values
    count as empty), cleaned, negation-tagged and stripped of stopwords.
    Training rows whose joined raw text is 10 characters or shorter are
    dropped; test rows are never filtered on length.

    Parameters
    ----------
    train_path, test_path : str
        CSV locations. The defaults reproduce the previously hard-coded files.
    threshold : number
        A training row is labelled 1 when ``overall > threshold``, else 0.

    Returns
    -------
    tuple
        ``(X_train, X_test, y, ids)``: two lists of strings, a 0/1 integer
        array, and the list of test ``id`` values.
    """
    def _joined_text(frame):
        return frame["reviewText"].fillna("")+" "+frame["summary"].fillna("")

    def _prepare(texts):
        return remove_stopwords(mark_negations(clean_series(texts)))

    tr = pd.read_csv(train_path).drop_duplicates()
    te = pd.read_csv(test_path).drop_duplicates()
    tr["combined"] = _joined_text(tr)
    te["combined"] = _joined_text(te)
    # Length filter applies to the raw (uncleaned) joined text, train only.
    tr = tr[tr["combined"].str.len()>10].reset_index(drop=True)
    X_tr = _prepare(tr["combined"])
    X_te = _prepare(te["combined"])
    y = (tr["overall"]>threshold).astype(int).values
    ids = te["id"].tolist()
    return X_tr.tolist(), X_te.tolist(), y, ids

def build_pipelines_fp():
    """Build the NB / LR / SVM text pipelines and their parameter grids.

    Returns
    -------
    tuple
        ``(pipelines, grids)``: two dicts keyed ``"NB"``, ``"LR"``, ``"SVM"``.
        Every pipeline has a ``"vect"`` step followed by a ``"clf"`` step.
    """
    # Vocabulary settings common to all three vectorizers.
    vocab_opts = dict(max_features=10000, ngram_range=(1, 2), stop_words="english", min_df=3)

    nb_pipeline = Pipeline([
        ("vect", CountVectorizer(max_df=0.7, binary=True, **vocab_opts)),
        ("clf", MultinomialNB()),
    ])
    lr_pipeline = Pipeline([
        ("vect", TfidfVectorizer(max_df=0.7, sublinear_tf=True, **vocab_opts)),
        ("clf", LogisticRegression(class_weight="balanced", solver="liblinear", max_iter=300)),
    ])
    svm_pipeline = Pipeline([
        ("vect", TfidfVectorizer(lowercase=True, max_df=0.8, sublinear_tf=True, **vocab_opts)),
        ("clf", LinearSVC(class_weight="balanced", max_iter=3000)),
    ])

    pipelines = {"NB": nb_pipeline, "LR": lr_pipeline, "SVM": svm_pipeline}
    param_grids = {
        "NB": {"clf__alpha": [0.1, 1]},
        "LR": {"clf__C": [0.5, 1]},
        "SVM": {"clf__C": [0.5, 1]},
    }
    return pipelines, param_grids

# Driver for the combined-text experiment: load, select a model, predict, save.
# In a notebook kernel __name__ is "__main__", so this guard always passes.
if __name__ == "__main__":
    # (train texts, test texts, 0/1 labels, test ids)
    X_tr,X_te,y,ids = load_data_fp()
    # run_cv returns the tuned estimator with the highest out-of-fold macro-F1.
    best_model = run_cv(build_pipelines_fp,X_tr,y)
    preds = best_model.predict(X_te)
    # NOTE(review): this filename starts with "text" while the other output
    # written in this cell starts with "test" -- confirm which is intended.
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("text4_output1.csv",index=False)

def load_data_mid(train_path="train4.csv", test_path="test4.csv", threshold=4):
    """Load train/test CSVs and prepare the per-column features.

    Both frames get the same in-place treatment: ``reviewText`` and
    ``summary`` are cleaned and negation-tagged, ``category`` becomes a
    string with missing values as ``""``, ``verified`` becomes 0/1 (missing
    counts as 0) and ``vote`` becomes a float with thousands separators
    removed and unparsable or missing values set to 0.0.

    Parameters
    ----------
    train_path, test_path : str
        CSV locations. The defaults reproduce the previously hard-coded files.
    threshold : number
        A training row is labelled 1 when ``overall > threshold``, else 0.

    Returns
    -------
    tuple
        ``(train_frame, test_frame, y, ids)``: the training frame without
        ``overall``, the test frame, a 0/1 integer array and the list of test
        ``id`` values.
    """
    tr = pd.read_csv(train_path).drop_duplicates()
    te = pd.read_csv(test_path).drop_duplicates()
    for df in (tr,te):
        df["reviewText"] = mark_negations(clean_series(df["reviewText"]))
        df["summary"] = mark_negations(clean_series(df["summary"]))
        # Fill before casting: astype(str) first would turn NaN into "nan".
        df["category"] = df["category"].fillna("").astype(str)
        df["verified"] = df["verified"].fillna(False).astype(int)
        df["vote"] = pd.to_numeric(df["vote"].astype(str).str.replace(",",""),errors="coerce").fillna(0.0)
    y = (tr["overall"]>threshold).astype(int).values
    ids = te["id"].tolist()
    return tr.drop(columns=["overall"]), te, y, ids

def preprocessor_mid():
    """Build the ColumnTransformer that turns the review frame into features.

    Returns
    -------
    ColumnTransformer
        Word TF-IDF for ``reviewText`` and ``summary``, character TF-IDF for
        ``category``, one-hot for ``verified`` and ``log1p`` for ``vote``.
        Every other column is dropped.
    """
    word_opts = dict(stop_words="english", sublinear_tf=True)
    log_vote = FunctionTransformer(func=np.log1p, validate=False, feature_names_out="one-to-one")
    # Text columns are selected by a bare name, tabular ones by a one-item list.
    column_specs = [
        ("review", TfidfVectorizer(max_features=60000, ngram_range=(1, 2), min_df=3, **word_opts), "reviewText"),
        ("summary", TfidfVectorizer(max_features=15000, ngram_range=(1, 1), min_df=2, **word_opts), "summary"),
        ("category", TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=1), "category"),
        ("verified", OneHotEncoder(handle_unknown="ignore"), ["verified"]),
        ("vote", log_vote, ["vote"]),
    ]
    return ColumnTransformer(column_specs, remainder="drop", sparse_threshold=0.3)

def build_pipelines_mid():
    """Build the NB / LR / SVM pipelines over the multi-column preprocessor.

    Each pipeline gets its own ``preprocessor_mid()`` instance, so fitting
    one pipeline directly cannot overwrite the fitted transformer state of
    another.

    Returns
    -------
    tuple
        ``(pipelines, grids)``: two dicts keyed ``"NB"``, ``"LR"``, ``"SVM"``.
        Every pipeline has a ``"feat"`` step followed by a ``"clf"`` step.
    """
    pipe_nb = Pipeline([("feat",preprocessor_mid()),("clf",MultinomialNB())])
    pipe_lr = Pipeline([("feat",preprocessor_mid()),("clf",LogisticRegression(class_weight="balanced",solver="liblinear",max_iter=2000))])
    pipe_svm = Pipeline([("feat",preprocessor_mid()),("clf",LinearSVC(class_weight="balanced",max_iter=3000))])
    grids = {"NB":{"clf__alpha":[0.1,0.5,1.0]},"LR":{"clf__C":[0.5,1.0,2.0]},"SVM":{"clf__C":[0.5,1.0,2.0]}}
    return {"NB":pipe_nb,"LR":pipe_lr,"SVM":pipe_svm},grids

# Driver for the multi-column experiment: load, select a model, predict, save.
# In a notebook kernel __name__ is "__main__", so this guard always passes.
if __name__ == "__main__":
    # (train frame without "overall", test frame, 0/1 labels, test ids)
    X_tr,X_te,y,ids = load_data_mid()
    # run_cv returns the tuned estimator with the highest out-of-fold macro-F1.
    best_model = run_cv(build_pipelines_mid,X_tr,y)
    preds = best_model.predict(X_te)
    # Two columns (id, overall); index=False keeps the row index out of the file.
    pd.DataFrame({"id":ids,"overall":preds}).to_csv("test4_output4.csv",index=False)