In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the preprocessed comments and discard rows whose 'clean_comment'
# value is missing, then preview the first rows.
df = (
    pd.read_csv('reddit_preprocessing.csv')
    .dropna(subset=['clean_comment'])
)
df.head()

Unnamed: 0,clean_comment,category
0,"film absolutely awful, but nevertheless, hilar...",0
1,well since seeing part 1 3 honestly say never ...,0
2,got see film preview dazzled it. not typical r...,1
3,adaptation positively butcher classic beloved ...,0
4,rzone awful movie! simple. seems tried make mo...,0


In [4]:
import os
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# label and seeded so it is repeatable. Both the text and the labels are
# cast to str before splitting.

y = df['category'].astype(str)
X = df['clean_comment'].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [6]:
# Two-step text-classification pipeline: a vectorizer ("vec") feeding a
# logistic-regression classifier ("clf"). The TF-IDF vectorizer here is only
# a placeholder; the grid search substitutes the "vec" step.

pipe = Pipeline(
    steps=[
        ("vec", TfidfVectorizer()),
        ("clf", LogisticRegression(max_iter=500)),
    ]
)

In [7]:
# Search space for GridSearchCV: one sub-grid per vectorizer type.
#
# Both sub-grids use exactly the same vectorizer and classifier settings, so
# the shared part is written once and merged into each sub-grid; editing it
# keeps the two in sync. Keys are the Pipeline step name ("vec" / "clf"),
# then "__", then the parameter name on that step.
#
# Size: 3 n-gram ranges x 3 vocabulary caps x 2 solvers x 4 C values
# x 2 class weights = 144 candidates per vectorizer, 288 in total.

_shared_space = {
    "vec__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vec__max_features": [5000, 10000, None],  # None = no cap on vocabulary size
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs", "saga"],
    "clf__C": [0.5, 1.0, 2.0, 5.0],
    "clf__class_weight": [None, "balanced"],
    "clf__random_state": [42],  # single value: a fixed seed, not a tuned parameter
}

param_grid = [
    {"vec": [vectorizer], **_shared_space}
    for vectorizer in (CountVectorizer(), TfidfVectorizer())
]

In [8]:
# Exhaustive hyperparameter search over param_grid using 5-fold stratified
# cross-validation. Accuracy and macro-F1 are both recorded for every
# candidate; macro-F1 picks the winner and is the metric used for the refit.
# Cost: 288 candidates x 5 folds = 1440 fits (the recorded run below was
# interrupted before finishing).

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

grid = GridSearchCV(
    pipe,
    param_grid,
    scoring={"accuracy": "accuracy", "f1_macro": "f1_macro"},
    refit="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

print("\nBest Params:", grid.best_params_)
print("Best CV f1_macro:", grid.best_score_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


KeyboardInterrupt: 