# Grid search over ML models and learning curve for the best parameters

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict, train_test_split, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import joblib
from datetime import datetime

In [None]:
# Load the pre-processed training data, discarding rows whose cleaned text is missing
clean_df = pd.read_csv("../raw_data/train_df_ml_clean.csv").dropna(subset=["clean_text"])

# Quick sanity check: first rows, overall shape, and relative class frequencies
display(
    clean_df.head(),
    clean_df.shape,
    clean_df["label"].value_counts(normalize=True),
)

In [None]:
# Raw (cleaned) text column and integer-cast labels taken straight from clean_df
X_train_small = clean_df.loc[:, "clean_text"]
y_train_small = clean_df.loc[:, "label"].astype(int)

# Both shapes should report the same number of rows
display(X_train_small.shape, y_train_small.shape)

In [None]:
# Load the persisted feature matrix produced by the preprocessing pipeline.
# NOTE(review): presumably the TF-IDF matrix of the training texts; its rows are
# assumed to line up with clean_df after the dropna above - TODO confirm.
X_train = joblib.load(filename="../preprocessing_pipelines/train_tfidf_matrix.pkl")
X_train.shape

In [None]:
# Number of leading rows kept for the grid search and the learning curve
N_REDUCED_ROWS = 1_800_000

# First N_REDUCED_ROWS rows of the pre-computed feature matrix
X_train_reduced = X_train[:N_REDUCED_ROWS]

# Select the target by column name (as done for y_train_small) instead of by
# position: column 0 is not guaranteed to be "label", e.g. when the CSV was
# written with its index. Cast to int for consistency with y_train_small.
# NOTE(review): assumes row i of X_train corresponds to row i of clean_df
# after the dropna - TODO confirm against the preprocessing pipeline.
y_train_reduced = clean_df["label"].astype(int).iloc[:N_REDUCED_ROWS]

In [None]:
# Grid search and model comparison on the reduced training set
# (X_train_reduced / y_train_reduced)

# Stratified 5-fold CV, shuffled with a fixed seed so the folds are reproducible
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Regularisation strengths to try; the "model__" prefix routes the parameter
# to the pipeline step named "model"
params = {
    "model__C": [0.01, 0.1, 1, 10],
}

# Candidate classifiers to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVC": LinearSVC(max_iter=1000),
}

# Run one grid search per model and record the winning C for each
best_params_list = []
for name, model in models.items():
    print(f"GridSearchCV for {name}")

    # Single-step pipeline so the parameter grid can address the estimator as "model"
    pipeline = Pipeline([
        ("model", model),
    ])

    grid = GridSearchCV(
        pipeline,
        param_grid=params,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )

    grid.fit(X_train_reduced, y_train_reduced)

    print(f"Best CV accuracy: {grid.best_score_}")
    print(f"Best parameters: {grid.best_params_}")
    best_params = {
        "model": name,
        "C": grid.best_params_["model__C"],
    }
    best_params_list.append(best_params)

    # Build the classification report from out-of-fold predictions.
    # Predicting with best_estimator_ on the very rows it was refit on would
    # report training performance, which is optimistically biased.
    # cross_val_predict clones the best configuration and predicts every row
    # from a model that did not see it during fitting.
    y_pred = cross_val_predict(
        grid.best_estimator_,
        X_train_reduced,
        y_train_reduced,
        cv=cv,
        n_jobs=-1,
    )

    print("Classification report:")
    print(classification_report(y_train_reduced, y_pred))

In [None]:
# Tabulate the best parameters found per model, indexed by model name.
# (best_params_list is filled by the grid-search loop; the DataFrame itself
# was never created before being used here.)
best_params_df = pd.DataFrame(best_params_list).set_index("model")

# Keep C as a float: casting to int would turn 0.01 / 0.1 into 0, which is
# not a valid (strictly positive) regularisation strength.
logreg_C = float(best_params_df.loc["Logistic Regression", "C"])

# Same reproducible stratified 5-fold split as used for the grid search
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic regression configured with the best C, used for the learning curve
model = LogisticRegression(C=logreg_C, max_iter=1000)

In [None]:
# Compute learning-curve scores at 5 evenly spaced fractions (10% .. 100%)
# of the training portion of each CV fold
train_sizes, train_scores, val_scores = learning_curve(
    estimator=model,
    X=X_train_reduced,
    y=y_train_reduced,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)

# Aggregate across folds (axis 1): mean curve and its spread per training size
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
val_mean, val_std = np.mean(val_scores, axis=1), np.std(val_scores, axis=1)

# Draw the learning curve: mean line plus a +/- one standard deviation band
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(train_sizes, train_mean, marker="o", label="Training accuracy")
ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2)

ax.plot(train_sizes, val_mean, marker="s", label="Validation accuracy")
ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.2)

ax.set_title(f"Learning Curve (LogReg, C={logreg_C})")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Persist the grid-search results table.
# NOTE(review): `grid` is rebound on every iteration of the model-comparison
# loop, so at this point it only holds the search for the last model in
# `models`; results for earlier models are not saved here.
results_df = pd.DataFrame(grid.cv_results_)

# Record which estimator class these rows belong to, so the CSV is
# self-describing instead of silently containing a single model's results.
results_df.insert(0, "model", type(grid.estimator.named_steps["model"]).__name__)

results_df.to_csv('../documentation/grid_search_ml.csv')