In [3]:
import warnings

import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the encoded dataset from the project's data directory.
# Later cells expect it to contain a target column named 'y'.
encoded_df = pd.read_csv(filepath_or_buffer="../data/encoded_df.csv")

In [6]:
# Train a class-weighted logistic regression with a small hyper-parameter
# search. Every name used here (train_test_split, Pipeline, GridSearchCV,
# StandardScaler, LogisticRegression, warnings, ConvergenceWarning) is
# imported once, in the notebook's import cell.

# Features are every column except the target column 'y'.
X = encoded_df.drop('y', axis=1)
y = encoded_df['y']

# Hold out 30% of the rows for the final evaluation; the fixed seed keeps
# the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling is a pipeline step so that, during cross-validation, the scaler is
# fitted on each training fold only and never sees the validation fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(random_state=42, max_iter=5000, class_weight='balanced')),
])

# 5 regularisation strengths x 2 solvers = 10 candidate configurations.
# The 'model__' prefix routes each parameter to the pipeline's 'model' step.
param_grid = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__solver': ['liblinear', 'lbfgs'],
}

# NOTE(review): candidates are ranked by plain accuracy although the estimator
# uses class_weight='balanced'. If the positive class is rare, a metric such as
# 'f1' or 'balanced_accuracy' may match the goal better -- confirm before changing.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Silence ConvergenceWarning only while the search runs; the previous warning
# filters are restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    # Pass the unscaled X_train: the pipeline applies the scaler itself.
    grid_search.fit(X_train, y_train)

# best_estimator_ is the pipeline refitted on the whole training set with the
# winning parameters; it is reused by the evaluation cell via y_pred.
log_reg_best = grid_search.best_estimator_
y_pred = log_reg_best.predict(X_test)

print("Best Parameters found by Grid Search:", grid_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters found by Grid Search: {'model__C': 0.01, 'model__solver': 'lbfgs'}


In [7]:
# Report hold-out performance of the tuned model: overall accuracy, the
# per-class precision/recall/F1 table, and the raw confusion matrix.
# A single print call with a newline separator emits one item per line.
print(
    "Optimized Logistic Regression Model Evaluation:",
    "=" * 45,
    f"Accuracy Score: {accuracy_score(y_test, y_pred):.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    "\nConfusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Optimized Logistic Regression Model Evaluation:
Accuracy Score: 0.8619

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.86      0.92    197683
           1       0.46      0.87      0.61     27317

    accuracy                           0.86    225000
   macro avg       0.72      0.87      0.76    225000
weighted avg       0.92      0.86      0.88    225000


Confusion Matrix:
[[170131  27552]
 [  3516  23801]]
