In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, make_scorer, fbeta_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN


In [None]:
# Read the cleaned development and testing samples from the shared output folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/output/development_sample_cleaned_1.csv',
        '../data/output/testing_sample_cleaned.csv',
    )
)

In [None]:
# Remove the 'ID' and 'customer_id' columns from both samples.
df_train = df_train.drop(columns=['ID', 'customer_id'])
df_test = df_test.drop(columns=['ID', 'customer_id'])

In [None]:
# Pull the 'target' column out as the label vector; every other column is a feature.
y = df_train['target']
X = df_train.drop(columns='target')


In [None]:
# Candidate oversamplers, keyed by class name, all seeded identically so that
# resampling is reproducible across runs.
oversamplers = {
    sampler_cls.__name__: sampler_cls(random_state=123)
    for sampler_cls in (SMOTE, RandomOverSampler, ADASYN)
}

# Hyper-parameter grid explored by GridSearchCV: 3 values of C x 2 penalties
# x 2 solvers = 12 candidate logistic-regression configurations.
param_grid = dict(
    C=[0.8, 1, 1.2],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

In [None]:
# Scorer wrapping the F-beta metric with beta=0.5, i.e. precision is weighted
# more heavily than recall.
f05_scorer = make_scorer(score_func=fbeta_score, beta=0.5)


In [None]:
# Workflow of this cell:
#   1. hold out 30% of the development sample for evaluation,
#   2. standardise the features (scaler fitted on the training part only),
#   3. for each oversampler: rebalance the training part, tune a logistic
#      regression with a 3-fold grid search scored by F0.5, and report the
#      tuned model's performance on the hold-out part,
#   4. combine the grid-search tables of all oversamplers side by side.
log_reg = LogisticRegression(random_state=123, max_iter=1000)  # Increase max_iter

# Grid-search result table per oversampler, keyed by oversampler name
results_dict = {}
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training part only and reuse it for the hold-out part,
# so hold-out statistics never influence the transformation.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for name, oversampler in oversamplers.items():
    # Upsample the minority class using the current oversampler
    X_resampled, y_resampled = oversampler.fit_resample(X_train_scaled, y_train)

    # `scoring` must be a scorer with signature (estimator, X, y). A raw metric
    # such as `f1_score` takes (y_true, y_pred) and is not valid here, so use the
    # F0.5 scorer built with `make_scorer`, which is also what the report below
    # claims to show.
    # NOTE(review): resampling happens before the CV split, so validation folds
    # contain resampled rows and the CV scores are likely optimistic. Putting the
    # oversampler inside an imblearn Pipeline would resample per fold instead.
    grid_search = GridSearchCV(log_reg, param_grid, cv=3, scoring=f05_scorer, n_jobs=-1, verbose=4)

    # Fit GridSearchCV on the resampled training data
    grid_search.fit(X_resampled, y_resampled)

    # Get the best parameters
    best_params = grid_search.best_params_
    print(f"\nBest Parameters for {name}:", best_params)

    # Keep only the hyper-parameters and score summary, best candidate first
    results = pd.DataFrame(grid_search.cv_results_)
    cols_to_keep = ['param_C', 'param_penalty', 'param_solver', 'mean_test_score', 'std_test_score']
    results_table = results[cols_to_keep].sort_values(by='mean_test_score', ascending=False)
    results_dict[name] = results_table

    # With the default refit=True, GridSearchCV has already refitted the best
    # configuration on the full resampled training data; reuse it rather than
    # fitting an identical model a second time.
    best_log_reg = grid_search.best_estimator_

    # Make predictions on the hold-out set
    y_pred = best_log_reg.predict(X_test_scaled)

    # Print the table for the current oversampler
    print(f"\nModel Specifications and Validation F05-score for {name}:")
    print(results_table)

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix of the hold-out predictions as an annotated heatmap
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

# Combine results for all oversamplers into a single table (one column group
# per oversampler, rows aligned on the grid-search candidate index)
combined_results = pd.concat(results_dict, axis=1)

# Print the combined results
print("\nCombined Results for all Oversamplers:")
print(combined_results)
