In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Load the processed data
train_df = pd.read_csv("data/processed/train_processed.csv")
val_df = pd.read_csv("data/processed/validation_processed.csv")
# NOTE(review): the test file is named "*_engineered" while train/validation
# are "*_processed" -- confirm it went through the same preprocessing pipeline.
test_df = pd.read_csv("data/processed/test_engineered.csv")

# Separate features and target
X_train, y_train = train_df.drop(columns=["Attrition"]), train_df["Attrition"]
X_val, y_val = val_df.drop(columns=["Attrition"]), val_df["Attrition"]

# The scaler and the model only accept the exact feature set (and column order)
# they were fitted on. The test frame can carry the target column and extra
# engineered columns, so restrict it to the training features instead of
# passing it through unchanged.
missing_test_features = [col for col in X_train.columns if col not in test_df.columns]
if missing_test_features:
    # Fail early with an actionable message rather than inventing values for
    # features the model relies on.
    raise ValueError(
        "Test data is missing features used for training: "
        f"{missing_test_features}"
    )
X_test = test_df[X_train.columns]

# Standardise the features: the scaler learns its statistics from the training
# split only and is then applied unchanged to every split.
scaler = StandardScaler()
scaler.fit(X_train)

# Transform each split and wrap the resulting array back into a DataFrame so
# the column names stay attached to the scaled values.
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(split), columns=split.columns)
    for split in (X_train, X_val, X_test)
)

# Hyperparameter Tuning using GridSearchCV
# Each solver gets its own sub-grid so that only valid solver/penalty pairs are
# tried: lbfgs does not support the l1 penalty, whereas liblinear handles both
# l1 and l2. A single crossed grid would schedule l1+lbfgs fits that fail on
# every fold.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': c_values},
]

# class_weight='balanced' reweights the classes; the search is scored by
# ROC AUC with 5-fold cross-validation on the scaled training data.
logistic_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=2000)
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best estimator found by the search; used for all evaluation below.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Cross-Validation: 5-fold ROC AUC of the tuned model on the training data,
# reported as mean and standard deviation across folds.
cv_scores = cross_val_score(
    estimator=best_model,
    X=X_train_scaled,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print("Cross-Validation ROC AUC:", cv_mean, "±", cv_std)

# Model Evaluation on Validation Set
# Hard class labels feed the threshold-based metrics; the positive-class
# probability (column 1 of predict_proba) feeds ROC AUC.
y_pred_val = best_model.predict(X_val_scaled)
y_pred_proba_val = best_model.predict_proba(X_val_scaled)[:, 1]

print("\nValidation Set Metrics:")
# Label-based metrics share the same (y_true, y_pred) call shape, so report
# them from a small table instead of one print per metric.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_val, y_pred_val))
print("ROC AUC:", roc_auc_score(y_val, y_pred_proba_val))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred_val))

# ROC Curve for the validation set, with the area under it shown in the legend.
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba_val)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal drawn as a reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

# Feature Importance Analysis
# Rank features by the absolute value of their logistic-regression coefficient
# (first row of coef_), largest first, and show the top ten.
coefficients = best_model.coef_[0]
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': np.abs(coefficients)})
    .sort_values(by='Importance', ascending=False)
)
print("\nFeature Importance:")
print(feature_importance.head(10))

# Apply the tuned model to the scaled test data: positive-class probability
# (column 1 of predict_proba) and the predicted class label.
y_pred_proba_test = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred_test = best_model.predict(X_test_scaled)

# Collect both outputs side by side, one row per test sample.
test_results = pd.DataFrame(
    {
        'Attrition_Probability': y_pred_proba_test,
        'Attrition_Prediction': y_pred_test,
    }
)
print("\nTest set predictions:")
print(test_results.head())

# Optional: uncomment to persist the predictions as a CSV file.
# test_results.to_csv('test_predictions_logistic_regression.csv', index=False)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Attrition
- Dependents_Ratio
- Distance_Bin_Far
- Distance_Bin_Medium
- Distance_Bin_Near
- ...
