# Findings and Recommendations

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, 
    roc_auc_score, precision_score, recall_score, f1_score
)
from imblearn.combine import SMOTETomek

In [None]:
# Read the raw prediction table and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV -- confirm).
data = pd.read_csv('../data/raw/data_for_predictions.csv').drop(columns=['Unnamed: 0'])

# Quick sanity checks, printed in order:
#   1. a three-row preview of the frame,
#   2. the number of nulls in every column,
#   3. the relative frequency of each `churn` label.
preview = data.head(3)
print(preview)

null_counts = data.isnull().sum()
print(null_counts)

churn_share = data['churn'].value_counts(normalize=True)
print(churn_share)

In [None]:
# Derived features
# total_cons: element-wise sum of the `cons_12m` and `cons_gas_12m` columns.
data['total_cons'] = data['cons_12m'].add(data['cons_gas_12m'])
# price_sensitivity: peak forecast energy price minus the off-peak one.
data['price_sensitivity'] = data['forecast_price_energy_peak'].sub(
    data['forecast_price_energy_off_peak']
)

# Target vector, then the feature matrix with the target and the `id`
# column removed.
y = data['churn']
X = data.drop(columns=['churn', 'id'])

# Column groups used by the preprocessing step: 64-bit numeric columns are
# treated as numeric, object-dtype columns as categorical. Columns of any
# other dtype end up in neither group.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']
numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=categorical_dtypes).columns

In [None]:
# Split FIRST, on the raw, un-resampled data.
# Fitting the scaler/encoder or running SMOTE-Tomek before the split leaks
# test-set information into training and places synthetic rows in the test
# set, which inflates every reported metric. The held-out rows below are only
# ever passed through `transform`, never through `fit` or `fit_resample`.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing pipeline: standardise numeric columns, one-hot encode the
# categorical ones (categories unseen during fit are encoded as all zeros).
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Learn scaling statistics and category vocabularies from the training rows
# only, then apply the fitted transformer unchanged to the test rows.
X_train_processed = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Handle class imbalance on the training portion only. The test set keeps
# its original class distribution so the evaluation reflects real data.
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train_processed, y_train_raw)

# Names consumed by the modelling cells below.
X_train, y_train = X_resampled, y_resampled

In [None]:
# Baseline model: a random forest with balanced class weights and a fixed
# seed. `fit` returns the estimator itself, so construction and training are
# chained into a single expression.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Hard label predictions for the held-out split.
y_pred = rf_model.predict(X_test)

In [None]:
# Score the baseline forest on the held-out split.
# ROC-AUC is computed from the probability of the second class (column 1 of
# `predict_proba`); every other metric uses the hard label predictions.
y_score = rf_model.predict_proba(X_test)[:, 1]
scorers = [
    ('Accuracy', accuracy_score, y_pred),
    ('ROC-AUC Score', roc_auc_score, y_score),
    ('Precision', precision_score, y_pred),
    ('Recall', recall_score, y_pred),
    ('F1-Score', f1_score, y_pred),
]
metrics = {label: scorer(y_test, estimate) for label, scorer, estimate in scorers}

# One line per metric, four decimal places.
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

# Confusion matrix rendered as an annotated heatmap
# (rows are the actual labels, columns the predicted ones).
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 table.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

In [None]:
# Hyperparameter tuning: exhaustive search over forest size, tree depth and
# the two minimum-sample constraints (3 * 4 * 3 * 3 = 108 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Shuffled, seeded 5-fold stratified splitter so the folds are reproducible.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
base_forest = RandomForestClassifier(random_state=42)

# Candidates are ranked by ROC-AUC; n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv_splitter,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Take the winning estimator from the grid search (no further training is
# done in this cell) and predict on the held-out split.
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

# Same metric set as for the baseline, with "Best "-prefixed labels.
# ROC-AUC uses the probability of the second class; the rest use hard labels.
y_score_best = best_rf.predict_proba(X_test)[:, 1]
best_scorers = [
    ('Best Accuracy', accuracy_score, y_pred_best),
    ('Best ROC-AUC Score', roc_auc_score, y_score_best),
    ('Best Precision', precision_score, y_pred_best),
    ('Best Recall', recall_score, y_pred_best),
    ('Best F1-Score', f1_score, y_pred_best),
]
best_metrics = {
    label: scorer(y_test, estimate) for label, scorer, estimate in best_scorers
}

# One line per metric, four decimal places.
for metric, value in best_metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Feature Importance
# The model was trained on the ColumnTransformer output, whose columns are
# the 'num' block followed by the 'cat' block (transformer order). The names
# are rebuilt from the already-fitted `preprocessor`. It must NOT be refitted
# here, and `X_train` must not be overwritten: it holds the transformed
# matrix, not the raw columns of `X`.

# Plain list, so that `+` below concatenates. Adding a pandas Index to a list
# performs element-wise addition instead.
num_feature_names = list(numeric_features)

# With no categorical columns the encoder is never fitted, so only ask it for
# output names when it actually received columns.
if len(categorical_features) > 0:
    cat_feature_names = (
        preprocessor.named_transformers_['cat']
        .get_feature_names_out(categorical_features)
        .tolist()
    )
else:
    cat_feature_names = []

feature_names = num_feature_names + cat_feature_names

# Guard against a silent mislabelling of importances.
importances = best_rf.feature_importances_
if len(feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match the number "
        f"of model importances ({len(importances)})"
    )

# One row per model input column, most important first.
feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

In [None]:
# Horizontal bar chart of the 15 highest-ranked features
# (`feature_importances` is already sorted in descending order).
top_features = feature_importances.head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis', ax=ax)
ax.set_title('Top 15 Feature Importances')
plt.show()
