# Feature Engineering and Modelling

---

1. Import packages
2. Load data
3. Modelling

---

## 1. Import packages

In [29]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt

# Shows plots in jupyter notebook
%matplotlib inline

# Set plot style
sns.set(color_codes=True)

---
## 2. Load data

In [31]:
df = pd.read_csv('data_for_predictions.csv')
df.drop(columns=["Unnamed: 0"], inplace=True)
df.head()

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,forecast_price_pow_off_peak,...,months_modif_prod,months_renewal,channel_MISSING,channel_ewpakwlliwisiwduibdlfmalxowmwpci,channel_foosdfpfkusacimwkcsosbicdxkicaua,channel_lmkebamcaaclubfxadlmueccxoimlema,channel_usilxuppasemubllopkaafesmlibmsdf,origin_up_kamkkxfxxuwbdslkwifmmcsiusiuosws,origin_up_ldkssxwpmemidmecebumciepifcamkci,origin_up_lxidpiddsbxsbosboudacockeimpuepw
0,24011ae4ebbe3035111d65fa7c15bc57,0.0,4.739944,0.0,0.0,0.0,0.444045,0.114481,0.098142,40.606701,...,2,6,0,0,1,0,0,0,0,1
1,d29c2c54acc38ff3c0614d0a653813dd,3.668479,0.0,0.0,2.28092,0.0,1.237292,0.145711,0.0,44.311378,...,76,4,1,0,0,0,0,1,0,0
2,764c75f661154dac3a6c254cd082ea7d,2.736397,0.0,0.0,1.689841,0.0,1.599009,0.165794,0.087899,44.311378,...,68,8,0,0,1,0,0,1,0,0
3,bba03439a292a1e166f80264c16191cb,3.200029,0.0,0.0,2.382089,0.0,1.318689,0.146694,0.0,44.311378,...,69,9,0,0,0,1,0,1,0,0
4,149d57cf92fc41cf94415803a877cb4b,3.646011,0.0,2.721811,2.650065,0.0,2.122969,0.1169,0.100015,40.606701,...,71,9,1,0,0,0,0,1,0,0


In [32]:
df.shape

(14606, 63)

---

## 3. Modelling

We now have a dataset containing features that we have engineered and we are ready to start training a predictive model. Remember, we only need to focus on training a `Random Forest` classifier.

In [33]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Data sampling

The first thing we want to do is split our dataset into training and test samples. The reason why we do this, is so that we can simulate a real life situation by generating predictions for our test sample, without showing the predictive model these data points. This gives us the ability to see how well our model is able to generalise to new data, which is critical.

A typical % to dedicate to testing is between 20-30, for this example we will use a 75-25% split between train and test respectively.

In [34]:
# Feature Selection - Add this BEFORE data splitting

# 1. Check feature importance from a quick Random Forest
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Separate features and target
y = df['churn']
X = df.drop(columns=['id', 'churn'])

# Train a quick Random Forest to get feature importance
quick_rf = RandomForestClassifier(n_estimators=50, random_state=42)
quick_rf.fit(X, y)

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': quick_rf.feature_importances_
}).sort_values('importance', ascending=False)

print("=== FEATURE IMPORTANCE ===")
print(feature_importance.head(20))  # Show top 20 features

# 2. Alternative: Check correlation with target
correlation_with_target = X.corrwith(y).abs().sort_values(ascending=False)
print("\n=== CORRELATION WITH CHURN ===")
print(correlation_with_target.head(20))

# 3. Remove low importance features
# Method 1: Keep only top N features
top_n_features = 30  # Adjust this number
top_features = feature_importance.head(top_n_features)['feature'].tolist()

# Method 2: Remove features with very low importance
importance_threshold = 0.01  # Features with <1% importance
important_features = feature_importance[feature_importance['importance'] > importance_threshold]['feature'].tolist()

print(f"\n=== FEATURE SELECTION RESULTS ===")
print(f"Original features: {len(X.columns)}")
print(f"Top {top_n_features} features: {len(top_features)}")
print(f"Features above {importance_threshold} threshold: {len(important_features)}")

# Choose which method to use (let's use the threshold method)
selected_features = important_features  # or use top_features

# Create new dataset with selected features
X_selected = X[selected_features]
print(f"\nSelected {len(selected_features)} features for modeling")
print("Selected features:", selected_features[:10], "...")  # Show first 10

=== FEATURE IMPORTANCE ===
                                   feature  importance
0                                 cons_12m    0.053877
5                  forecast_meter_rent_12m    0.053481
14                              net_margin    0.053208
3                        forecast_cons_12m    0.051403
12                      margin_net_pow_ele    0.049235
11                    margin_gross_pow_ele    0.046075
15                                 pow_max    0.038668
2                          cons_last_month    0.037646
10                                imp_cons    0.032539
51                       months_modif_prod    0.029954
49                            months_activ    0.028915
38         off_peak_mid_peak_var_mean_diff    0.028512
36             off_peak_peak_var_mean_diff    0.028110
22                 var_year_price_off_peak    0.027233
34         offpeak_diff_dec_january_energy    0.026689
16             var_year_price_off_peak_var    0.025293
6           forecast_price_energy_off_

In [35]:
# Update your data splitting section
# Separate target variable from selected features
y = df['churn']
X = X_selected  # Use selected features instead of all features

print(f"Final feature set shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Continue with your existing train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Final feature set shape: (14606, 32)
Target shape: (14606,)
(10954, 32)
(10954,)
(3652, 32)
(3652,)


### Model training

Once again, we are using a `Random Forest` classifier in this example. A Random Forest sits within the category of `ensemble` algorithms because internally the `Forest` refers to a collection of `Decision Trees` which are tree-based learning algorithms. As the data scientist, you can control how large the forest is (that is, how many decision trees you want to include).

The reason why an `ensemble` algorithm is powerful is because of the laws of averaging, weak learners and the central limit theorem. If we take a single decision tree and give it a sample of data and some parameters, it will learn patterns from the data. It may be overfit or it may be underfit, but that is now our only hope, that single algorithm. 

With `ensemble` methods, instead of banking on 1 single trained model, we can train 1000's of decision trees, all using different splits of the data and learning different patterns. It would be like asking 1000 people to all learn how to code. You would end up with 1000 people with different answers, methods and styles! The weak learner notion applies here too, it has been found that if you train your learners not to overfit, but to learn weak patterns within the data and you have a lot of these weak learners, together they come together to form a highly predictive pool of knowledge! This is a real life application of many brains are better than 1.

Now instead of relying on 1 single decision tree for prediction, the random forest puts it to the overall views of the entire collection of decision trees. Some ensemble algorithms using a voting approach to decide which prediction is best, others using averaging. 

As we increase the number of learners, the idea is that the random forest's performance should converge to its best possible solution.

Some additional advantages of the random forest classifier include:

- The random forest uses a rule-based approach instead of a distance calculation and so features do not need to be scaled
- It is able to handle non-linear parameters better than linear based models

On the flip side, some disadvantages of the random forest classifier include:

- The computational power needed to train a random forest on a large dataset is high, since we need to build a whole ensemble of estimators.
- Training time can be longer due to the increased complexity and size of thee ensemble

In [36]:
# Add model training in here!
model = RandomForestClassifier() # Add parameters to the model!
model.fit(X_train,y_train) # Complete this method call!

### Evaluation

Now let's evaluate how well this trained model is able to predict the values of the test dataset.

In [37]:
# Check churn column distribution
print("=== CHURN DISTRIBUTION ===")
print(df['churn'].value_counts())
print("\nPercentages:")
print(df['churn'].value_counts(normalize=True) * 100)

=== CHURN DISTRIBUTION ===
churn
0    13187
1     1419
Name: count, dtype: int64

Percentages:
churn
0    90.284814
1     9.715186
Name: proportion, dtype: float64


In [38]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Check original distribution of churn
print("=== BEFORE SMOTE ===")
print("Training set distribution:")
print(Counter(y_train))
print(f"Percentage - No Churn: {(Counter(y_train)[0]/len(y_train))*100:.1f}%")
print(f"Percentage - Churn: {(Counter(y_train)[1]/len(y_train))*100:.1f}%")

smote = SMOTE(random_state=42, sampling_strategy='auto')
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Check new distribution
print("\n=== AFTER SMOTE ===")
print("Balanced training set distribution:")
print(Counter(y_train_balanced))
print(f"Percentage - No Churn: {(Counter(y_train_balanced)[0]/len(y_train_balanced))*100:.1f}%")
print(f"Percentage - Churn: {(Counter(y_train_balanced)[1]/len(y_train_balanced))*100:.1f}%")

print(f"\nOriginal training size: {X_train.shape}")
print(f"Balanced training size: {X_train_balanced.shape}")

# Train new model with balanced data
model_balanced = RandomForestClassifier(random_state=42)
model_balanced.fit(X_train_balanced, y_train_balanced)



=== BEFORE SMOTE ===
Training set distribution:
Counter({0: 9901, 1: 1053})
Percentage - No Churn: 90.4%
Percentage - Churn: 9.6%

=== AFTER SMOTE ===
Balanced training set distribution:
Counter({1: 9901, 0: 9901})
Percentage - No Churn: 50.0%
Percentage - Churn: 50.0%

Original training size: (10954, 32)
Balanced training size: (19802, 32)


In [39]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
import numpy as np

print("=== HYPERPARAMETER TUNING ===")

# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample'],
    'random_state': [42]
}

# For faster tuning, use RandomizedSearchCV instead of GridSearchCV
print("Starting hyperparameter tuning...")
print("This may take a few minutes...")

# Use F1 score as the scoring metric (good for imbalanced data)
rf_tuned = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_grid,
    n_iter=20,  # Number of parameter combinations to try
    cv=3,       # 3-fold cross-validation
    scoring='f1',  # Use F1 score for evaluation
    n_jobs=-1,  # Use all available cores
    random_state=42,
    verbose=1   # Show progress
)

# Fit on balanced training data
rf_tuned.fit(X_train_balanced, y_train_balanced)

print(f"\n=== BEST PARAMETERS FOUND ===")
print(rf_tuned.best_params_)
print(f"Best Cross-Validation F1 Score: {rf_tuned.best_score_:.4f}")

# Get the best model
model_tuned = rf_tuned.best_estimator_
print(f"\nBest model: {model_tuned}")

# Compare with original balanced model
print(f"\n=== TRAINING BOTH MODELS FOR COMPARISON ===")
print("Training original balanced model...")
model_balanced = RandomForestClassifier(random_state=42)
model_balanced.fit(X_train_balanced, y_train_balanced)

print("Best tuned model is already trained from GridSearch!")

=== HYPERPARAMETER TUNING ===
Starting hyperparameter tuning...
This may take a few minutes...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

=== BEST PARAMETERS FOUND ===
{'random_state': 42, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'class_weight': 'balanced_subsample'}
Best Cross-Validation F1 Score: 0.9369

Best model: RandomForestClassifier(class_weight='balanced_subsample', random_state=42)

=== TRAINING BOTH MODELS FOR COMPARISON ===
Training original balanced model...
Best tuned model is already trained from GridSearch!


In [40]:
# Generate predictions here!    

print("=== GENERATING PREDICTIONS FOR ALL MODELS ===")

# ORIGINAL MODEL predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# BALANCED MODEL predictions (with SMOTE)
y_train_pred_balanced = model_balanced.predict(X_train_balanced)
y_test_pred_balanced = model_balanced.predict(X_test)

# TUNED MODEL predictions (with SMOTE + hyperparameter tuning)
y_train_pred_tuned = model_tuned.predict(X_train_balanced)
y_test_pred_tuned = model_tuned.predict(X_test)

print("Original model predictions shape:", y_test_pred.shape)
print("Balanced model predictions shape:", y_test_pred_balanced.shape)
print("Tuned model predictions shape:", y_test_pred_tuned.shape)

print("\nFirst 10 predictions comparison:")
print(f"Original: {y_test_pred[:10]}")
print(f"Balanced: {y_test_pred_balanced[:10]}")
print(f"Tuned:    {y_test_pred_tuned[:10]}")

=== GENERATING PREDICTIONS FOR ALL MODELS ===
Original model predictions shape: (3652,)
Balanced model predictions shape: (3652,)
Tuned model predictions shape: (3652,)

First 10 predictions comparison:
Original: [0 0 0 0 0 0 0 0 0 0]
Balanced: [0 0 0 0 0 0 0 1 0 0]
Tuned:    [0 0 0 0 0 1 0 1 0 0]


In [41]:
# Calculate performance metrics here!

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# ==================== ALL MODEL METRICS ====================
print("=== COMPLETE MODEL PERFORMANCE COMPARISON ===")

# Original model metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

# Balanced model metrics
test_accuracy_balanced = accuracy_score(y_test, y_test_pred_balanced)
test_precision_balanced = precision_score(y_test, y_test_pred_balanced)
test_recall_balanced = recall_score(y_test, y_test_pred_balanced)
test_f1_balanced = f1_score(y_test, y_test_pred_balanced)

# Tuned model metrics
test_accuracy_tuned = accuracy_score(y_test, y_test_pred_tuned)
test_precision_tuned = precision_score(y_test, y_test_pred_tuned)
test_recall_tuned = recall_score(y_test, y_test_pred_tuned)
test_f1_tuned = f1_score(y_test, y_test_pred_tuned)

# Display results
print(f"ORIGINAL MODEL (Imbalanced Data):")
print(f"  Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")

print(f"\nBALANCED MODEL (With SMOTE):")
print(f"  Accuracy: {test_accuracy_balanced:.4f}, Precision: {test_precision_balanced:.4f}, Recall: {test_recall_balanced:.4f}, F1: {test_f1_balanced:.4f}")

print(f"\nTUNED MODEL (SMOTE + Hyperparameter Tuning):")
print(f"  Accuracy: {test_accuracy_tuned:.4f}, Precision: {test_precision_tuned:.4f}, Recall: {test_recall_tuned:.4f}, F1: {test_f1_tuned:.4f}")

# Show improvement
print(f"\n🎯 FINAL IMPROVEMENT:")
print(f"  Recall: {test_recall:.1%} → {test_recall_balanced:.1%} → {test_recall_tuned:.1%}")
print(f"  F1 Score: {test_f1:.4f} → {test_f1_balanced:.4f} → {test_f1_tuned:.4f}")

# Confusion matrices
cm_original = confusion_matrix(y_test, y_test_pred)
cm_balanced = confusion_matrix(y_test, y_test_pred_balanced)
cm_tuned = confusion_matrix(y_test, y_test_pred_tuned)

print(f"\n=== CONFUSION MATRIX COMPARISON ===")
print(f"Original:  Missed {cm_original[1,0]}, Caught {cm_original[1,1]} churners")
print(f"Balanced:  Missed {cm_balanced[1,0]}, Caught {cm_balanced[1,1]} churners")
print(f"Tuned:     Missed {cm_tuned[1,0]}, Caught {cm_tuned[1,1]} churners")

print(f"\n=== FINAL TUNED MODEL CLASSIFICATION REPORT ===")
print(classification_report(y_test, y_test_pred_tuned, target_names=['Not Churn', 'Churn']))

=== COMPLETE MODEL PERFORMANCE COMPARISON ===
ORIGINAL MODEL (Imbalanced Data):
  Accuracy: 0.9031, Precision: 0.7143, Recall: 0.0546, F1: 0.1015

BALANCED MODEL (With SMOTE):
  Accuracy: 0.8946, Precision: 0.4240, Recall: 0.1448, F1: 0.2159

TUNED MODEL (SMOTE + Hyperparameter Tuning):
  Accuracy: 0.8949, Precision: 0.4274, Recall: 0.1448, F1: 0.2163

🎯 FINAL IMPROVEMENT:
  Recall: 5.5% → 14.5% → 14.5%
  F1 Score: 0.1015 → 0.2159 → 0.2163

=== CONFUSION MATRIX COMPARISON ===
Original:  Missed 346, Caught 20 churners
Balanced:  Missed 313, Caught 53 churners
Tuned:     Missed 313, Caught 53 churners

=== FINAL TUNED MODEL CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

   Not Churn       0.91      0.98      0.94      3286
       Churn       0.43      0.14      0.22       366

    accuracy                           0.89      3652
   macro avg       0.67      0.56      0.58      3652
weighted avg       0.86      0.89      0.87      3652

