# 1. Global Variables

## Packages

In [83]:
# DATA MANIPULATION
import os
import copy
import pandas as pd
import numpy as np
from datetime import datetime
import time

# DATA VIZ
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the FiveThirtyEight style sheet, then set the default figure size,
# resolution and background colour in a single rcParams update.
plt.style.use("fivethirtyeight")
plt.rcParams.update({
    "figure.figsize": [8, 5],
    "figure.dpi": 100,
    "figure.facecolor": "white",
})

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# MACHINE LEARNING MODELLING
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')
# NOTE(review): this runs after plt.style.use(...) and the rcParams settings earlier
# in this cell; seaborn's theme setter presumably rewrites some matplotlib rcParams,
# so confirm that this ordering is intended.
sns.set(color_codes=True)

In [55]:
# Discrete palette; registered below as seaborn's default colour cycle.
POWERCO_COLOURS = [
    "#0072CE",
    "#B4B4B3",
    "#79B8F3",
    "#FDB927",
    "#F7941D",
    "#4CB748",
    "#2E3192",
]

# Light-to-dark blue ramp, passed to Plotly as a continuous colour scale.
DIVERGENT_COLOUR_GRADIENT = [
    "#e2f1fc",
    "#b9dcfa",
    "#8cc7f7",
    "#5eb1f3",
    "#39a0f1",
    "#0691ef",
]

sns.set_palette(POWERCO_COLOURS)

## Helpers

In [56]:
def calculate_and_view_vif(data, columns):
    """Compute the Variance Inflation Factor of each column and show it as a bar chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains (at least) every column named in ``columns``.
    columns : sequence of str
        Names of the variables whose VIF should be computed.

    Returns
    -------
    None
        The chart is displayed with ``fig.show()``; nothing is returned.

    Notes
    -----
    The selected columns are passed to ``variance_inflation_factor`` exactly as
    they are. NOTE(review): no intercept column is added here; if the VIF is
    meant to be computed against a model with a constant term, add one before
    calling -- confirm against the statsmodels documentation.
    """
    columns = list(columns)

    # Build the design matrix once instead of re-slicing the frame for every column.
    design_matrix = data[columns].values

    vif_data = pd.DataFrame({
        "Variable": columns,
        "VIF": [
            variance_inflation_factor(design_matrix, position)
            for position in range(len(columns))
        ],
    })

    fig = px.bar(
        vif_data.sort_values(by='VIF', ascending=False),
        x='Variable',
        y='VIF',
        title="Variables' Variance Inflation Factor"
    )

    fig.update_layout(
        xaxis_title='Variables',
        yaxis_title='Variance Inflation Factor'
    )

    fig.show()


### Model Helpers

In [72]:
def basic_rf_eval(model, test_data, true_y):
    """Print accuracy, precision, confusion matrix and classification report.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(test_data)``.
    test_data : array-like
        Feature matrix to predict on.
    true_y : array-like
        Ground-truth labels matching ``test_data`` row for row.

    Returns
    -------
    None
        The metrics are written to stdout.
    """
    pred_y = model.predict(test_data)

    # Use the built-in round(): depending on the scikit-learn version,
    # accuracy_score may return a plain Python float, which has no .round() method.
    accuracy = round(float(accuracy_score(true_y, pred_y)), 2)
    precision = metrics.precision_score(true_y, pred_y)
    confusion_matrix_ = confusion_matrix(true_y, pred_y)
    classification_report_ = classification_report(true_y, pred_y)

    # The four-space lines between sections are kept so the printed layout is unchanged.
    print(
        f'Accuracy = {accuracy};\n'
        f'    \nPrecision = {precision};\n'
        f'    \nConfusion Matrix: \n{confusion_matrix_};\n'
        f'    \nClassification Report:\n{classification_report_}'
    )

In [58]:
def create_feature_importance_df(rf_model, feature_list):
    """Pair each feature name with its importance, rounded to two decimals.

    Parameters
    ----------
    rf_model : fitted estimator
        Object exposing a ``feature_importances_`` sequence.
    feature_list : iterable of str
        Feature names, in the same order as the model's importances.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Variable`` and ``Importance``, one row per pair.
    """
    names = list(feature_list)
    scores = list(rf_model.feature_importances_)

    rows = []
    for name, score in zip(names, scores):
        rows.append((name, round(score, 2)))

    return pd.DataFrame(rows, columns=['Variable', 'Importance'])

In [59]:
def view_feature_importances(importance_df):
    """Show a bar chart of ``Importance`` per ``Variable``.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Frame with a ``Variable`` column and an ``Importance`` column.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    bar_options = dict(
        x='Variable',
        y='Importance',
        labels={'Variable': 'Variable', 'Importance': 'Importance'},
        title='Variable Importances',
    )
    layout_options = dict(
        # Order the categories by descending bar total.
        xaxis={'categoryorder': 'total descending'},
        yaxis_title='Importances',
        xaxis_title='Variables',
        # Centre the title horizontally.
        title={'x': 0.5},
    )

    fig = px.bar(importance_df, **bar_options)
    fig.update_layout(**layout_options)
    fig.show()

# 2. Load data

In [60]:
# Read the prepared dataset and discard the "Unnamed: 0" column
# (presumably a row index that was saved into the CSV).
df = pd.read_csv('./data_for_predictions.csv').drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,forecast_price_pow_off_peak,...,months_modif_prod,months_renewal,channel_MISSING,channel_ewpakwlliwisiwduibdlfmalxowmwpci,channel_foosdfpfkusacimwkcsosbicdxkicaua,channel_lmkebamcaaclubfxadlmueccxoimlema,channel_usilxuppasemubllopkaafesmlibmsdf,origin_up_kamkkxfxxuwbdslkwifmmcsiusiuosws,origin_up_ldkssxwpmemidmecebumciepifcamkci,origin_up_lxidpiddsbxsbosboudacockeimpuepw
0,24011ae4ebbe3035111d65fa7c15bc57,0.0,4.739944,0.0,0.0,0.0,0.444045,0.114481,0.098142,40.606701,...,2,6,0,0,1,0,0,0,0,1
1,d29c2c54acc38ff3c0614d0a653813dd,3.668479,0.0,0.0,2.28092,0.0,1.237292,0.145711,0.0,44.311378,...,76,4,1,0,0,0,0,1,0,0
2,764c75f661154dac3a6c254cd082ea7d,2.736397,0.0,0.0,1.689841,0.0,1.599009,0.165794,0.087899,44.311378,...,68,8,0,0,1,0,0,1,0,0
3,bba03439a292a1e166f80264c16191cb,3.200029,0.0,0.0,2.382089,0.0,1.318689,0.146694,0.0,44.311378,...,69,9,0,0,0,1,0,1,0,0
4,149d57cf92fc41cf94415803a877cb4b,3.646011,0.0,2.721811,2.650065,0.0,2.122969,0.1169,0.100015,40.606701,...,71,9,1,0,0,0,0,1,0,0


# 3. Feature Selection

Our feature engineering has given us almost 70 variables for this problem, and we now have to choose the ones we think will be useful for our model. As we do this, we have to be careful of **`multi-collinearity`**, meaning we shouldn't just throw in variables that are highly correlated with one another.

That said, we're going to do feature selection in this manner:
- [x] Identify variables that are reasonably correlated with churn
- [ ] Use the Random Forest algorithm to identify important features
- [ ] Use the Variance Inflation Factor to measure variables' multi-collinearity

In [61]:
# Pairwise correlation matrix of every column except `id`.
df_corr = df.drop(columns=['id']).corr()

In [62]:
# Heatmap of the pairwise correlation matrix, labelled with the column names on both axes.
fig = px.imshow(
    df_corr,
    x=df_corr.columns,
    y=df_corr.columns,
    labels={'x': 'Features', 'y': 'Features', 'color': 'Correlation'},
    color_continuous_scale=DIVERGENT_COLOUR_GRADIENT,
    title='Correlation Matrix',
)

# A large square canvas so the many feature labels have room.
fig.update_layout(width=1000, height=1000)

fig.show()

In [63]:
# Correlate every feature with the target. The string `id` column is dropped first,
# as is done for the full correlation matrix: pandas versions that no longer
# silently skip non-numeric columns raise when asked to correlate it.
# `churn` itself stays in the result and sorts to the top with a correlation of 1.0.
churn_corr_df = (
    df.drop(columns=['id'])
    .corrwith(df['churn'])
    .sort_values(ascending=False)
    .reset_index()
)
churn_corr_df.columns = ['Variable', 'Correlation']
churn_corr_df.head()

Unnamed: 0,Variable,Correlation
0,churn,1.0
1,margin_net_pow_ele,0.095772
2,margin_gross_pow_ele,0.095725
3,origin_up_lxidpiddsbxsbosboudacockeimpuepw,0.094131
4,channel_foosdfpfkusacimwkcsosbicdxkicaua,0.075964


In [64]:
# Skip the first row: after the descending sort it is churn's correlation with itself.
fig = px.histogram(
    churn_corr_df.iloc[1:],
    x='Correlation',
    title='Correlation of Churn with Other Variables',
)

fig.update_layout(
    yaxis_title='Number of Variables',
    xaxis_title='Correlations with Churn',
)

fig.show()

These correlations are too small to matter: none of them exceeds 0.1 in absolute value. I'm going to use the same `Random Forest` algorithm to find important features, then feed them to the algorithm to build a predictive model.

# 4. Model Iterations

So we now have a dataset containing features that we have engineered and we are ready to start training a predictive model.

We are going to focus on a **`Random Forest`** for the following reasons:
- It is easy to understand because it's just an ensemble of decision trees, which are very intuitive, especially for a classification problem
- It's easy to explain as it can return the importance of features in its decision-making
- The random forest uses a rule-based approach instead of a distance calculation and so features do not need to be scaled
- It is able to handle non-linear parameters better than linear based models

On the flip side, some disadvantages of the random forest classifier include:
- The computational power needed to train a random forest on a large dataset is high, since we need to build a whole ensemble of estimators.
- Training time can be longer due to the increased complexity and size of the ensemble

In [65]:
train_df = df.copy()

# Separate the target from the predictors; `id` is not used as a predictor.
predictor_df = df.drop(columns=['id', 'churn'])
feature_list = predictor_df.columns
features = np.array(predictor_df)
labels = np.array(df['churn'])

# 75/25 split, stratified on the label so both parts keep the same churn ratio.
(
    train_features,
    test_features,
    train_labels,
    test_labels,
) = train_test_split(
    features,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=42,
)

## Base Model

We are now going to create a baseline model with no hyperparameter tuning and using all independent variables in the data.

In [66]:
# Baseline: 1000 trees, default hyperparameters otherwise, fixed seed for repeatability.
base_all_vars_model = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
)
base_all_vars_model.fit(train_features, train_labels)

Let's now see how this has performed

In [74]:
# Score the all-variables baseline on the held-out split.
basic_rf_eval(
    model=base_all_vars_model,
    test_data=test_features,
    true_y=test_labels,
)

Accuracy = 0.91;
    
Precision = 0.7631578947368421;
    
Confusion Matrix: 
[[3288    9]
 [ 326   29]];
    
Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3297
           1       0.76      0.08      0.15       355

    accuracy                           0.91      3652
   macro avg       0.84      0.54      0.55      3652
weighted avg       0.90      0.91      0.87      3652



In [75]:
# Tabulate, then plot, the importances learned by the all-variables baseline.
base_all_vars_importance_df = create_feature_importance_df(
    rf_model=base_all_vars_model,
    feature_list=feature_list,
)
view_feature_importances(importance_df=base_all_vars_importance_df)

So the feature engineering was not useful at all because we still have the same problem as in the initial iterations, i.e. the model has very low sensitivity on churned customers. The feature importances are also very low. Could the model be overfit due to the use of many useless variables?

I don't feel comfortable making decisions based on a poor model, but I'm just going to try and experiment with the top 8 variables according to the above graph and see if anything will change.

Before throwing these features into the model, however, I'm going to calculate the VIF of each of them because I'm pretty sure that they're not all independent of each other.

In [76]:
# Shortlisted variables. The two commented-out names are kept for reference.
# NOTE(review): they were presumably excluded after checking their VIF -- confirm.
top_features = [
    'net_margin',
    # 'margin_net_pow_ele',
    'margin_gross_pow_ele',
    'forecast_meter_rent_12m',
    # 'forecast_cons_12m',
    'cons_12m',
    'pow_max',
    'cons_last_month',
]

calculate_and_view_vif(data=df, columns=top_features)

In [77]:
# Re-split using only the shortlisted columns, with the same ratio, seed and
# stratification as the all-variables split. This rebinds train_features and
# test_features, so they now hold the reduced feature set.
features01 = df[top_features].to_numpy()

(
    train_features,
    test_features,
    train_labels,
    test_labels,
) = train_test_split(
    features01,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=42,
)

In [78]:
# Same forest configuration as the baseline, trained on the shortlisted features only.
base_model = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
)
base_model.fit(train_features, train_labels)

In [79]:
# Score the reduced-feature model on the held-out split.
basic_rf_eval(
    model=base_model,
    test_data=test_features,
    true_y=test_labels,
)

Accuracy = 0.91;
    
Precision = 0.7575757575757576;
    
Confusion Matrix: 
[[3289    8]
 [ 330   25]];
    
Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3297
           1       0.76      0.07      0.13       355

    accuracy                           0.91      3652
   macro avg       0.83      0.53      0.54      3652
weighted avg       0.89      0.91      0.87      3652



In [82]:
# Tabulate, then plot, the importances learned by the reduced-feature model.
base_importance_df = create_feature_importance_df(
    rf_model=base_model,
    feature_list=top_features,
)
view_feature_importances(importance_df=base_importance_df)

Removing some features seems to have done something, i.e. some features now appear useful; however, this came at the cost of precision and recall on churned customers.

# 5. Hyperparameter Tuning