# Feature Engineering

---

1. Import packages
2. Load data
3. Feature engineering

---

## 1. Import packages

In [76]:
# DATA MANIPULATION
import os
import copy
import pandas as pd
import numpy as np

# DATA VIZ
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"] = [8, 5]
plt.rcParams["figure.dpi"] = 100
plt.rcParams["figure.facecolor"] = "white"

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# MACHINE LEARNING MODELLING
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings('ignore')
sns.set(color_codes=True)

In [77]:
POWERCO_COLOURS = ["#0072CE", "#B4B4B3", "#79B8F3", "#FDB927", "#F7941D", "#4CB748", "#2E3192"]
DIVERGENT_COLOUR_GRADIENT = ["#e2f1fc", "#b9dcfa", "#8cc7f7", "#5eb1f3", "#39a0f1", "#0691ef"]
sns.set_palette(POWERCO_COLOURS)

---
## 2. Load data

In [78]:
df = pd.read_csv('./clean_data_after_eda.csv')
df["date_activ"] = pd.to_datetime(df["date_activ"], format='%Y-%m-%d')
df["date_end"] = pd.to_datetime(df["date_end"], format='%Y-%m-%d')
df["date_modif_prod"] = pd.to_datetime(df["date_modif_prod"], format='%Y-%m-%d')
df["date_renewal"] = pd.to_datetime(df["date_renewal"], format='%Y-%m-%d')

In [82]:
def map_column_to_integers(df, column_names):
    for column_name in column_names:
      factorized_values, unique_values = pd.factorize(df[column_name][df[column_name] != 'MISSING'])

      mapping = dict(zip(unique_values, range(len(unique_values))))
      mapping['MISSING'] = -1

      df[column_name] = df[column_name].map(mapping)
    return df

In [83]:
df.head(3)

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,var_6m_price_off_peak_var,var_6m_price_peak_var,var_6m_price_mid_peak_var,var_6m_price_off_peak_fix,var_6m_price_peak_fix,var_6m_price_mid_peak_fix,var_6m_price_off_peak,var_6m_price_peak,var_6m_price_mid_peak,churn
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,0.000131,4.100838e-05,0.000908,2.086294,99.530517,44.235794,2.086425,99.53056,44.236702,1
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,3e-06,0.001217891,0.0,0.009482,0.0,0.0,0.009485,0.001217891,0.0,0
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,4e-06,9.45015e-08,0.0,0.0,0.0,0.0,4e-06,9.45015e-08,0.0,0


In [84]:
df.columns

Index(['id', 'channel_sales', 'cons_12m', 'cons_gas_12m', 'cons_last_month',
       'date_activ', 'date_end', 'date_modif_prod', 'date_renewal',
       'forecast_cons_12m', 'forecast_cons_year', 'forecast_discount_energy',
       'forecast_meter_rent_12m', 'forecast_price_energy_off_peak',
       'forecast_price_energy_peak', 'forecast_price_pow_off_peak', 'has_gas',
       'imp_cons', 'margin_gross_pow_ele', 'margin_net_pow_ele', 'nb_prod_act',
       'net_margin', 'num_years_antig', 'origin_up', 'pow_max',
       'var_year_price_off_peak_var', 'var_year_price_peak_var',
       'var_year_price_mid_peak_var', 'var_year_price_off_peak_fix',
       'var_year_price_peak_fix', 'var_year_price_mid_peak_fix',
       'var_year_price_off_peak', 'var_year_price_peak',
       'var_year_price_mid_peak', 'var_6m_price_off_peak_var',
       'var_6m_price_peak_var', 'var_6m_price_mid_peak_var',
       'var_6m_price_off_peak_fix', 'var_6m_price_peak_fix',
       'var_6m_price_mid_peak_fix', 'var_6m_p

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              14606 non-null  object        
 1   channel_sales                   14606 non-null  object        
 2   cons_12m                        14606 non-null  int64         
 3   cons_gas_12m                    14606 non-null  int64         
 4   cons_last_month                 14606 non-null  int64         
 5   date_activ                      14606 non-null  datetime64[ns]
 6   date_end                        14606 non-null  datetime64[ns]
 7   date_modif_prod                 14606 non-null  datetime64[ns]
 8   date_renewal                    14606 non-null  datetime64[ns]
 9   forecast_cons_12m               14606 non-null  float64       
 10  forecast_cons_year              14606 non-null  int64         
 11  fo

In [86]:
client_date_cols = [
    'date_activ',
    'date_end',
    'date_modif_prod',
    'date_renewal'
]
object_cols = df.select_dtypes(include='object').columns.difference(['id'] + client_date_cols)

for col in object_cols:
  print(f"{col}: {df[col].unique()}\n")

channel_sales: ['foosdfpfkusacimwkcsosbicdxkicaua' 'MISSING'
 'lmkebamcaaclubfxadlmueccxoimlema' 'usilxuppasemubllopkaafesmlibmsdf'
 'ewpakwlliwisiwduibdlfmalxowmwpci' 'epumfxlbckeskwekxbiuasklxalciiuu'
 'sddiedcslfslkckwlfkdpoeeailfpeds' 'fixdbufsefwooaasfcxdxadsiekoceaa']

has_gas: ['t' 'f']

origin_up: ['lxidpiddsbxsbosboudacockeimpuepw' 'kamkkxfxxuwbdslkwifmmcsiusiuosws'
 'ldkssxwpmemidmecebumciepifcamkci' 'MISSING'
 'usapbepcfoloekilkwsdiboslwaxobdp' 'ewxeelcelemmiwuafmddpobolfuxioce']



In [87]:
df = map_column_to_integers(df, object_cols)

In [88]:
for col in object_cols:
  print(f"{col}: {df[col].unique()}\n")

channel_sales: [ 0 -1  1  2  3  4  5  6]

has_gas: [0 1]

origin_up: [ 0  1  2 -1  3  4]



In [89]:
cp_df = copy.deepcopy(df)
cp_df[client_date_cols] = cp_df[client_date_cols].apply(lambda x: x.astype(int) / 10**9)
cp_df.columns

Index(['id', 'channel_sales', 'cons_12m', 'cons_gas_12m', 'cons_last_month',
       'date_activ', 'date_end', 'date_modif_prod', 'date_renewal',
       'forecast_cons_12m', 'forecast_cons_year', 'forecast_discount_energy',
       'forecast_meter_rent_12m', 'forecast_price_energy_off_peak',
       'forecast_price_energy_peak', 'forecast_price_pow_off_peak', 'has_gas',
       'imp_cons', 'margin_gross_pow_ele', 'margin_net_pow_ele', 'nb_prod_act',
       'net_margin', 'num_years_antig', 'origin_up', 'pow_max',
       'var_year_price_off_peak_var', 'var_year_price_peak_var',
       'var_year_price_mid_peak_var', 'var_year_price_off_peak_fix',
       'var_year_price_peak_fix', 'var_year_price_mid_peak_fix',
       'var_year_price_off_peak', 'var_year_price_peak',
       'var_year_price_mid_peak', 'var_6m_price_off_peak_var',
       'var_6m_price_peak_var', 'var_6m_price_mid_peak_var',
       'var_6m_price_off_peak_fix', 'var_6m_price_peak_fix',
       'var_6m_price_mid_peak_fix', 'var_6m_p

---

## 3. Feature engineering

### Difference between off-peak prices in December and preceding January

Below is the code created by your colleague to calculate the feature described above. Use this code to re-create this feature and then think about ways to build on this feature to create features with a higher predictive power.

In [90]:
price_df = pd.read_csv('price_data.csv')
price_df["price_date"] = pd.to_datetime(price_df["price_date"], format='%Y-%m-%d')
price_df.head()

Unnamed: 0,id,price_date,price_off_peak_var,price_peak_var,price_mid_peak_var,price_off_peak_fix,price_peak_fix,price_mid_peak_fix
0,038af19179925da21a25619c5a24b745,2015-01-01,0.151367,0.0,0.0,44.266931,0.0,0.0
1,038af19179925da21a25619c5a24b745,2015-02-01,0.151367,0.0,0.0,44.266931,0.0,0.0
2,038af19179925da21a25619c5a24b745,2015-03-01,0.151367,0.0,0.0,44.266931,0.0,0.0
3,038af19179925da21a25619c5a24b745,2015-04-01,0.149626,0.0,0.0,44.266931,0.0,0.0
4,038af19179925da21a25619c5a24b745,2015-05-01,0.149626,0.0,0.0,44.266931,0.0,0.0


In [91]:
# Group off-peak prices by companies and month
monthly_price_by_id = price_df.groupby(['id', 'price_date']).agg({'price_off_peak_var': 'mean', 'price_off_peak_fix': 'mean'}).reset_index()

# Get january and december prices
jan_prices = monthly_price_by_id.groupby('id').first().reset_index()
dec_prices = monthly_price_by_id.groupby('id').last().reset_index()

# Calculate the difference
diff = pd.merge(dec_prices.rename(columns={'price_off_peak_var': 'dec_1', 'price_off_peak_fix': 'dec_2'}), jan_prices.drop(columns='price_date'), on='id')
diff['offpeak_diff_dec_january_energy'] = diff['dec_1'] - diff['price_off_peak_var']
diff['offpeak_diff_dec_january_power'] = diff['dec_2'] - diff['price_off_peak_fix']
diff = diff[['id', 'offpeak_diff_dec_january_energy','offpeak_diff_dec_january_power']]
diff.head()

Unnamed: 0,id,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916
4,00114d74e963e47177db89bc70108537,-0.003994,-1e-06


In [92]:
diff = pd.merge(diff, df[['id', 'churn']], on='id', how='inner')
diff.head()

Unnamed: 0,id,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power,churn
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916,0
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779,0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5,0
3,00114d74e963e47177db89bc70108537,-0.003994,-1e-06,0
4,0013f326a839a2f6ad87a1859952d227,-0.006171,0.0,0


In [93]:
fig = px.scatter(
    diff,
    x = ['offpeak_diff_dec_january_energy', 'offpeak_diff_dec_january_power'],
    y = 'churn',
    title='<b>Difference between off-peak prices in December and preceding January<b>',
)

fig.update_layout(
    xaxis_title='Consumption Difference',
    yaxis_title='Churn',
    legend=dict(x=0, y=1, bgcolor='rgba(0, 0, 0, 0)', title='Consumption'),
    annotations=[
    dict(
        x=0,
        y=1.1,
        xref='paper',
        yref='paper',
        text='No Apparent Relationship with Churn',
        showarrow=False,
        font=dict(size=16)
      )
    ],
)

fig.for_each_trace(
    lambda t: t.update(name='Energy') if t.name == 'offpeak_diff_dec_january_energy'
    else t.update(name='Power')
)


fig.show()

In [94]:
df = pd.merge(df, diff[['id', 'offpeak_diff_dec_january_energy', 'offpeak_diff_dec_january_power']], on='id', how='inner')
cp_df = pd.merge(cp_df, diff[['id', 'offpeak_diff_dec_january_energy', 'offpeak_diff_dec_january_power']], on='id', how='inner')
df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,var_6m_price_mid_peak_var,var_6m_price_off_peak_fix,var_6m_price_peak_fix,var_6m_price_mid_peak_fix,var_6m_price_off_peak,var_6m_price_peak,var_6m_price_mid_peak,churn,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power
0,24011ae4ebbe3035111d65fa7c15bc57,0,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,0.0009084737,2.086294,99.530517,44.235794,2.086425,99.53056,44.2367,1,0.020057,3.700961
1,d29c2c54acc38ff3c0614d0a653813dd,-1,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,0.0,0.009482,0.0,0.0,0.009485,0.001217891,0.0,0,-0.003767,0.177779
2,764c75f661154dac3a6c254cd082ea7d,0,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,0.0,0.0,0.0,0.0,4e-06,9.45015e-08,0.0,0,-0.00467,0.177779
3,bba03439a292a1e166f80264c16191cb,1,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,...,0.0,0.0,0.0,0.0,3e-06,0.0,0.0,0,-0.004547,0.177779
4,149d57cf92fc41cf94415803a877cb4b,-1,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,...,4.86e-10,0.0,0.0,0.0,1.1e-05,2.89676e-06,4.86e-10,0,-0.006192,0.162916


In the previous notebook, I tried to think of possible features that can be used to predict churn to no avail. The suggestion above from Estelle sounded clever, but really doesn't seem like it will have any predictive power.

I am thus going to throw all of these variables in a Random Forest classifier and to see if it will be able to identify any important features. This is another attempt at feature selection as the random forest is capable of identifying nonlinear relationships.

# 4. Baseline Model

## Model Helpers

In [95]:
def basic_rf_eval(model, test_data, true_y):
  pred_y = model.predict(test_data)

  accuracy = accuracy_score(true_y, pred_y)
  confusion_matrix_ = confusion_matrix(true_y, pred_y)
  classification_report_ = classification_report(true_y, pred_y)

  print(f'Accuracy = {accuracy.round(2)};\n\
    \nConfusion Matrix: \n{confusion_matrix_};\n\
    \nClassification Report:\n{classification_report_}'
  )

In [96]:
def create_feature_importance_df(rf_model):
  feature_list = list(X.columns)
  importances = list(rf_model.feature_importances_)

  feature_importances = [
      (feature, round(importances, 2)) for feature, importances in zip(feature_list, importances)
  ]

  importance_df = pd.DataFrame(feature_importances, columns=['Variable', 'Importance'])

  return importance_df

In [97]:
def view_feature_importances(importance_df):
  fig = px.bar(
      importance_df,
      x='Variable',
      y='Importance',
      labels={'Variable': 'Variable', 'Importance': 'Importance'},
      title='Variable Importances',
      orientation='h'
  )

  fig.update_layout(
      xaxis={'categoryorder': 'total descending'},
      yaxis_title='Importances',
      xaxis_title='Variables',
      title={'x': 0.5}
  )

  fig.show()

## Iterating on Models

In [102]:
X = cp_df.drop(columns=['id', 'churn'])
y = cp_df['churn']

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
baseline_rf_model = RandomForestClassifier(random_state=42)
baseline_rf_model.fit(X_train, y_train)

In [104]:
basic_rf_eval(baseline_rf_model, X_test, y_test)

Accuracy = 0.9;
    
Confusion Matrix: 
[[2614    3]
 [ 287   18]];
    
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      2617
           1       0.86      0.06      0.11       305

    accuracy                           0.90      2922
   macro avg       0.88      0.53      0.53      2922
weighted avg       0.90      0.90      0.86      2922



In [105]:
baseline_importance_df = create_feature_importance_df(baseline_rf_model)
view_feature_importances(baseline_importance_df)

This is a very bad model as it has extremely low sensitivity to the actual class we're interested in i.e. class 1(Churn). I want to assume that the imbalanced nature of the data is one of the major reasons why. Intuitively, I think a sampling strategy can work. There's also weighting classes and using the **BalancedRandomForestClassifier**(both of which I trust less) that I'm not sure of how it works. Let's play around and see what we get.

If I can achieve reasonable improve by just samples, then I'd be very happy. To improve the model further, I think feature engineering and hyperparameter tuning will help. In fact, I believe the order should be: feature engineering(which I tried in the previous notebook), deal with the balance of the dataset then finally tune the hyperparameters.

In [None]:
balanced_baseline = BalancedRandomForestClassifier(random_state=42)
balanced_baseline.fit(X_train, y_train)

In [None]:
basic_rf_eval(balanced_baseline, X_test, y_test)

Accuracy = 0.65;
    
Confusion Matrix: 
[[1744  873]
 [ 138  167]];
    
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.67      0.78      2617
           1       0.16      0.55      0.25       305

    accuracy                           0.65      2922
   macro avg       0.54      0.61      0.51      2922
weighted avg       0.85      0.65      0.72      2922



Now we're fighting the battle of tilting the scales... The same way the recall for Churn has improved is the same way that we've increase Type I and II errors😆😆😆😆😆

In [None]:
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)
undersampled_baseline = RandomForestClassifier(random_state=42)
undersampled_baseline.fit(X_resampled, y_resampled)
basic_rf_eval(undersampled_baseline, X_resampled, y_resampled)

Accuracy = 1.0;
    
Confusion Matrix: 
[[1419    0]
 [   0 1419]];
    
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1419
           1       1.00      1.00      1.00      1419

    accuracy                           1.00      2838
   macro avg       1.00      1.00      1.00      2838
weighted avg       1.00      1.00      1.00      2838



100% everything??? Kanjani? Just how??? 😆😆😆😆😆

Perhaps the sampling strategy was wrong, or the model is just overfit. No ways this is good model.

In [None]:
undersampled_importance_df = create_feature_importance_df(undersampled_baseline)
view_feature_importances(undersampled_importance_df)

**GOODNESS GRACIOUS, NO FEATURE IS USEFUL** This is no surprise though, we couldn't get anything useful out of a fawl model

# 3. Back To Feature Engineering

## Average price changes across periods

We can now enhance the feature that our colleague made by calculating the average price changes across individual periods, instead of the entire year.

In [109]:
price_cols = ['price_off_peak_var', 'price_peak_var', 'price_mid_peak_var',
              'price_off_peak_fix', 'price_peak_fix', 'price_mid_peak_fix']

mean_prices = price_df.groupby('id')[price_cols].mean().reset_index()

column_pairs = {
    'off_peak_peak_var_mean_diff': ['price_off_peak_var', 'price_peak_var'],
    'peak_mid_peak_var_mean_diff': ['price_peak_var', 'price_mid_peak_var'],
    'off_peak_mid_peak_var_mean_diff': ['price_off_peak_var', 'price_mid_peak_var'],
    'off_peak_peak_fix_mean_diff': ['price_off_peak_fix', 'price_peak_fix'],
    'peak_mid_peak_fix_mean_diff': ['price_peak_fix', 'price_mid_peak_fix'],
    'off_peak_mid_peak_fix_mean_diff': ['price_off_peak_fix', 'price_mid_peak_fix']
}

for new_col, col_pair in column_pairs.items():
    mean_prices[new_col] = mean_prices[col_pair[0]] - mean_prices[col_pair[1]]


In [110]:
mean_prices.head()

Unnamed: 0,id,price_off_peak_var,price_peak_var,price_mid_peak_var,price_off_peak_fix,price_peak_fix,price_mid_peak_fix,off_peak_peak_var_mean_diff,peak_mid_peak_var_mean_diff,off_peak_mid_peak_var_mean_diff,off_peak_peak_fix_mean_diff,peak_mid_peak_fix_mean_diff,off_peak_mid_peak_fix_mean_diff
0,0002203ffbb812588b632b9e628cc38d,0.124338,0.103794,0.07316,40.701732,24.421038,16.280694,0.020545,0.030633,0.051178,16.280694,8.140345,24.421038
1,0004351ebdd665e6ee664792efc4fd13,0.146426,0.0,0.0,44.38545,0.0,0.0,0.146426,0.0,0.146426,44.38545,0.0,44.38545
2,0010bcc39e42b3c2131ed2ce55246e3c,0.181558,0.0,0.0,45.31971,0.0,0.0,0.181558,0.0,0.181558,45.31971,0.0,45.31971
3,0010ee3855fdea87602a5b7aba8e42de,0.118757,0.098292,0.069032,40.647427,24.388455,16.258971,0.020465,0.02926,0.049725,16.258972,8.129484,24.388456
4,00114d74e963e47177db89bc70108537,0.147926,0.0,0.0,44.26693,0.0,0.0,0.147926,0.0,0.147926,44.26693,0.0,44.26693


# 4. Hyperparameter Tuning