In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import warnings
warnings.simplefilter('ignore')

In [4]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

### Read the dataset for churn 

In [5]:
ds = pd.read_csv("/content/drive/MyDrive/НУЛП АСУ /смарт-системи/Інформаційні технології Смарт систем/практичні  роботи/example/data/train.csv")

## Data engineering 

#### Missing data imputation

In [6]:
mean_impute_columns = ['MonthlyRevenue',
 'MonthlyMinutes',
 'TotalRecurringCharge',
 'DirectorAssistedCalls',
 'OverageMinutes',
 'RoamingCalls',
 'PercChangeMinutes',
 'PercChangeRevenues']
mode_impute_columns = [
 'ServiceArea',
 'Handsets',
 'HandsetModels',
 'CurrentEquipmentDays',
 'AgeHH1',
 'AgeHH2']

In [7]:
def impute_na(df, variable, value):

    return df[variable].fillna(value)

In [8]:
# Let's create a dict of mean values

mean_impute_values = dict()
for column in mean_impute_columns:
    mean_impute_values[column] = ds[column].mean()
print(mean_impute_values)

{'MonthlyRevenue': 58.834492346387385, 'MonthlyMinutes': 525.6534161246586, 'TotalRecurringCharge': 46.830087834784145, 'DirectorAssistedCalls': 0.895229018883496, 'OverageMinutes': 40.02778487355328, 'RoamingCalls': 1.2362441296103437, 'PercChangeMinutes': -11.547908445146014, 'PercChangeRevenues': -1.19198500394633}


In [9]:
for column in mean_impute_columns:
    ds[column] = impute_na(ds, column, mean_impute_values[column])

In [10]:
# Let's create a dict of mode values

mode_impute_values = dict()
for column in mode_impute_columns:
    mode_impute_values[column] = ds[column].mode()[0]
print(mode_impute_values)

{'ServiceArea': 'NYCBRO917', 'Handsets': 1.0, 'HandsetModels': 1.0, 'CurrentEquipmentDays': 202.0, 'AgeHH1': 0.0, 'AgeHH2': 0.0}


In [11]:
for column in mode_impute_columns:
    ds[column] = impute_na(ds, column, mode_impute_values[column])

##### Outlier Engineering

In [12]:
outlier_columns = ['MonthlyRevenue', 'MonthlyMinutes', 'TotalRecurringCharge', 
                   'DirectorAssistedCalls', 'OverageMinutes', 'RoamingCalls', 
                   'PercChangeMinutes', 'PercChangeRevenues', 'DroppedCalls', 
                   'BlockedCalls', 'UnansweredCalls', 'CustomerCareCalls', 
                   'ThreewayCalls', 'ReceivedCalls', 'OutboundCalls', 
                   'InboundCalls', 'PeakCallsInOut', 'OffPeakCallsInOut', 
                   'DroppedBlockedCalls', 'CallForwardingCalls', 'CallWaitingCalls', 
                   'MonthsInService', 'UniqueSubs', 'ActiveSubs']


In [13]:
ds = ds[(np.abs(stats.zscore(ds[outlier_columns])) < 4).all(axis=1)]

##### Categorical encoding

In [14]:
cat_columns = ['Churn',
 'ChildrenInHH',
 'HandsetRefurbished',
 'HandsetWebCapable',
 'TruckOwner',
 'RVOwner',
 'Homeownership',
 'BuysViaMailOrder',
 'RespondsToMailOffers',
 'OptOutMailings',
 'NonUSTravel',
 'OwnsComputer',
 'HasCreditCard',
 'NewCellphoneUser',
 'NotNewCellphoneUser',
 'OwnsMotorcycle',
 'MadeCallToRetentionTeam',
 'MaritalStatus',
 'ServiceArea', 'HandsetPrice','CreditRating', 'PrizmCode', 'Occupation',]

In [15]:
map_dicts = dict()
for column in cat_columns:
    ds[column] = ds[column].astype('category')
    map_dicts[column] = dict(zip(ds[column], ds[column].cat.codes))
    ds[column] = ds[column].cat.codes
    

In [16]:
display(ds.sample(15))

Unnamed: 0,CustomerID,Churn,MonthlyRevenue,MonthlyMinutes,TotalRecurringCharge,DirectorAssistedCalls,OverageMinutes,RoamingCalls,PercChangeMinutes,PercChangeRevenues,DroppedCalls,BlockedCalls,UnansweredCalls,CustomerCareCalls,ThreewayCalls,ReceivedCalls,OutboundCalls,InboundCalls,PeakCallsInOut,OffPeakCallsInOut,DroppedBlockedCalls,CallForwardingCalls,CallWaitingCalls,MonthsInService,UniqueSubs,ActiveSubs,ServiceArea,Handsets,HandsetModels,CurrentEquipmentDays,AgeHH1,AgeHH2,ChildrenInHH,HandsetRefurbished,HandsetWebCapable,TruckOwner,RVOwner,Homeownership,BuysViaMailOrder,RespondsToMailOffers,OptOutMailings,NonUSTravel,OwnsComputer,HasCreditCard,RetentionCalls,RetentionOffersAccepted,NewCellphoneUser,NotNewCellphoneUser,ReferralsMadeBySubscriber,IncomeGroup,OwnsMotorcycle,AdjustmentsToCreditRating,HandsetPrice,MadeCallToRetentionTeam,CreditRating,PrizmCode,Occupation,MaritalStatus
3865,3030166,0,103.84,555.0,85.0,0.0,52.0,5.9,-136.0,-20.2,6.3,0.0,45.3,0.3,0.0,158.7,33.7,25.0,136.0,109.7,6.3,0.0,2.0,33,2,2,149,6.0,5.0,100.0,34.0,32.0,0,1,1,1,1,0,1,1,0,0,1,1,0,0,0,1,0,6,0,0,3,0,0,0,3,2
28842,3229070,1,21.83,676.0,20.0,2.23,61.0,0.5,-674.0,-11.8,10.7,0.7,86.0,3.3,0.0,273.0,163.3,70.3,407.0,112.7,11.3,0.0,3.0,17,3,2,597,2.0,2.0,496.0,34.0,34.0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,6,0,0,8,0,0,1,4,2
33719,3268662,1,49.99,168.0,50.0,0.0,0.0,0.0,-41.0,0.0,3.7,3.7,10.0,0.0,0.0,20.6,5.3,0.3,34.3,13.3,7.3,0.0,0.7,13,1,1,140,2.0,2.0,128.0,0.0,0.0,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,9,0,0,2,0,1,3,4,1
17092,3133834,0,40.32,344.0,30.0,0.5,26.0,0.0,-30.0,-9.3,7.0,0.7,16.3,2.3,0.7,50.3,7.0,0.3,75.3,20.7,7.7,0.0,0.0,18,1,1,35,1.0,1.0,557.0,0.0,0.0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,6,0,0,15,0,6,2,3,1
3802,3029742,1,26.76,310.0,34.0,0.0,36.0,1.4,-16.0,-14.6,1.3,7.3,6.0,7.7,0.0,81.8,1.3,0.3,105.0,20.3,8.7,0.0,0.0,37,2,2,320,1.0,1.0,1102.0,36.0,36.0,1,0,0,1,0,0,1,1,0,0,1,1,0,0,0,1,0,7,0,0,15,0,0,3,4,2
33550,3267330,1,91.47,1880.0,62.0,8.66,64.0,0.0,-574.0,60.0,40.3,6.3,77.0,8.3,3.0,704.1,75.7,26.0,187.0,271.3,46.7,0.0,14.3,13,1,1,348,1.0,1.0,388.0,32.0,0.0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,3,2,3,1
9932,3078314,0,42.55,855.0,42.0,0.5,9.0,0.0,-1.0,4.2,1.7,8.0,48.7,5.0,0.0,154.1,26.0,0.7,151.7,80.7,9.7,0.0,0.7,29,1,1,325,2.0,2.0,748.0,42.0,0.0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,6,0,0,8,0,1,0,3,0
2958,3023046,0,63.45,585.0,60.0,3.46,0.0,0.0,-85.0,3.5,1.7,5.7,45.7,0.0,0.3,128.8,12.7,0.3,122.7,123.7,7.3,0.0,5.3,36,2,1,128,4.0,4.0,187.0,56.0,34.0,1,0,1,1,0,0,1,1,0,0,0,1,0,0,1,0,0,6,0,1,13,0,2,2,3,1
26422,3208790,1,56.59,437.0,50.0,0.0,22.0,0.0,-37.0,-6.6,3.0,3.3,12.7,0.0,0.0,111.0,20.0,5.3,78.3,54.0,6.3,0.0,2.7,16,1,1,57,1.0,1.0,481.0,28.0,0.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,6,0,0,15,0,1,2,3,2
38084,3303602,0,70.96,352.0,55.0,0.25,74.0,0.0,-244.0,-26.0,38.3,7.3,11.7,2.7,0.0,54.8,43.7,4.0,59.7,28.0,45.7,0.0,1.0,14,1,1,679,1.0,1.0,395.0,78.0,0.0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,4,0,0,15,0,6,0,4,0


# Model tuning


https://en.wikipedia.org/wiki/Hyperparameter_optimization 

#### Parameters vs Hyperparameters
Let’s now define what are hyperparameters, but before doing that let’s consider the difference between a parameter and a hyperparameter.

A parameter can be considered to be intrinsic or internal to the model and can be obtained after the model has learned from the data. 
Examples of parameters are regression coefficients in linear regression, support vectors in support vector machines and weights in neural networks.

A hyperparameter can be considered to be extrinsic or external to the model and can be set arbitrarily by the practitioner. 
Examples of hyperparameters include the k in k-nearest neighbors, number of trees and maximum number of features in random forest, learning rate and momentum in neural networks, the C and gamma parameters in support vector machines.

#### Hyperparameter tuning
As there are no universal best hyperparameters to use for any given problem, hyperparameters are typically set to default values. However, the optimal set of hyperparameters can be obtained from manual empirical (trial-and-error) hyperparameter search or in an automated fashion via the use of optimization algorithm to maximize the fitness function.

Two common hyperparameter tuning methods include grid search and random search. As the name implies, a grid search entails the creation of a grid of possible hyperparameter values whereby models are iteratively built for all of these hyperparameter combinations in a brute force manner. In a random search, not all hyperparameter combinations are used, but instead each iteration makes use of a random hyperparameter combination.

#### Define target and features columns

In [17]:
y_column = ['Churn'] # target variable
X_columns = ['MonthlyRevenue', 'MonthlyMinutes',
       'TotalRecurringCharge', 'DirectorAssistedCalls', 'OverageMinutes',
       'RoamingCalls', 'PercChangeMinutes', 'PercChangeRevenues',
       'DroppedCalls', 'BlockedCalls', 'UnansweredCalls', 'CustomerCareCalls',
       'ThreewayCalls', 'ReceivedCalls', 'OutboundCalls', 'InboundCalls',
       'PeakCallsInOut', 'OffPeakCallsInOut', 'DroppedBlockedCalls',
       'CallForwardingCalls', 'CallWaitingCalls', 'MonthsInService',
       'UniqueSubs', 'ActiveSubs', 'ServiceArea', 'Handsets', 'HandsetModels',
       'CurrentEquipmentDays', 'AgeHH1', 'AgeHH2', 'ChildrenInHH',
       'HandsetRefurbished', 'HandsetWebCapable', 'TruckOwner', 'RVOwner',
       'Homeownership', 'BuysViaMailOrder', 'RespondsToMailOffers',
       'OptOutMailings', 'NonUSTravel', 'OwnsComputer', 'HasCreditCard',
       'RetentionCalls', 'RetentionOffersAccepted', 'NewCellphoneUser',
       'NotNewCellphoneUser', 'ReferralsMadeBySubscriber', 'IncomeGroup',
       'OwnsMotorcycle', 'AdjustmentsToCreditRating', 'HandsetPrice',
       'MadeCallToRetentionTeam', 'CreditRating', 'PrizmCode', 'Occupation',
       'MaritalStatus']
X = ds[X_columns]
y = ds[y_column]

In [18]:
# Let's say we want to split the data in 80:10:10 for train:valid:test dataset
# In the first step we will split the data in training and remaining dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(37354, 56)
(37354, 1)
(9339, 56)
(9339, 1)


#### Building a Baseline Random Forest Model
Here, we will first start by building a baseline random forest model that will serve as a baseline for comparative purpose with the model using the optimal set of hyperparameters.
For the baseline model, we will set an arbitrary number for the 2 hyperparameters (e.g. n_estimators and max_features) that we will also use in the next section for hyperparameter tuning.

In [19]:
rf = RandomForestClassifier(max_features=5, n_estimators=100)

In [20]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [21]:
print('test set metrics: ', metrics.classification_report(y_test, y_pred))

test set metrics:                precision    recall  f1-score   support

           0       0.72      0.98      0.83      6633
           1       0.60      0.07      0.12      2706

    accuracy                           0.72      9339
   macro avg       0.66      0.52      0.48      9339
weighted avg       0.69      0.72      0.63      9339



In [22]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 5,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

#### Hyperparameter Tuning
Now we will be performing the tuning of hyperparameters of the random forest model. 

n_estimators = number of trees in the foreset

max_features = max number of features considered for splitting a node

max_depth = max number of levels in each decision tree

min_samples_split = min number of data points placed in a node before the node is split

min_samples_leaf = min number of data points allowed in a leaf node

bootstrap = method for sampling data points (with or without replacement)


https://towardsdatascience.com/hyperparameter-tuning-always-tune-your-models-7db7aeaf47e9

In [23]:
%%time
# Create the random grid
param_grid = {'n_estimators': np.arange(25,55,10),
               'max_features': [0.5, 0.6, 0.8],
               'min_samples_split': [10,15],
               'min_samples_leaf': [3,4],
               'bootstrap': [False]}

# print(random_grid)


rf = RandomForestClassifier()

grid = GridSearchCV(estimator=rf, 
                    param_grid=param_grid, 
                    scoring='precision', 
                    cv=5,
                    verbose = 3,
                    return_train_score=True)

grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=1.000, test=0.497) total time=  11.4s
[CV 2/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=1.000, test=0.511) total time=  11.4s
[CV 3/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=1.000, test=0.490) total time=  12.1s
[CV 4/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=1.000, test=0.503) total time=  11.7s
[CV 5/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=25;, score=(train=1.000, test=0.500) total time=  12.1s
[CV 1/5] END bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=10, n_estimators=35;, score=(train=1.000, test=0.521) tot

In [24]:
grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),
                          pd.DataFrame(grid.cv_results_["mean_test_score"], 
                          columns=["precision"])],
                          axis=1)

grid_results

Unnamed: 0,bootstrap,max_features,min_samples_leaf,min_samples_split,n_estimators,precision
0,False,0.5,3,10,25,0.500409
1,False,0.5,3,10,35,0.509282
2,False,0.5,3,10,45,0.523058
3,False,0.5,3,15,25,0.49974
4,False,0.5,3,15,35,0.499457
5,False,0.5,3,15,45,0.516006
6,False,0.5,4,10,25,0.49844
7,False,0.5,4,10,35,0.516017
8,False,0.5,4,10,45,0.5179
9,False,0.5,4,15,25,0.511928


In [25]:
grid_results.columns

Index(['bootstrap', 'max_features', 'min_samples_leaf', 'min_samples_split',
       'n_estimators', 'precision'],
      dtype='object')

In [27]:
grid_contour = grid_results.groupby([  'max_features', 'min_samples_leaf',
       'min_samples_split', 'n_estimators']).mean()
grid_contour

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,bootstrap,precision
max_features,min_samples_leaf,min_samples_split,n_estimators,Unnamed: 4_level_1,Unnamed: 5_level_1
0.5,3,10,25,0.0,0.500409
0.5,3,10,35,0.0,0.509282
0.5,3,10,45,0.0,0.523058
0.5,3,15,25,0.0,0.49974
0.5,3,15,35,0.0,0.499457
0.5,3,15,45,0.0,0.516006
0.5,4,10,25,0.0,0.49844
0.5,4,10,35,0.0,0.516017
0.5,4,10,45,0.0,0.5179
0.5,4,15,25,0.0,0.511928
