# Random forest classifier and tuning hyperparameters

In [27]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

%matplotlib inline

# Show every column when displaying wide DataFrames.
pd.set_option('display.max_columns', None)
import warnings
# NOTE(review): this blanket filter silences ALL warning categories for the rest
# of the notebook, not only FutureWarning, so deprecation and data-conversion
# warnings raised by later cells are hidden.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
# Display the value of every expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.model_selection import train_test_split
import numpy as np


In [2]:
# Load the raw customer-churn table; the path is relative to the notebook's working directory.
churnData = pd.read_csv('DATA_Customer-Churn.csv')

In [28]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [4]:
churnData.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [6]:
# Convert TotalCharges to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [7]:
churnData['TotalCharges']

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

In [8]:
# Keep only the numeric columns; these become the feature matrix X later on.
# This snapshot is taken before churnData is mean-filled, so any NaNs produced
# by the numeric conversion above are still present in numeric_df.
numeric_df= churnData.select_dtypes(np.number)
numeric_df

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.50
2,0,2,53.85,108.15
3,0,45,42.30,1840.75
4,0,2,70.70,151.65
...,...,...,...,...
7038,0,24,84.80,1990.50
7039,0,72,103.20,7362.90
7040,0,11,29.60,346.45
7041,1,4,74.40,306.60


### Check whether there are any NaNs

In [9]:
churnData.isnull().values.any()

True

### Filling NaNs with the column means

In [10]:
# Impute missing values with the per-column mean.
# numeric_only=True restricts the mean to the numeric columns: churnData still
# holds object columns (gender, Partner, ...), and pandas >= 2.0 raises a
# TypeError when asked to average those. Only numeric columns receive a fill
# value, so non-numeric columns are left untouched.
churnData = churnData.fillna(churnData.mean(numeric_only=True))

In [11]:
churnData.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [14]:
# NOTE(review): X is first assigned in a later cell (X = numeric_df). This cell
# was executed out of order (In [14]); on a fresh top-to-bottom run it raises
# NameError unless X is defined elsewhere. It belongs after the cell that
# mean-fills X.
X.isna().any()

SeniorCitizen     False
tenure            False
MonthlyCharges    False
TotalCharges      False
dtype: bool

### Looking into the 4 columns: tenure, SeniorCitizen, MonthlyCharges, TotalCharges 

In [12]:
X = numeric_df


In [13]:
X = X.fillna(X.mean())

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [17]:
y = churnData['Churn']

In [18]:
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [19]:
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [15]:
# Display-only preview: the rounded frame is not assigned, so X itself is unchanged.
X.round()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,30.0,30.0
1,0,34,57.0,1890.0
2,0,2,54.0,108.0
3,0,45,42.0,1841.0
4,0,2,71.0,152.0
...,...,...,...,...
7038,0,24,85.0,1990.0
7039,0,72,103.0,7363.0
7040,0,11,30.0,346.0
7041,1,4,74.0,307.0


In [20]:
# The target variable is Churn, a binary label with the values "Yes" and "No".
# Check the relative frequency of each label to see how imbalanced the classes are.
y.value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

### Data checks for down- or upsampling

In [34]:
# Number of rows in the feature matrix at this point in the notebook.
# NOTE(review): the saved output (10348) equals the post-SMOTE row count shown
# further down, and this cell's execution count (In [34]) is later than the
# SMOTE cells, so the recorded output is stale. On a fresh top-to-bottom run
# this should show the original row count instead.
numrows = X.shape[0] 
numrows

10348

In [22]:
y.value_counts() #checking the y values, No: 5174, Yes: 1869 
#therefore I will upsample because the data is unbalanced

No     5174
Yes    1869
Name: Churn, dtype: int64

In [23]:
from imblearn.over_sampling import SMOTE

# Seed SMOTE so the synthetic minority samples are identical on every run.
# The literal equals the value assigned to RAND_STATE further down; that
# constant is not yet defined at this point in the notebook.
# NOTE(review): the resampling is applied to the full data set before the
# train/test split, so synthetic rows end up in the test set and the reported
# test scores are optimistic. Consider splitting first and resampling only the
# training portion.
smote = SMOTE(random_state=1230)

In [24]:
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [25]:
y = y_sm
X = X_sm

In [35]:
# Confirm that SMOTE worked: the row count should now be twice the size of the
# original majority class, because the minority class was upsampled to match it.
numrows = X.shape[0] 
numrows

10348

### Train/test splitting 

In [30]:
TT_SPLIT = 0.2     # fraction of the rows held out as the test set (passed as test_size)
RAND_STATE = 1230  # random seed reused for the split and the models, for repeatable results

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_SPLIT, random_state=RAND_STATE)


### Random forest

In [36]:
# Defensive check: drop any training rows that still contain NaNs.
# X was mean-filled earlier, so na_idx is normally empty and this is a no-op.
na_idx = X_train[X_train.isna().any(axis=1)].index
X_train = X_train.drop(na_idx)
# Keep y_train as a one-dimensional Series. Wrapping it in a DataFrame turns the
# labels into a column vector, which scikit-learn estimators only accept with a
# DataConversionWarning. Series.drop removes the same index labels.
y_train = y_train.drop(na_idx)

In [38]:
def down_samp_rand(Xin, yin, ratio=1):
    """Downsample the majority class using random sampling.

    Parameters
    ----------
    Xin
        Feature matrix handed to ``RandomUnderSampler.fit_resample``.
    yin
        Class labels aligned with ``Xin``.
    ratio
        Forwarded as ``sampling_strategy``: the ratio of the minority class to
        the downsampled majority class. The default of 1 yields balanced classes.

    Returns
    -------
    tuple
        ``(X_rus, y_rus)``, the resampled features and labels.

    Notes
    -----
    The sampler is seeded with the notebook-level ``RAND_STATE`` constant, which
    must be defined before this function is called.
    """
    # Imported lazily so that defining the function does not require imblearn.
    from imblearn.under_sampling import RandomUnderSampler

    rus = RandomUnderSampler(sampling_strategy=ratio, random_state=RAND_STATE)
    X_rus, y_rus = rus.fit_resample(Xin, yin)
    return X_rus, y_rus

In [39]:
X_train, y_train = down_samp_rand(X_train,y_train)

In [40]:
# check that we have downsampled
y_train.value_counts()

Churn
No       4136
Yes      4136
dtype: int64

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Settings for the baseline forest; max_depth and min_samples_leaf cap the size
# of each tree, and the seed comes from the notebook-level RAND_STATE.
rfc_ops = dict(
    max_depth=6,
    min_samples_leaf=20,
    max_features=None,
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=RAND_STATE,
)

clf = RandomForestClassifier(**rfc_ops)
clf.fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on versus the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("train prediction accuracy score: %.2f" % train_accuracy)
print("test prediction accuracy score: %.2f" % test_accuracy)

RandomForestClassifier(max_depth=6, max_features=None, min_samples_leaf=20,
                       oob_score=True, random_state=1230)

train prediction accuracy score: 0.77
test prediction accuracy score: 0.76


In [42]:
from sklearn.metrics import accuracy_score
score_ds = accuracy_score(y_test,clf.predict(X_test))

### Cross validation

In [43]:
from sklearn.model_selection import cross_val_score
folds=5
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=folds)


In [44]:
print("cv scores over {:d} iterations: \n".format(folds))
cross_val_scores

cv scores over 5 iterations: 



array([0.76858006, 0.74864048, 0.76481258, 0.74909311, 0.74607013])

In [45]:
print("the std. dev. in the cv scores is {:.4f}".format(np.std(cross_val_scores)))

the std. dev. in the cv scores is 0.0093


### Hyperparameter tuning

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search space for the grid search: 3 x 2 x 2 x 1 = 12 parameter combinations.
param_grid = {
    'n_estimators': [50, 100,500],
    'min_samples_split': [2, 4],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt']
    # Options currently left out of the search. If max_samples is re-enabled,
    # use the None object rather than the string 'None'.
    ##'max_samples' : ['None', 0.5],
    ##'max_depth':[3,5,10],
    ## 'bootstrap':[True,False]
    }
# Fresh, untuned estimator for the grid search; this replaces the baseline
# model that was bound to clf earlier. The next cell repeats this assignment.
clf = RandomForestClassifier(random_state=RAND_STATE)

In [50]:
clf = RandomForestClassifier(random_state=RAND_STATE)

In [52]:
grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1,)

In [53]:
# Run the grid search on the training data; .values.ravel() hands the labels
# to scikit-learn as a flat one-dimensional array.
grid_search.fit(X_train,y_train.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1230),
             n_jobs=-1,
             param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 4],
                         'n_estimators': [50, 100, 500]},
             return_train_score=True)

In [54]:
best_params = grid_search.best_params_ #To check the best set of parameters returned
best_params

{'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [55]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.828398,0.016957,0.042079,0.001963,sqrt,1,2,50,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.767372,0.762538,0.767836,0.767231,0.744256,0.761846,0.009003,11,0.992897,0.992595,0.991992,0.991992,0.992898,0.992475,0.00041
1,1.985337,0.311664,0.11267,0.016906,sqrt,1,2,100,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.769184,0.762538,0.775695,0.763603,0.75393,0.76499,0.007245,7,0.99335,0.993048,0.992747,0.992747,0.993351,0.993049,0.00027
2,12.255352,0.385681,0.539477,0.029024,sqrt,1,2,500,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.770393,0.76435,0.773881,0.76179,0.754534,0.76499,0.006756,8,0.99335,0.993048,0.992747,0.992747,0.993503,0.993079,0.000308
3,1.17112,0.068295,0.061403,0.00972,sqrt,1,4,50,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.77281,0.755891,0.76844,0.754534,0.755139,0.761363,0.0077,12,0.977633,0.980203,0.978997,0.980508,0.979299,0.979328,0.001014
4,2.381772,0.077427,0.114313,0.008099,sqrt,1,4,100,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.77281,0.762538,0.770859,0.758162,0.75393,0.76366,0.007235,10,0.981865,0.98126,0.981565,0.982472,0.982623,0.981957,0.000521
5,11.797309,0.587334,0.543153,0.058237,sqrt,1,4,500,"{'max_features': 'sqrt', 'min_samples_leaf': 1...",0.774018,0.761934,0.774486,0.758162,0.75393,0.764506,0.008352,9,0.985794,0.985945,0.984739,0.98625,0.986703,0.985886,0.000652
6,1.134703,0.058434,0.06032,0.015847,sqrt,2,2,50,"{'max_features': 'sqrt', 'min_samples_leaf': 2...",0.778248,0.766163,0.777509,0.76179,0.762999,0.769342,0.007119,5,0.949071,0.944839,0.945905,0.947567,0.949532,0.947383,0.001797
7,2.118147,0.221042,0.106634,0.01053,sqrt,2,2,100,"{'max_features': 'sqrt', 'min_samples_leaf': 2...",0.782477,0.770997,0.776904,0.763603,0.766022,0.772001,0.006944,1,0.951186,0.946048,0.948172,0.948474,0.949985,0.948773,0.001742
8,11.911788,0.587064,0.560589,0.116638,sqrt,2,2,500,"{'max_features': 'sqrt', 'min_samples_leaf': 2...",0.779456,0.766163,0.778718,0.766626,0.762394,0.770672,0.00703,3,0.950431,0.949524,0.950287,0.951496,0.950892,0.950526,0.000655
9,1.248974,0.174152,0.070482,0.006989,sqrt,2,4,50,"{'max_features': 'sqrt', 'min_samples_leaf': 2...",0.778248,0.766163,0.777509,0.76179,0.762999,0.769342,0.007119,5,0.949071,0.944839,0.945905,0.947567,0.949532,0.947383,0.001797


In [56]:
from sklearn.model_selection import cross_val_score
# Rebuild the classifier with the best parameter set found by the grid search
# and re-score it with 5-fold cross-validation on the training data.
clf = RandomForestClassifier(random_state=RAND_STATE, **best_params)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
# Mean accuracy across the five folds.
print(np.mean(cross_val_scores))

0.7720007890785681


In [57]:
clf.fit(X_train, y_train)

RandomForestClassifier(max_features='sqrt', min_samples_leaf=2,
                       random_state=1230)

In [58]:
len(X_train.columns)

4

In [59]:
# Column labels of the training frame as a plain Python list.
feature_names = list(X_train.columns)

In [60]:
# Pair each feature name with its importance score from the fitted forest,
# then display the table with the most important feature first.
df = pd.DataFrame({
    'columns_name': feature_names,
    'score_feature_importance': clf.feature_importances_,
})
df.sort_values(by='score_feature_importance', ascending=False)

Unnamed: 0,columns_name,score_feature_importance
2,MonthlyCharges,0.382199
3,TotalCharges,0.34073
1,tenure,0.260052
0,SeniorCitizen,0.017018
