In [74]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.ensemble import RandomForestClassifier
import pickle

In [75]:
df_original  = pd.read_csv('Churn_Modelling.csv')
df_original.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [76]:
churn_df = df_original.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Gender'])
churn_df.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.8,3,1,0,113931.57,1
3,699,France,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.1,0


In [77]:
churn_df2 = pd.get_dummies(churn_df, columns=['Geography'], drop_first=True, dtype=int)
churn_df2.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1


In [78]:
X = churn_df2.drop('Exited', axis=1)
y= churn_df2['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25 , stratify=y, random_state=42)

In [79]:
%%time

rf = RandomForestClassifier(random_state=0)
cv_params = {'max_depth':[2,3,4,5, None],
               'min_samples_leaf':[1,2,3],
               'min_samples_split':[1,2,3],
               'max_features': [2,3,4],
               'n_estimators': [75, 100, 125, 150]} 

scoring = {'accuracy', 'precision', 'recall', 'f1'}

rf_cv = GridSearchCV(rf ,param_grid=cv_params, scoring=scoring ,cv=5, refit='f1')

rf_cv.fit(X_train, y_train)


900 fits failed out of a total of 2700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
900 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise Invali

CPU times: total: 1min 57s
Wall time: 13min 23s


In [80]:
path = r'C:\To lenovo\Baramej_Courses\coursera\02 Google Advanced Data Analytics\Activities'

In [81]:
# Pickle the model
with open(path + 'rf_cv_model_p.pickle', 'wb') as to_write:
    pickle.dump(rf_cv, to_write)

In [82]:
# Open pickled model
with open(path + 'rf_cv_model_p.pickle', 'rb') as to_read:
    rf_cv = pickle.load(to_read)

In [83]:
rf_cv.fit(X_train, y_train)
rf_cv.best_params_

900 fits failed out of a total of 2700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
900 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\anaconda3.1\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise Invali

{'max_depth': None,
 'max_features': 4,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 125}

In [84]:
rf_cv.best_score_

0.580528563620339

In [85]:
def make_results(model_name, model_object):

    cv_results = pd.DataFrame(model_object.cv_results_)

    best_estimator_results = cv_results.iloc[cv_results['mean_test_f1'].idxmax(), :]

    # Extract accuracy, precision, recall, and f1 score from that row.
    f1 = best_estimator_results.mean_test_f1
    recall = best_estimator_results.mean_test_recall
    precision = best_estimator_results.mean_test_precision
    accuracy = best_estimator_results.mean_test_accuracy

 
    # Create table of results
    table = pd.DataFrame({'Model': [model_name],
                          'F1': [f1],
                          'Recall': [recall],
                          'Precision': [precision],
                          'Accuracy': [accuracy]
                         }
                        )

    return table


In [86]:
rf_cv_results = make_results('Random Forest CV', rf_cv)
rf_cv_results

Unnamed: 0,Model,F1,Recall,Precision,Accuracy
0,Random Forest CV,0.580529,0.472517,0.756289,0.861333


In [None]:
rf_cv_results.to_csv(path+'results1.csv', index=False)

In [87]:
results = pd.read_csv('results1.csv', index_col=0)
results

Unnamed: 0,Model,F1,Recall,Precision,Accuracy
0,Tuned Decision Tree,0.560655,0.469255,0.701608,0.8504


In [88]:
results = pd.concat([rf_cv_results, results], axis=0)

In [89]:
results

Unnamed: 0,Model,F1,Recall,Precision,Accuracy
0,Random Forest CV,0.580529,0.472517,0.756289,0.861333
0,Tuned Decision Tree,0.560655,0.469255,0.701608,0.8504


In [90]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train, random_state=10)

In [91]:
split_index = [0 if x in X_val.index else -1 for x in X_train.index]

In [92]:
from sklearn.model_selection import PredefinedSplit

In [93]:
rf = RandomForestClassifier(random_state=0)
cv_params = {'max_depth':[2,3,4,5, None],
             'min_samples_leaf':[1,2,3],
             'min_samples_split':[2,3,4],
             'max_features':[2,3,4],
             'n_estimators':[75, 100, 125, 150]}
scoring = {'accuracy', 'precision', 'recall', 'f1'}
custom_split = PredefinedSplit(split_index)
rf_val = GridSearchCV(rf, param_grid=cv_params, scoring=scoring, cv = custom_split,refit='f1')

In [94]:
rf_val.fit(X_train, y_train)

In [95]:
with open (path+'rf_val_model.pickle', 'wb') as to_write:
    pickle.dump(rf_val, to_write)

In [96]:
with open(path+'rf_val_model.pickle', 'rb') as to_read:
    rf_val = pickle.load(to_read)

In [97]:
rf_val.best_params_

{'max_depth': None,
 'max_features': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 150}

In [98]:
rf_val_results = make_results('Random Forest Validated', rf_val)
results = pd.concat([rf_cv_results, results], axis=0)

results.sort_values(by=['F1'], ascending=False)

Unnamed: 0,Model,F1,Recall,Precision,Accuracy
0,Random Forest CV,0.580529,0.472517,0.756289,0.861333
0,Random Forest CV,0.580529,0.472517,0.756289,0.861333
0,Tuned Decision Tree,0.560655,0.469255,0.701608,0.8504


In [99]:
results.to_csv(path+'results2.csv', index=False)