## Model Creation and Validation

In [None]:
# Scaler instance for the modelling section; a later cell fits it on the
# training split and applies it to both the training and the test split.
scaler = RobustScaler() # scaler object used to scale all the numerical features

In [None]:
# Target vector is the 'Potability' column; the feature matrix is every
# remaining column of df.
y = df['Potability']
X = df.drop(columns=['Potability'])

In [None]:
# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Report the shape of every piece of the split, one per line.
print(
    f'shape of the train data : {X_train.shape}',
    f'shape of the train target : {y_train.shape}',
    f'shape of the test data : {X_test.shape}',
    f'shape of the test target : {y_test.shape}',
    sep='\n',
)

shape of the train data : (2620, 9)
shape of the train target : (2620,)
shape of the test data : (656, 9)
shape of the test target : (656,)


In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics are used for scaling.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# on a live kernel would re-fit the scaler on already-scaled data.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Random Forest

#### Baseline Model

In [None]:
# Single decision tree, used as a reference point for the Random Forest models.
# Its predictions and accuracy are stored under dt-specific names: the original
# rf_-prefixed names were overwritten by the Random Forest cell that follows,
# which discarded the decision-tree baseline.
dt = DecisionTreeClassifier(random_state = 42)
dt.fit(X_train,y_train)
y_pred_dt = dt.predict(X_test)
dt_acc = dt.score(X_test,y_test)  # accuracy on the held-out test split
print(f'Accuracy from Decision Tree : {dt_acc}')
print('Confustion Metrix : ')
print(confusion_matrix(y_test,y_pred_dt))
print('Classification Report')
print(classification_report(y_test,y_pred_dt))

Accuracy from Decision Tree : 0.6036585365853658
Confustion Metrix : 
[[282 118]
 [142 114]]
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.70      0.68       400
           1       0.49      0.45      0.47       256

    accuracy                           0.60       656
   macro avg       0.58      0.58      0.58       656
weighted avg       0.60      0.60      0.60       656



In [None]:
# Random Forest baseline: only the seed is set, every other hyper-parameter is
# left at the library default. fit() returns the estimator itself, so the
# construction and the training can be chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_rf = rf.predict(X_test)
rf_acc = rf.score(X_test, y_test)

print(f'Accuracy from Random Forest Classifier : {rf_acc}')
print('Confustion Metrix : ', confusion_matrix(y_test, y_pred_rf), sep='\n')
print('Classification Report', classification_report(y_test, y_pred_rf), sep='\n')

Accuracy from Random Forest Classifier : 0.6554878048780488
Confustion Metrix : 
[[350  50]
 [176  80]]
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.88      0.76       400
           1       0.62      0.31      0.41       256

    accuracy                           0.66       656
   macro avg       0.64      0.59      0.59       656
weighted avg       0.65      0.66      0.62       656



#### Hyper-parameter tuning

In [None]:
# Search space sampled by the randomized hyper-parameter search below.

# Number of trees in random forest: 50, 100, ..., 500
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)]
# Number of features to consider at every split.
# 'auto' is deliberately not listed: for classifiers it was only an alias of
# 'sqrt' (a duplicate candidate), it was deprecated in scikit-learn 1.1 and
# removed in 1.3, where every fit that samples it fails.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 2, 4, ..., 10, plus None (no depth limit)
max_depth = [int(x) for x in np.linspace(2, 10, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node.
# int() truncates the evenly spaced floats, giving 2, 6, 11, 15, 20.
min_samples_split = [int(x) for x in np.linspace(2,20,5)]
# Minimum number of samples required at each leaf node: 2, 4, ..., 10
min_samples_leaf = [int(x) for x in np.linspace(2,10,5)]
# Method of selecting samples for training each tree
bootstrap = [True]
# Function used to measure the quality of a split
criterion = ['gini','entropy']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}

In [None]:
# Randomized search over random_grid: 200 sampled parameter combinations, each
# scored with 5-fold cross-validation on the training split, using all cores.
rf = RandomForestClassifier(random_state=42)
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=200,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random_search.fit(X_train, y_train)

# Best combination found and its mean cross-validated score.
print(
    f'Best Parameters using Randomized search : {rf_random_search.best_params_}',
    f'Best Score using Randomized search : {rf_random_search.best_score_}',
    sep='\n',
)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best Parameters using Randomized search : {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}
Best Score using Randomized search : 0.6694656488549617


In [None]:
# Using GridSearchCV after RandomizedSearchCV to fine-tune the parameters in a
# small neighbourhood of the best randomized-search result.
best_random_params = rf_random_search.best_params_


def _neighbourhood(values, minimum=None):
    """Return `values` de-duplicated (first occurrence kept, order preserved).

    Numeric entries smaller than `minimum` are dropped so that no invalid
    candidate reaches the grid; `None` entries (e.g. an unlimited max_depth)
    are always kept.
    """
    valid = [v for v in values if v is None or minimum is None or v >= minimum]
    return list(dict.fromkeys(valid))


grid = {
    'n_estimators' : _neighbourhood([
        best_random_params['n_estimators'],
        best_random_params['n_estimators'] + 50,
        best_random_params['n_estimators'] + 100,
    ]),
    # An integer min_samples_split must be at least 2; "best - 1" would give 1
    # when the best value is 2, and every fit using it would fail.
    'min_samples_split' : _neighbourhood([
        best_random_params['min_samples_split'],
        best_random_params['min_samples_split'] - 1,
        best_random_params['min_samples_split'] + 1,
    ], minimum=2),
    # An integer min_samples_leaf must be at least 1.
    'min_samples_leaf' : _neighbourhood([
        best_random_params['min_samples_leaf'],
        best_random_params['min_samples_leaf'] - 1,
        best_random_params['min_samples_leaf'] + 1,
    ], minimum=1),
    # De-duplicated: when the best depth is already None, listing None twice
    # would double the number of fits for no additional information.
    'max_depth' : _neighbourhood([best_random_params['max_depth'], None]),
    'max_features' : [best_random_params['max_features']],
    'criterion' : [best_random_params['criterion']],
    'bootstrap' : [best_random_params['bootstrap']]
}
rf_clf = RandomForestClassifier(random_state = 42)
rf_grid_search = GridSearchCV(rf_clf,grid,n_jobs = -1,cv=5,verbose = 3)
rf_grid_search.fit(X_train,y_train)

print(f'Best Parameters using Grid search : {rf_grid_search.best_params_}')
print(f'Best Score using Grid search : {rf_grid_search.best_score_}')

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters using Grid search : {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Score using Grid search : 0.6744274809160304


In [None]:
# Evaluate the tuned Random Forest on the held-out test split. No refit
# argument was passed to GridSearchCV, so predict()/score() go through the
# search object's refitted best estimator (library default).
y_pred_rf = rf_grid_search.predict(X_test)
rf_acc = rf_grid_search.score(X_test, y_test)

print(f'Accuracy from Random Forest after hyper parameter tuning : {rf_acc}')
print('Confustion Metrix : ', confusion_matrix(y_test, y_pred_rf), sep='\n')
print('Classification Report', classification_report(y_test, y_pred_rf), sep='\n')

Accuracy from Random Forest after hyper parameter tuning : 0.6570121951219512
Confustion Metrix : 
[[351  49]
 [176  80]]
Classification Report
              precision    recall  f1-score   support

           0       0.67      0.88      0.76       400
           1       0.62      0.31      0.42       256

    accuracy                           0.66       656
   macro avg       0.64      0.59      0.59       656
weighted avg       0.65      0.66      0.62       656

