In [1]:
# importing mandatory libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Reading the data set
df = pd.read_csv('diabetes.csv')

In [3]:
# View the dataset
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Cheking the no of columns and the rows
df.shape

(768, 9)

In [5]:
# Viewing the data types of all the features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# Checking for null values
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# I have abserved zero values in 'Glucose', 'Insulin' ,'SkinThickness'. There no meaning with zero values.
# hence replacing with the median values
df['Glucose'] = np.where(df['Glucose']==0,df['Glucose'].median(), df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0,df['Insulin'].median(), df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0,df['SkinThickness'].median(), df['SkinThickness'])

In [8]:
# Rechechecking the data set after replacing zero's with median values
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [9]:
# selecting dependendent and independent features
X = df.drop('Outcome',axis = 1)
y = df['Outcome'] 

In [10]:
# Viewing dependent and independent features
print(X.head())
print(y.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0             72           35.0     30.5  33.6   
1            1     85.0             66           29.0     30.5  26.6   
2            8    183.0             64           23.0     30.5  23.3   
3            1     89.0             66           23.0     94.0  28.1   
4            0    137.0             40           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [11]:
# spliting the data for training the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=33)

In [12]:
# Training the data with randomforest classifier 
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)

In [13]:
# Evaluating the model
predictions = rf_classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

0.7337662337662337
[[85 14]
 [27 28]]
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        99
           1       0.67      0.51      0.58        55

    accuracy                           0.73       154
   macro avg       0.71      0.68      0.69       154
weighted avg       0.73      0.73      0.72       154



### training the model with randomized search cross validation

In [14]:
# Specifying the parameter distributions for randomized cross validation
from sklearn.model_selection import RandomizedSearchCV
# No. of trees in a random forest
n_estimators=[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# no of features to consider at every split
max_features=['auto','sqrt', 'log2']
#maximum numer of levels in a tree
max_depth= [int(x) for x in np.linspace(10,1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [3,4,5,7,9]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [15]:
# training the model with randomized search
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.1min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [16]:
# Viewing the best parameters with the randomized cross validation
rf_randomcv.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 230,
 'criterion': 'gini'}

In [17]:
# Viewing the best estimators with the randomized cross validation
rf_randomcv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=230, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
# assigning the best estimators of randomized cross validation values with to a variable
best_random_grid=rf_randomcv.best_estimator_

In [19]:
# evaluating the model with the best estimators of randomized cross validation
predictions = best_random_grid.predict(X_test)
print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

0.7402597402597403
[[86 13]
 [27 28]]
              precision    recall  f1-score   support

           0       0.76      0.87      0.81        99
           1       0.68      0.51      0.58        55

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154



### grid search cross validation 


here I am taking the best values from the randomizes cross validation

In [20]:
# viewing the best parameters of the randomized cross validation
rf_randomcv.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 8,
 'max_features': 'log2',
 'max_depth': 230,
 'criterion': 'gini'}

In [21]:
# Specifying the parameter distributions for grid-search cross validation
# values taking from the randomized search cross validation bese parameters.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+2, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
                          rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +1,
                          rf_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [230], 'max_features': ['log2'], 'min_samples_leaf': [8, 10, 12], 'min_samples_split': [3, 4, 5, 6, 7], 'n_estimators': [600, 700, 800, 900, 1000]}


In [22]:
# Checking how many no of itterationes done in the grid-search cross validation
1*1*1*3*5*5

75

In [23]:
#### Fit the grid_search to the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:  7.5min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [24]:
# Viewing the best parameters with the grid-search cross validation
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=230, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
# assigning the best estimators of grid-search cross validation values with to a variable
best_grid=grid_search.best_estimator_

In [26]:
# model performance evaluation with best estimators of grid-search cross validation values
y_pred=best_grid.predict(X_test)
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print(confusion_matrix(y_test,y_pred))
print(format(classification_report(y_test,y_pred)))

Accuracy Score 0.7532467532467533
[[86 13]
 [25 30]]
              precision    recall  f1-score   support

           0       0.77      0.87      0.82        99
           1       0.70      0.55      0.61        55

    accuracy                           0.75       154
   macro avg       0.74      0.71      0.72       154
weighted avg       0.75      0.75      0.75       154



### Automated hyperparameter tuning techinique called "bayesian optimization"

In [27]:
# hyperparameter tuning with bayesian optimization will be done with 'fmin' function
# we need to define space, 'objective', 'trails' for fmin function
# 1. defining the space
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [29]:
# 2. Defining the objective
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    from sklearn.model_selection import cross_val_score
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [30]:
# defining the trail and finding the best parameteres using the 'fmin()-->bayesian optimization' 
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [05:10<00:00,  3.88s/trial, best loss: -0.7769898492216016]


{'criterion': 1,
 'max_depth': 450.0,
 'max_features': 0,
 'min_samples_leaf': 0.0024165743352262733,
 'min_samples_split': 0.06845980032544075,
 'n_estimators': 2}

In [31]:
# un-packing the best parametres using the mapping
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200, 5:1300, 6:1500}

print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
auto
300


In [32]:
# training the model with hyperparameter bayesian optimization--randomforest classification 
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)

In [33]:
# evaluating the model
predictionforest = trainedforest.predict(X_test)
print(accuracy_score(y_test,predictionforest))
print(confusion_matrix(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

0.7272727272727273
[[85 14]
 [28 27]]
              precision    recall  f1-score   support

           0       0.75      0.86      0.80        99
           1       0.66      0.49      0.56        55

    accuracy                           0.73       154
   macro avg       0.71      0.67      0.68       154
weighted avg       0.72      0.73      0.72       154



### Genetic algorithm

In [35]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [37]:
from tpot import TPOTClassifier
tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=84, style=ProgressStyle(descripti…

Generation 1 - Current best internal CV score: 0.7833333333333332
Generation 2 - Current best internal CV score: 0.7833333333333332
Generation 3 - Current best internal CV score: 0.7833333333333332
Generation 4 - Current best internal CV score: 0.7833333333333332
Generation 5 - Current best internal CV score: 0.7833333333333332

Best pipeline: RandomForestClassifier(input_matrix, criterion=gini, max_depth=120, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=2000)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [38]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.7532467532467533


### Optimize hyperparameters of the model using Optuna

In [40]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [41]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2020-08-04 11:29:29,887] Trial 0 finished with value: 0.7639008448908019 and parameters: {'classifier': 'RandomForest', 'n_estimators': 730, 'max_depth': 96.8828954315088}. Best is trial 0 with value: 0.7639008448908019.
[I 2020-08-04 11:29:33,674] Trial 1 finished with value: 0.6530926191614858 and parameters: {'classifier': 'SVC', 'svc_c': 41871946.458197966}. Best is trial 0 with value: 0.7639008448908019.
[I 2020-08-04 11:29:33,750] Trial 2 finished with value: 0.6530926191614858 and parameters: {'classifier': 'SVC', 'svc_c': 1.3859485263431508e-06}. Best is trial 0 with value: 0.7639008448908019.
[I 2020-08-04 11:29:35,227] Trial 3 finished with value: 0.7606249003666506 and parameters: {'classifier': 'RandomForest', 'n_estimators': 600, 'max_depth': 30.8893139669508}. Best is trial 0 with value: 0.7639008448908019.
[I 2020-08-04 11:29:38,949] Trial 4 finished with value: 0.7671608480790691 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1630, 'max_depth': 64.227

[I 2020-08-04 11:31:03,201] Trial 37 finished with value: 0.772030926191615 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1080, 'max_depth': 10.138264173148395}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:31:07,006] Trial 38 finished with value: 0.772038896859557 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1460, 'max_depth': 16.057816738521435}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:31:10,929] Trial 39 finished with value: 0.7704208512673363 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1710, 'max_depth': 16.563250393694773}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:31:10,986] Trial 40 finished with value: 0.6530926191614858 and parameters: {'classifier': 'SVC', 'svc_c': 2.601783461087175e-05}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:31:14,754] Trial 41 finished with value: 0.7736728837876615 and parameters: {'classifier': 'RandomForest'

[I 2020-08-04 11:33:03,197] Trial 73 finished with value: 0.7736649131197195 and parameters: {'classifier': 'RandomForest', 'n_estimators': 900, 'max_depth': 12.249112467424556}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:33:05,308] Trial 74 finished with value: 0.767144906743185 and parameters: {'classifier': 'RandomForest', 'n_estimators': 960, 'max_depth': 30.45796450056927}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:33:07,192] Trial 75 finished with value: 0.7687709230033476 and parameters: {'classifier': 'RandomForest', 'n_estimators': 860, 'max_depth': 47.078868789325746}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:33:08,968] Trial 76 finished with value: 0.7638928742228598 and parameters: {'classifier': 'RandomForest', 'n_estimators': 790, 'max_depth': 12.469467625860107}. Best is trial 8 with value: 0.7785509325681493.
[I 2020-08-04 11:33:09,039] Trial 77 finished with value: 0.6530926191614858 and parameters: {

Accuracy: 0.7785509325681493
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 1450, 'max_depth': 14.71493827877937}


In [42]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 1450,
 'max_depth': 14.71493827877937}

In [43]:
rf=RandomForestClassifier(n_estimators=330,max_depth=30)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=330,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
y_pred=rf.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.7662337662337663
[[87 12]
 [24 31]]
              precision    recall  f1-score   support

           0       0.78      0.88      0.83        99
           1       0.72      0.56      0.63        55

    accuracy                           0.77       154
   macro avg       0.75      0.72      0.73       154
weighted avg       0.76      0.77      0.76       154

