**Import required libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import data**

In [3]:
df = pd.read_csv("/content/drive/MyDrive/AIR/Study material/Lab/7. Random Forest/RF_dataset.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


**Drop the columns or rows containing null values**

In [4]:
df = df.dropna()
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

**Select input  and output features (Outcome)**

In [6]:
X = df.drop('Outcome',axis=1)
y = df['Outcome']

**Train|test split**

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

##**Random Forest Classification**

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
model = RandomForestClassifier(n_estimators=2)

In [11]:
model.fit(X_train,y_train)

In [12]:
preds = model.predict(X_test)

##**Evaluation**

In [13]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [14]:
confusion_matrix(y_test,preds)

array([[93, 10],
       [40, 11]])

In [15]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.70      0.90      0.79       103
           1       0.52      0.22      0.31        51

    accuracy                           0.68       154
   macro avg       0.61      0.56      0.55       154
weighted avg       0.64      0.68      0.63       154



##**Use GridsearchCV for hyperparameter tuning**

In [16]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
params = {'max_depth': range(1,10),
    'min_samples_leaf': range(1,10),
    'n_estimators': range(1,50)}

In [17]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4, verbose=3)

In [18]:
grid_search.fit(X_train, y_train)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 1/4] END max_depth=7, min_samples_leaf=2, n_estimators=25;, score=0.734 total time=   0.1s
[CV 2/4] END max_depth=7, min_samples_leaf=2, n_estimators=25;, score=0.714 total time=   0.1s
[CV 3/4] END max_depth=7, min_samples_leaf=2, n_estimators=25;, score=0.797 total time=   0.1s
[CV 4/4] END max_depth=7, min_samples_leaf=2, n_estimators=25;, score=0.719 total time=   0.1s
[CV 1/4] END max_depth=7, min_samples_leaf=2, n_estimators=26;, score=0.727 total time=   0.1s
[CV 2/4] END max_depth=7, min_samples_leaf=2, n_estimators=26;, score=0.714 total time=   0.1s
[CV 3/4] END max_depth=7, min_samples_leaf=2, n_estimators=26;, score=0.797 total time=   0.1s
[CV 4/4] END max_depth=7, min_samples_leaf=2, n_estimators=26;, score=0.725 total time=   0.1s
[CV 1/4] END max_depth=7, min_samples_leaf=2, n_estimators=27;, score=0.721 total time=   0.1s
[CV 2/4] END max_depth=7, min_samples_leaf=2, n_estimators=27;, score=0.747 tota

In [19]:
rf_best = grid_search.best_estimator_
rf_best

In [20]:
rf_best_1 = grid_search.best_params_
rf_best_1

{'max_depth': 7, 'min_samples_leaf': 9, 'n_estimators': 16}

In [21]:
grid_search.best_score_

0.7753480179950768

In [22]:
Grid_model = RandomForestClassifier(max_depth=7, min_samples_leaf=9, n_estimators=16,n_jobs=-1, random_state=42)

In [23]:
Grid_model=Grid_model.fit(X_train,y_train)
Grid_model_preds = Grid_model.predict(X_test)

In [24]:
from sklearn.metrics import classification_report,confusion_matrix
confusion_matrix(y_test,Grid_model_preds)

array([[90, 13],
       [21, 30]])

In [25]:
print(classification_report(y_test,Grid_model_preds))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       103
           1       0.70      0.59      0.64        51

    accuracy                           0.78       154
   macro avg       0.75      0.73      0.74       154
weighted avg       0.77      0.78      0.77       154



##**Feature Importance**

Very useful attribute of the trained model!

In [26]:
rf_best.feature_importances_

array([0.09570358, 0.36471308, 0.03623413, 0.04180732, 0.0664312 ,
       0.18437267, 0.08325949, 0.12747853])

In [27]:
imp_df = pd.DataFrame({"Varname": X_train.columns,"Imp": rf_best.feature_importances_})

In [28]:
imp_df.sort_values(by="Imp", ascending=False)

Unnamed: 0,Varname,Imp
1,Glucose,0.364713
5,BMI,0.184373
7,Age,0.127479
0,Pregnancies,0.095704
6,DiabetesPedigreeFunction,0.083259
4,Insulin,0.066431
3,SkinThickness,0.041807
2,BloodPressure,0.036234


**TASK**

1. Use bagging Classifier with Support Vector Classifier as base estimator (weak Learner)

**Bagging Classifiers**

In [29]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

clf = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=0)
Bagg_class=clf.fit(X_train,y_train)

In [30]:
Bagg_preds = Bagg_class.predict(X_test)

In [31]:
from sklearn.metrics import classification_report,confusion_matrix
confusion_matrix(y_test,Bagg_preds)

array([[96,  7],
       [26, 25]])

In [32]:
print(classification_report(y_test,Bagg_preds))

              precision    recall  f1-score   support

           0       0.79      0.93      0.85       103
           1       0.78      0.49      0.60        51

    accuracy                           0.79       154
   macro avg       0.78      0.71      0.73       154
weighted avg       0.79      0.79      0.77       154

