In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, chi2, RFE

%matplotlib inline

In [2]:
iris = sns.load_dataset('iris')

In [3]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
iris['species'].value_counts()

versicolor    50
virginica     50
setosa        50
Name: species, dtype: int64

In [7]:
iris.corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.11757,0.871754,0.817941
sepal_width,-0.11757,1.0,-0.42844,-0.366126
petal_length,0.871754,-0.42844,1.0,0.962865
petal_width,0.817941,-0.366126,0.962865,1.0


In [8]:
X = iris.drop('species', axis=1)
y = iris['species']

#### USING K_BEST

In [9]:
k_selector = SelectKBest(chi2, k=4)
k_selector.fit(X, y)

SelectKBest(k=4, score_func=<function chi2 at 0x000001B5A1D8C790>)

In [10]:
k_selector.get_support(indices=True)

array([0, 1, 2, 3], dtype=int64)

In [11]:
X1 = iris[X.columns[k_selector.get_support()]]
X1

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


#### Using RFE

In [12]:
r_selector = RFE(RandomForestClassifier())
r_selector.fit(X, y)

RFE(estimator=RandomForestClassifier())

In [13]:
r_selector.support_

array([False, False,  True,  True])

In [14]:
r_selector.ranking_

array([2, 3, 1, 1])

In [15]:
# X.columns[r_selector.support_]

In [16]:
X2 = iris[X.columns[r_selector.support_]]

#### Using train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, random_state=3)
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size=0.3, random_state=3)

In [18]:
def Model(X_train, y_train, X_test):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred

In [19]:
a_predictions = Model(X_train, y_train, X_test)
print(classification_report(a_predictions, y_test))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        17
  versicolor       0.86      1.00      0.92        12
   virginica       1.00      0.88      0.93        16

    accuracy                           0.96        45
   macro avg       0.95      0.96      0.95        45
weighted avg       0.96      0.96      0.96        45



In [20]:
b_predictions = Model(X1_train, y_train, X1_test)
print(classification_report(b_predictions, y_test))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        17
  versicolor       0.93      0.93      0.93        14
   virginica       0.93      0.93      0.93        14

    accuracy                           0.96        45
   macro avg       0.95      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45



In [21]:
c_predictions = Model(X2_train, y_train, X2_test)
print(classification_report(c_predictions, y_test))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        17
  versicolor       0.93      1.00      0.96        13
   virginica       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



### USING GRIDSEARCHCV

In [22]:
np.random.randint(2, 10, 5)

array([4, 4, 5, 2, 6])

In [23]:
param_grid = {'n_estimators':np.random.randint(10, 100, 5),
    'criterion':['gini', 'entropy'],
    'min_samples_split':[2, 3, 4],
    'min_samples_leaf':[1, 2, 3, 4],
    'max_features':[2, 3] }

In [24]:
grid = GridSearchCV(RandomForestClassifier(), param_grid, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 1/5] END criterion=gini, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=36;, score=0.905 total time=   0.0s
[CV 2/5] END criterion=gini, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=36;, score=0.952 total time=   0.0s
[CV 3/5] END criterion=gini, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=36;, score=0.952 total time=   0.0s
[CV 4/5] END criterion=gini, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=36;, score=1.000 total time=   0.0s
[CV 5/5] END criterion=gini, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=36;, score=0.857 total time=   0.0s
[CV 1/5] END criterion=gini, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=40;, score=0.905 total time=   0.0s
[CV 2/5] END criterion=gini, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=40;, score=0.952 total time

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [2, 3],
                         'min_samples_leaf': [1, 2, 3, 4],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': array([36, 40, 60, 14, 65])},
             verbose=3)

In [25]:
grid.best_params_

{'criterion': 'gini',
 'max_features': 2,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 40}

In [26]:
grid.best_estimator_

RandomForestClassifier(max_features=2, min_samples_leaf=2, n_estimators=40)

In [27]:
newpred = grid.predict(X_test)

In [28]:
print(classification_report(newpred, y_test))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        17
  versicolor       0.93      1.00      0.96        13
   virginica       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

