# Importing required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.base import clone
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


# Feature selection and Modelling

In [2]:
#reading the data 
iris = pd.read_csv("/kaggle/input/iris/Iris.csv")
iris.shape

(150, 6)

In [3]:
X = iris.drop("Species",axis=1)

# encoding 'Species' columns

oe = OrdinalEncoder()

iris['Species'] = oe.fit_transform(iris['Species'].values.reshape(-1,1))
y = iris['Species']


In [4]:
iris.sample(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
100,101,6.3,3.3,6.0,2.5,2.0
136,137,6.3,3.4,5.6,2.4,2.0
43,44,5.0,3.5,1.6,0.6,0.0
92,93,5.8,2.6,4.0,1.2,1.0
9,10,4.9,3.1,1.5,0.1,0.0


# Applying Cross Validation

In [5]:
#making stratisfied kfold object
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

for train_idx, test_idx in kf.split(X,y):
    X_train, X_test, y_train, y_test = X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]

In [6]:
# making a functions for various scores

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    
    y_test_pred = model.predict(X_test)
   
    test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    
    test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
    
    test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    
    return test_accuracy, test_mcc, test_f1

In [7]:
#intial code for giving us result of kfold cv

test_ac_sc = []
test_mcc_sc = []
test_f1_sc = []

scores = []

for train_idx, test_idx in kf.split(X,y):
    X_train, X_test, y_train, y_test = X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]
    scores.append(get_score(SVC(), X_train, X_test, y_train, y_test))
    test_ac_sc.append(get_score(SVC(), X_train, X_test, y_train, y_test)[0])
    test_mcc_sc.append(get_score(SVC(), X_train, X_test, y_train, y_test)[1])
    test_f1_sc.append(get_score(SVC(), X_train, X_test, y_train, y_test)[2])
    

In [8]:
#scores
#test_ac_sc
#test_mcc_sc
#test_f1_sc

In [9]:
#making a list of models so as to loop the get_score functions
models = [SVC(), KNeighborsClassifier(), DecisionTreeClassifier(), MLPClassifier(),
          RandomForestClassifier() , LogisticRegression(), GradientBoostingClassifier()]

# Scores of all models present in the models list

In [10]:

for model in models:
    test_ac_sc = []
    test_mcc_sc = []
    test_f1_sc = []

    #scores = []

    for train_idx, test_idx in kf.split(X,y):
        X_train, X_test, y_train, y_test = X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]
        #scores.append(get_score(model, X_train, X_test, y_train, y_test))
        test_ac_sc.append(get_score(model, X_train, X_test, y_train, y_test)[0])
        test_mcc_sc.append(get_score(model, X_train, X_test, y_train, y_test)[1])
        test_f1_sc.append(get_score(model, X_train, X_test, y_train, y_test)[2])


    print(f'for model: {model}')
    print('accuracy score :',np.mean(test_ac_sc))
    print('mcc score :',np.mean(test_mcc_sc))
    print('f1 score :',np.mean(test_f1_sc))
    print('\n')

for model: SVC()
accuracy score : 0.9933333333333334
mcc score : 0.9903174605388407
f1 score : 0.9933166248955722


for model: KNeighborsClassifier()
accuracy score : 1.0
mcc score : 1.0
f1 score : 1.0


for model: DecisionTreeClassifier()
accuracy score : 0.9933333333333334
mcc score : 0.9903174605388407
f1 score : 0.9933166248955722


for model: MLPClassifier()
accuracy score : 0.5666666666666667
mcc score : 0.5245493734839588
f1 score : 0.6990296739549668


for model: RandomForestClassifier()
accuracy score : 1.0
mcc score : 1.0
f1 score : 0.9933166248955722


for model: LogisticRegression()
accuracy score : 0.9866666666666667
mcc score : 0.9812121349077867
f1 score : 0.9865319865319865


for model: GradientBoostingClassifier()
accuracy score : 0.9933333333333334
mcc score : 0.9903174605388407
f1 score : 0.9933166248955722




In [11]:
estimators = [
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('gbdt',GradientBoostingClassifier()),
    ('MLP',MLPClassifier()),
    ('DTC', DecisionTreeClassifier())
    
]

In [12]:
clf = StackingClassifier(
    estimators=estimators, 
    final_estimator=RandomForestClassifier(),
    cv=10
)

# Making simple train test split

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=8,stratify=y)

In [14]:
param_grid = {
    'n_estimators': [25,50, 100, 150],
    'max_depth': [2, 4, 6],
    'min_samples_split': [2, 4, 6]
}

In [15]:
grid_search = GridSearchCV(estimator=clf.final_estimator, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [2, 4, 6], 'min_samples_split': [2, 4, 6],
                         'n_estimators': [25, 50, 100, 150]})

In [16]:
best_params = grid_search.best_params_
best_params


{'max_depth': 2, 'min_samples_split': 2, 'n_estimators': 25}

In [17]:
clf.final_estimator.set_params(**best_params)
clf.fit(X_train, y_train)

StackingClassifier(cv=10,
                   estimators=[('lr', LogisticRegression()),
                               ('knn', KNeighborsClassifier()),
                               ('gbdt', GradientBoostingClassifier()),
                               ('MLP', MLPClassifier()),
                               ('DTC', DecisionTreeClassifier())],
                   final_estimator=RandomForestClassifier(max_depth=2,
                                                          n_estimators=25))

In [18]:
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

1.0

In [19]:
confusion_matrix(y_test,y_pred)

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 10]])


I made this notebook to practise my skills making a stacking classifier on a small dataset before I use it on my 'titanic competition'
notebook. I'm still a begineer in coding this was the best method I could come up with at the moment. A upvote/ like is highly appreciated if you like what I did here ...regards TM Kartikey