In [1]:
import pandas as pd
import numpy as np
import openml
from sklearn.preprocessing import MinMaxScaler,LabelEncoder 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.metrics import f1_score, make_scorer
import time
from xgboost import XGBClassifier

# Classification for Iris Dataset

In [2]:
iris = openml.datasets.get_dataset("iris")
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    target=iris.default_target_attribute, dataset_format="dataframe"
)
iris_df["class"]=iris_label
iris_x=iris_df.iloc[:,:4]
iris_df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    float64 
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
 4   class        150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [4]:
iris_df["class"].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [5]:
le=LabelEncoder()

iris_y=le.fit_transform(iris_label)

In [6]:
scaler = MinMaxScaler()

iris_x_scaled=scaler.fit_transform(iris_x)

In [7]:
def clasification(model,parameters,x,y):
    
    classification_df=pd.DataFrame({"Model":[],
                                    "Random State":[],
                                    "Best Parameters":[],
                                    "f1_Score":[],
                                    "Execution Time":[]})
    
    for i in range(1,11):
        start_time = time.time()
        cv_inner = KFold(n_splits=5, shuffle=True, random_state=i)
        score = ['accuracy' ,'f1_macro']
        grid_search = GridSearchCV(model, parameters, cv=cv_inner, scoring=score, refit='accuracy', n_jobs = -1)
        grid_result = grid_search.fit(x,y)
        y_predict = grid_result.predict(x)
        f1score = f1_score(y,y_predict, average="weighted")
        execution_time = time.time() - start_time
        classification_df=classification_df.append({"Model":model,
                                                    "Random State":i,
                                                    "Best Parameters":grid_result.best_params_,
                                                    "f1_Score":f1score,
                                                    "Execution Time":execution_time},ignore_index=True)
         
       
    if model == knc:
        classification_df.to_csv("Iris_KNC.csv")
    elif model == lda:
        classification_df.to_csv("Iris_LDA.csv")
    elif model == gnb:
        classification_df.to_csv("Iris_GNB.csv")
    elif model == svc:
        classification_df.to_csv("Iris_SVC.csv")
    elif model == lr:
        classification_df.to_csv("Iris_LR.csv")
    elif model == rfc:
        classification_df.to_csv("Iris_RFC.csv")
    elif model== abc:
        classification_df.to_csv("Iris_ABC.csv")
    elif model== gbc:
        classification_df.to_csv("Iris_GBC.csv")
    elif model == xgbc:
        classification_df.to_csv("Iris_XGBC.csv")


    return classification_df
    

## K-nearest neighbour classification

In [8]:
knc = KNeighborsClassifier()

knc_grid = {"n_neighbors":range(1, 21),
           "weights":['uniform', 'distance'],
           "metric":['manhattan','euclidean', 'minkowski']}

In [9]:
clasification(knc,knc_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,KNeighborsClassifier(),1.0,"{'metric': 'euclidean', 'n_neighbors': 9, 'wei...",0.973323,4.467547
1,KNeighborsClassifier(),2.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,1.158886
2,KNeighborsClassifier(),3.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,1.107149
3,KNeighborsClassifier(),4.0,"{'metric': 'euclidean', 'n_neighbors': 14, 'we...",0.966637,0.954774
4,KNeighborsClassifier(),5.0,"{'metric': 'euclidean', 'n_neighbors': 14, 'we...",0.966637,0.838695
5,KNeighborsClassifier(),6.0,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.959984,0.860337
6,KNeighborsClassifier(),7.0,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",0.973291,0.892085
7,KNeighborsClassifier(),8.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,0.816549
8,KNeighborsClassifier(),9.0,"{'metric': 'manhattan', 'n_neighbors': 18, 'we...",0.953329,0.856979
9,KNeighborsClassifier(),10.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,0.864744


## Linear discriminant analysis 

In [10]:
lda = LinearDiscriminantAnalysis()

lda_grid = {"n_components": list(range(1,3,1)),
           "solver":['lsqr','eigen','svd']}

In [11]:
clasification(lda,lda_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,LinearDiscriminantAnalysis(),1.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.114132
1,LinearDiscriminantAnalysis(),2.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.050405
2,LinearDiscriminantAnalysis(),3.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.058426
3,LinearDiscriminantAnalysis(),4.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.057971
4,LinearDiscriminantAnalysis(),5.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.060445
5,LinearDiscriminantAnalysis(),6.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.060438
6,LinearDiscriminantAnalysis(),7.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.058409
7,LinearDiscriminantAnalysis(),8.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.050335
8,LinearDiscriminantAnalysis(),9.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.052348
9,LinearDiscriminantAnalysis(),10.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.050375


## Naïve Bayes classifier

In [12]:
gnb = GaussianNB()

gnb_grid={"var_smoothing":np.logspace(0,-9, num=100)}

In [13]:
clasification(gnb,gnb_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,GaussianNB(),1.0,{'var_smoothing': 0.04328761281083057},0.953329,0.55901
1,GaussianNB(),2.0,{'var_smoothing': 0.0657933224657568},0.953329,0.515977
2,GaussianNB(),3.0,{'var_smoothing': 0.02848035868435802},0.953329,0.69294
3,GaussianNB(),4.0,{'var_smoothing': 0.2848035868435802},0.953329,0.535378
4,GaussianNB(),5.0,{'var_smoothing': 0.2848035868435802},0.953329,0.620539
5,GaussianNB(),6.0,{'var_smoothing': 0.23101297000831597},0.953329,0.530471
6,GaussianNB(),7.0,{'var_smoothing': 0.15199110829529336},0.953329,0.572026
7,GaussianNB(),8.0,{'var_smoothing': 0.1873817422860384},0.953329,0.545545
8,GaussianNB(),9.0,{'var_smoothing': 0.1873817422860384},0.953329,0.613359
9,GaussianNB(),10.0,{'var_smoothing': 0.1873817422860384},0.953329,0.629319


## Support vector machine

In [14]:
svc=SVC()

svc_grid={"C":[0.01, 0.1, 1, 10, 100, 1000],
          "kernel":["linear","poly","rbf","sigmoid"],
          "gamma":[1, 0.1, 0.01, 0.001, 0.0001]}


In [15]:
clasification(svc,svc_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,SVC(),1.0,"{'C': 10, 'gamma': 1, 'kernel': 'rbf'}",0.966583,0.804848
1,SVC(),2.0,"{'C': 1000, 'gamma': 1, 'kernel': 'linear'}",0.979998,0.829783
2,SVC(),3.0,"{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}",0.966583,0.819808
3,SVC(),4.0,"{'C': 100, 'gamma': 0.1, 'kernel': 'sigmoid'}",0.973323,0.834768
4,SVC(),5.0,"{'C': 100, 'gamma': 1, 'kernel': 'poly'}",0.979998,0.83377
5,SVC(),6.0,"{'C': 10, 'gamma': 1, 'kernel': 'linear'}",0.973323,0.81083
6,SVC(),7.0,"{'C': 10, 'gamma': 1, 'kernel': 'linear'}",0.973323,0.747003
7,SVC(),8.0,"{'C': 100, 'gamma': 1, 'kernel': 'linear'}",0.973323,0.74401
8,SVC(),9.0,"{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}",0.973323,0.769943
9,SVC(),10.0,"{'C': 1000, 'gamma': 1, 'kernel': 'linear'}",0.979998,0.768939


## Logistic regression

In [16]:
lr=LogisticRegression()

lr_grid = {"solver": ['newton-cg', 'lbfgs', 'liblinear'],
           "penalty": ['l2'],
           "C":[100, 10, 1.0, 0.1, 0.01]}

In [17]:
clasification(lr,lr_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,LogisticRegression(),1.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.406911
1,LogisticRegression(),2.0,"{'C': 100, 'penalty': 'l2', 'solver': 'libline...",0.96,0.304187
2,LogisticRegression(),3.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.314159
3,LogisticRegression(),4.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.387958
4,LogisticRegression(),5.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.312166
5,LogisticRegression(),6.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.284241
6,LogisticRegression(),7.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.318152
7,LogisticRegression(),8.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.296205
8,LogisticRegression(),9.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.297204
9,LogisticRegression(),10.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.381984


## Random forests

In [18]:
rfc=RandomForestClassifier()

rfc_grid={'criterion' :['gini', 'entropy'],
         'max_depth' : [4,6,8],
         'n_estimators': [100,200,300,400,500],
         "n_jobs":[-1]}

In [19]:
clasification(rfc,rfc_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,RandomForestClassifier(),1.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.986661,44.521001
1,RandomForestClassifier(),2.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.986661,52.65372
2,RandomForestClassifier(),3.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.986661,49.560306
3,RandomForestClassifier(),4.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.979982,48.040996
4,RandomForestClassifier(),5.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.986661,47.842802
5,RandomForestClassifier(),6.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.993333,56.782403
6,RandomForestClassifier(),7.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.993333,46.28999
7,RandomForestClassifier(),8.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.993333,47.121484
8,RandomForestClassifier(),9.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.993333,49.033904
9,RandomForestClassifier(),10.0,"{'criterion': 'gini', 'max_depth': 4, 'n_estim...",0.986661,45.011483


## Ada boost

In [20]:
abc=AdaBoostClassifier()

abc_grid = {"n_estimators":[10, 50, 100, 500, 1000],
           "learning_rate":np.arange(0.1, 2.1, 0.4)}

In [21]:
clasification(abc,abc_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,AdaBoostClassifier(),1.0,"{'learning_rate': 1.7000000000000002, 'n_estim...",0.986667,38.721708
1,AdaBoostClassifier(),2.0,"{'learning_rate': 1.3000000000000003, 'n_estim...",0.959984,37.353718
2,AdaBoostClassifier(),3.0,"{'learning_rate': 1.7000000000000002, 'n_estim...",0.973323,30.615239
3,AdaBoostClassifier(),4.0,"{'learning_rate': 1.3000000000000003, 'n_estim...",0.959984,32.097293
4,AdaBoostClassifier(),5.0,"{'learning_rate': 1.3000000000000003, 'n_estim...",0.959984,35.910897
5,AdaBoostClassifier(),6.0,"{'learning_rate': 1.3000000000000003, 'n_estim...",0.973323,38.495155
6,AdaBoostClassifier(),7.0,"{'learning_rate': 1.3000000000000003, 'n_estim...",0.973323,37.813745
7,AdaBoostClassifier(),8.0,"{'learning_rate': 1.7000000000000002, 'n_estim...",0.959936,38.291101
8,AdaBoostClassifier(),9.0,"{'learning_rate': 1.3000000000000003, 'n_estim...",0.959984,31.299011
9,AdaBoostClassifier(),10.0,"{'learning_rate': 1.7000000000000002, 'n_estim...",0.979998,31.306025


## Gradiant boost

In [22]:
gbc = GradientBoostingClassifier()

gbc_grid = {"n_estimators":[10, 50, 100, 500, 1000],
            "max_depth":[1,3,5,7,9],
            "learning_rate":np.arange(0.1, 2.1, 0.4)}

In [23]:
clasification(gbc,gbc_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,GradientBoostingClassifier(),1.0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.993333,199.191411
1,GradientBoostingClassifier(),2.0,"{'learning_rate': 0.5, 'max_depth': 1, 'n_esti...",0.973323,196.28404
2,GradientBoostingClassifier(),3.0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.993333,196.783363
3,GradientBoostingClassifier(),4.0,"{'learning_rate': 0.9, 'max_depth': 3, 'n_esti...",1.0,199.283029
4,GradientBoostingClassifier(),5.0,"{'learning_rate': 0.5, 'max_depth': 5, 'n_esti...",1.0,196.754784
5,GradientBoostingClassifier(),6.0,"{'learning_rate': 0.5, 'max_depth': 5, 'n_esti...",1.0,197.754108
6,GradientBoostingClassifier(),7.0,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",1.0,199.004768
7,GradientBoostingClassifier(),8.0,"{'learning_rate': 0.5, 'max_depth': 5, 'n_esti...",1.0,192.188587
8,GradientBoostingClassifier(),9.0,"{'learning_rate': 0.5, 'max_depth': 5, 'n_esti...",1.0,193.078651
9,GradientBoostingClassifier(),10.0,"{'learning_rate': 0.1, 'max_depth': 1, 'n_esti...",0.966663,189.245499


## XGBoost

In [24]:
xgbc = XGBClassifier()

xgbc_grid = {"max_depth":range(1,10,2),
            "min_child_weight":range(1,6,2),
            "learning_rate":np.arange(0.1, 2.1, 0.4),
            "n_estimators":[10, 50, 100, 500, 1000]}

In [25]:
clasification(xgbc,xgbc_grid,iris_x_scaled,iris_y)







































Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,"XGBClassifier(base_score=None, booster=None, c...",1.0,"{'learning_rate': 0.1, 'max_depth': 3, 'min_ch...",0.973323,351.31259
1,"XGBClassifier(base_score=None, booster=None, c...",2.0,"{'learning_rate': 0.1, 'max_depth': 1, 'min_ch...",0.966663,351.242561
2,"XGBClassifier(base_score=None, booster=None, c...",3.0,"{'learning_rate': 0.1, 'max_depth': 3, 'min_ch...",0.973323,353.618905
3,"XGBClassifier(base_score=None, booster=None, c...",4.0,"{'learning_rate': 0.1, 'max_depth': 1, 'min_ch...",0.966663,354.249708
4,"XGBClassifier(base_score=None, booster=None, c...",5.0,"{'learning_rate': 0.1, 'max_depth': 5, 'min_ch...",0.979998,352.113035
5,"XGBClassifier(base_score=None, booster=None, c...",6.0,"{'learning_rate': 0.1, 'max_depth': 5, 'min_ch...",0.979998,355.586275
6,"XGBClassifier(base_score=None, booster=None, c...",7.0,"{'learning_rate': 0.1, 'max_depth': 3, 'min_ch...",0.973323,354.816921
7,"XGBClassifier(base_score=None, booster=None, c...",8.0,"{'learning_rate': 0.1, 'max_depth': 1, 'min_ch...",0.966663,350.436352
8,"XGBClassifier(base_score=None, booster=None, c...",9.0,"{'learning_rate': 1.3000000000000003, 'max_dep...",0.986661,348.372797
9,"XGBClassifier(base_score=None, booster=None, c...",10.0,"{'learning_rate': 0.1, 'max_depth': 3, 'min_ch...",1.0,344.347314


In [26]:
# def clasification(model,parameters,x,y):
#     for i in range(1,10):
#         cv_inner = KFold(n_splits=5, shuffle=True, random_state=i)
#         score = ['accuracy' ,'f1_macro']

#         grid_search = GridSearchCV(model, parameters, scoring=score, refit="accuracy", error_score=0)
#         grid_result = grid_search.fit(x,y)
#         print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#         params = grid_result.cv_results_['params']
#         f1_score = grid_result.cv_results_['mean_test_f1_macro']

#     knc_df=pd.DataFrame({"Parameters":[],
#                          "f1_score":[]})

#     for x,y in zip(params,f1_score):
#         knc_df=knc_df.append({"Parameters":x,
#                               "f1_score":y},ignore_index = True)
        
#     return knc_df
    

In [27]:
wine = openml.datasets.get_dataset("wine")
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    target= wine.default_target_attribute, dataset_format="dataframe"
)
wine_df["class"]=wine_label
wine_x=wine_df.iloc[:,:13]
wine_df

Unnamed: 0,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline,class
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3
