In [1]:
import pandas as pd
import numpy as np
import openml
from sklearn.preprocessing import MinMaxScaler,LabelEncoder 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.metrics import f1_score, make_scorer
import time
from xgboost import XGBClassifier

# Classification for Iris Dataset

In [2]:
# Fetch the "iris" dataset from OpenML and unpack it into a feature frame
# plus the target column named by the dataset's default target attribute.
iris = openml.datasets.get_dataset("iris")
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    dataset_format="dataframe",
    target=iris.default_target_attribute,
)

# Re-attach the target as a trailing "class" column so the frame can be
# inspected as a whole.
iris_df["class"] = iris_label

# The first four columns are the measurements; "class" (added last) is excluded.
iris_x = iris_df.iloc[:, 0:4]

iris_df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Summarise column dtypes and non-null counts of the assembled frame.
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    float64 
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
 4   class        150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [4]:
# Count how many rows belong to each class label.
iris_df["class"].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [5]:
# Encode the class labels in `iris_label` as integer codes; `le` keeps the
# fitted encoder so codes can be mapped back to label names later.
le=LabelEncoder()

iris_y=le.fit_transform(iris_label)

In [6]:
# Rescale the feature columns with MinMaxScaler, fitted on the full feature table.
# NOTE(review): the scaler is fitted before any cross-validation split, so
# validation folds contribute to the scaling statistics — confirm this is
# acceptable or move the scaler into a Pipeline.
scaler = MinMaxScaler()

iris_x_scaled=scaler.fit_transform(iris_x)

In [7]:
def clasification(model,parameters,x,y):
    """Run a grid search for `model` ten times with differently seeded folds.

    For every seed ``i`` in 1..10 a shuffled ``KFold(n_splits=5,
    random_state=i)`` drives a ``GridSearchCV`` over `parameters`, scored on
    accuracy and macro F1 and refit on accuracy. The refitted best estimator
    then predicts `x`, and the weighted F1 of those predictions against `y`
    is recorded together with the best parameters and the elapsed time of
    the iteration.

    NOTE(review): "f1_Score" is computed on the same `x`/`y` the best
    estimator was refit on, so it is a resubstitution score, not a held-out
    estimate. Kept as-is to preserve the reported numbers.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid handed to ``GridSearchCV``.
    x, y : array-like
        Features and labels used for both the search and the final scoring.

    Returns
    -------
    pandas.DataFrame
        One row per seed with the columns "Model", "Random State",
        "Best Parameters", "f1_Score" and "Execution Time".

    Side effects
    ------------
    When the estimator's class is one of the known classifiers, the table is
    also written to ``Iris_<ABBR>.csv`` in the working directory.
    """
    # The CSV name is derived from the estimator's class instead of comparing
    # against notebook globals (knc, lda, ...): those comparisons raised
    # NameError whenever an earlier model cell had not been executed.
    csv_suffix_by_class = {
        "KNeighborsClassifier": "KNC",
        "LinearDiscriminantAnalysis": "LDA",
        "GaussianNB": "GNB",
        "SVC": "SVC",
        "LogisticRegression": "LR",
        "RandomForestClassifier": "RFC",
        "AdaBoostClassifier": "ABC",
        "GradientBoostingClassifier": "GBC",
        "XGBClassifier": "XGBC",
    }
    columns = ["Model", "Random State", "Best Parameters", "f1_Score", "Execution Time"]
    score = ['accuracy', 'f1_macro']  # loop-invariant, so built once

    # Rows are collected in a plain list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for i in range(1, 11):
        start_time = time.perf_counter()
        cv_inner = KFold(n_splits=5, shuffle=True, random_state=i)
        grid_search = GridSearchCV(model, parameters, cv=cv_inner, scoring=score,
                                   refit='accuracy', n_jobs=-1)
        grid_result = grid_search.fit(x, y)
        y_predict = grid_result.predict(x)
        f1score = f1_score(y, y_predict, average="weighted")
        execution_time = time.perf_counter() - start_time
        rows.append({"Model": model,
                     "Random State": i,
                     "Best Parameters": grid_result.best_params_,
                     "f1_Score": f1score,
                     "Execution Time": execution_time})

    # Passing `columns` keeps the column order and names even if `rows` were empty.
    classification_df = pd.DataFrame(rows, columns=columns)

    csv_suffix = csv_suffix_by_class.get(type(model).__name__)
    if csv_suffix is not None:
        classification_df.to_csv("Iris_" + csv_suffix + ".csv")

    return classification_df
    

## K-nearest neighbour classification

In [8]:
# k-nearest-neighbours estimator with library defaults; tuned values come from the grid.
knc = KNeighborsClassifier()

# Search space: neighbourhood sizes 1..20, two vote weightings, three metrics.
# NOTE(review): 'minkowski' with the estimator's default p appears to
# coincide with 'euclidean' — confirm whether the duplicate entry is intended.
knc_grid = dict(
    n_neighbors=range(1, 21),
    weights=["uniform", "distance"],
    metric=["manhattan", "euclidean", "minkowski"],
)

In [9]:
# Repeated grid search for k-NN on the scaled Iris features; the returned table is displayed.
clasification(knc,knc_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,KNeighborsClassifier(),1.0,"{'metric': 'euclidean', 'n_neighbors': 9, 'wei...",0.973323,7.239606
1,KNeighborsClassifier(),2.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,1.281572
2,KNeighborsClassifier(),3.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,1.879474
3,KNeighborsClassifier(),4.0,"{'metric': 'euclidean', 'n_neighbors': 14, 'we...",0.966637,1.179172
4,KNeighborsClassifier(),5.0,"{'metric': 'euclidean', 'n_neighbors': 14, 'we...",0.966637,1.284752
5,KNeighborsClassifier(),6.0,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.959984,1.306829
6,KNeighborsClassifier(),7.0,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",0.973291,1.41245
7,KNeighborsClassifier(),8.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,1.440559
8,KNeighborsClassifier(),9.0,"{'metric': 'manhattan', 'n_neighbors': 18, 'we...",0.953329,1.098416
9,KNeighborsClassifier(),10.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966637,0.918267


## Linear discriminant analysis 

In [10]:
# Linear discriminant analysis estimator with library defaults.
lda = LinearDiscriminantAnalysis()

# Search space: one or two discriminant components, each with all three solvers.
# NOTE(review): presumably n_components only affects transform(), not
# predict() — verify that tuning it changes the scores at all.
lda_grid = dict(
    n_components=[1, 2],
    solver=["lsqr", "eigen", "svd"],
)

In [11]:
# Repeated grid search for LDA on the scaled Iris features; the returned table is displayed.
clasification(lda,lda_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,LinearDiscriminantAnalysis(),1.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.084542
1,LinearDiscriminantAnalysis(),2.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.10215
2,LinearDiscriminantAnalysis(),3.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.08278
3,LinearDiscriminantAnalysis(),4.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.066821
4,LinearDiscriminantAnalysis(),5.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.064826
5,LinearDiscriminantAnalysis(),6.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.062486
6,LinearDiscriminantAnalysis(),7.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.062488
7,LinearDiscriminantAnalysis(),8.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.062483
8,LinearDiscriminantAnalysis(),9.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.062486
9,LinearDiscriminantAnalysis(),10.0,"{'n_components': 1, 'solver': 'lsqr'}",0.979998,0.046867


## Naïve Bayes classifier

In [12]:
# Gaussian naive Bayes estimator with library defaults.
gnb = GaussianNB()

# Search space: 100 log-spaced smoothing values from 1 down to 1e-9.
gnb_grid = dict(var_smoothing=np.logspace(0, -9, num=100))

In [13]:
# Repeated grid search for Gaussian naive Bayes on the scaled Iris features; the returned table is displayed.
clasification(gnb,gnb_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,GaussianNB(),1.0,{'var_smoothing': 0.04328761281083057},0.953329,0.574519
1,GaussianNB(),2.0,{'var_smoothing': 0.0657933224657568},0.953329,0.546748
2,GaussianNB(),3.0,{'var_smoothing': 0.02848035868435802},0.953329,0.671715
3,GaussianNB(),4.0,{'var_smoothing': 0.2848035868435802},0.953329,0.574909
4,GaussianNB(),5.0,{'var_smoothing': 0.2848035868435802},0.953329,0.577989
5,GaussianNB(),6.0,{'var_smoothing': 0.23101297000831597},0.953329,0.542749
6,GaussianNB(),7.0,{'var_smoothing': 0.15199110829529336},0.953329,0.586518
7,GaussianNB(),8.0,{'var_smoothing': 0.1873817422860384},0.953329,0.520804
8,GaussianNB(),9.0,{'var_smoothing': 0.1873817422860384},0.953329,0.546748
9,GaussianNB(),10.0,{'var_smoothing': 0.1873817422860384},0.953329,0.546746


## Support vector machine

In [14]:
# Support vector classifier with library defaults.
svc = SVC()

# Search space: regularisation strength, kernel family and kernel coefficient.
svc_grid = dict(
    C=[0.01, 0.1, 1, 10, 100, 1000],
    kernel=["linear", "poly", "rbf", "sigmoid"],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
)


In [15]:
# Repeated grid search for the SVC on the scaled Iris features; the returned table is displayed.
clasification(svc,svc_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,SVC(),1.0,"{'C': 10, 'gamma': 1, 'kernel': 'rbf'}",0.966583,0.772361
1,SVC(),2.0,"{'C': 1000, 'gamma': 1, 'kernel': 'linear'}",0.979998,0.795285
2,SVC(),3.0,"{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}",0.966583,0.883401
3,SVC(),4.0,"{'C': 100, 'gamma': 0.1, 'kernel': 'sigmoid'}",0.973323,0.850518
4,SVC(),5.0,"{'C': 100, 'gamma': 1, 'kernel': 'poly'}",0.979998,0.751438
5,SVC(),6.0,"{'C': 10, 'gamma': 1, 'kernel': 'linear'}",0.973323,0.923613
6,SVC(),7.0,"{'C': 10, 'gamma': 1, 'kernel': 'linear'}",0.973323,0.852626
7,SVC(),8.0,"{'C': 100, 'gamma': 1, 'kernel': 'linear'}",0.973323,0.992487
8,SVC(),9.0,"{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}",0.973323,1.009298
9,SVC(),10.0,"{'C': 1000, 'gamma': 1, 'kernel': 'linear'}",0.979998,0.980828


## Logistic regression

In [16]:
# Logistic regression estimator with library defaults.
lr = LogisticRegression()

# Search space: three solvers, L2 penalty only, five inverse-regularisation strengths.
lr_grid = dict(
    solver=["newton-cg", "lbfgs", "liblinear"],
    penalty=["l2"],
    C=[100, 10, 1.0, 0.1, 0.01],
)

In [17]:
# Repeated grid search for logistic regression on the scaled Iris features; the returned table is displayed.
clasification(lr,lr_grid,iris_x_scaled,iris_y)

Unnamed: 0,Model,Random State,Best Parameters,f1_Score,Execution Time
0,LogisticRegression(),1.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.355995
1,LogisticRegression(),2.0,"{'C': 100, 'penalty': 'l2', 'solver': 'libline...",0.96,0.437394
2,LogisticRegression(),3.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.380656
3,LogisticRegression(),4.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.536361
4,LogisticRegression(),5.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.862696
5,LogisticRegression(),6.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.538051
6,LogisticRegression(),7.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.437398
7,LogisticRegression(),8.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.539035
8,LogisticRegression(),9.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.499882
9,LogisticRegression(),10.0,"{'C': 100, 'penalty': 'l2', 'solver': 'newton-...",0.979998,0.577987


## Random forests

In [18]:
# Random forest estimator with library defaults.
# NOTE(review): no random_state is set, so repeated runs may not reproduce — confirm intent.
rfc = RandomForestClassifier()

# Search space: split criterion, tree depth and forest size. n_jobs is pinned
# to -1 through the grid so every candidate forest is built in parallel.
# NOTE(review): the grid search itself also runs with n_jobs=-1; verify the
# nested parallelism does not oversubscribe the machine.
rfc_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=[4, 6, 8],
    n_estimators=[100, 200, 300, 400, 500],
    n_jobs=[-1],
)

In [None]:
# Repeated grid search for the random forest on the scaled Iris features; the returned table is displayed.
clasification(rfc,rfc_grid,iris_x_scaled,iris_y)

## AdaBoost

In [None]:
# AdaBoost estimator with library defaults.
# NOTE(review): no random_state is set, so repeated runs may not reproduce — confirm intent.
abc = AdaBoostClassifier()

# Search space: ensemble size and learning rates from 0.1 in steps of 0.4 (below 2.1).
abc_grid = dict(
    n_estimators=[10, 50, 100, 500, 1000],
    learning_rate=np.arange(0.1, 2.1, 0.4),
)

In [None]:
# Repeated grid search for AdaBoost on the scaled Iris features; the returned table is displayed.
clasification(abc,abc_grid,iris_x_scaled,iris_y)

## Gradient boost

In [None]:
# Gradient boosting estimator with library defaults.
# NOTE(review): no random_state is set, so repeated runs may not reproduce — confirm intent.
gbc = GradientBoostingClassifier()

# Search space: ensemble size, tree depth and learning rates from 0.1 in
# steps of 0.4 (below 2.1).
gbc_grid = dict(
    n_estimators=[10, 50, 100, 500, 1000],
    max_depth=[1, 3, 5, 7, 9],
    learning_rate=np.arange(0.1, 2.1, 0.4),
)

In [None]:
# Repeated grid search for gradient boosting on the scaled Iris features; the returned table is displayed.
clasification(gbc,gbc_grid,iris_x_scaled,iris_y)

## XGBoost

In [None]:
# XGBoost classifier with library defaults.
xgbc = XGBClassifier()

# Search space: odd tree depths 1..9, child weights 1/3/5, learning rates
# from 0.1 in steps of 0.4 (below 2.1) and five ensemble sizes.
# NOTE(review): learning rates above 1 are included — confirm they are
# meaningful for this estimator.
xgbc_grid = dict(
    max_depth=range(1, 10, 2),
    min_child_weight=range(1, 6, 2),
    learning_rate=np.arange(0.1, 2.1, 0.4),
    n_estimators=[10, 50, 100, 500, 1000],
)

In [None]:
# Repeated grid search for XGBoost on the scaled Iris features; the returned table is displayed.
clasification(xgbc,xgbc_grid,iris_x_scaled,iris_y)

In [None]:
# def clasification(model,parameters,x,y):
#     for i in range(1,10):
#         cv_inner = KFold(n_splits=5, shuffle=True, random_state=i)
#         score = ['accuracy' ,'f1_macro']

#         grid_search = GridSearchCV(model, parameters, scoring=score, refit="accuracy", error_score=0)
#         grid_result = grid_search.fit(x,y)
#         print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#         params = grid_result.cv_results_['params']
#         f1_score = grid_result.cv_results_['mean_test_f1_macro']

#     knc_df=pd.DataFrame({"Parameters":[],
#                          "f1_score":[]})

#     for x,y in zip(params,f1_score):
#         knc_df=knc_df.append({"Parameters":x,
#                               "f1_score":y},ignore_index = True)
        
#     return knc_df
    

In [None]:
# Fetch the "wine" dataset from OpenML and unpack it into a feature frame
# plus the target column named by the dataset's default target attribute.
wine = openml.datasets.get_dataset("wine")
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    dataset_format="dataframe",
    target=wine.default_target_attribute,
)

# Re-attach the target as a trailing "class" column so the frame can be
# inspected as a whole.
wine_df["class"] = wine_label

# Keep the first thirteen columns as the feature table.
wine_x = wine_df.iloc[:, 0:13]

wine_df