In [1]:
import pandas as pd
import numpy as np
import openml
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.metrics import f1_score, make_scorer
import time

# Classification for Iris Dataset

In [2]:
iris = openml.datasets.get_dataset("iris")
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    target=iris.default_target_attribute, dataset_format="dataframe"
)
iris_df["class"]=iris_label
iris_x=iris_df.iloc[:,:4]
iris_df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    float64 
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
 4   class        150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [4]:
iris_df["class"].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [5]:
le=LabelEncoder()

iris_y=le.fit_transform(iris_label)

In [6]:
scaler = MinMaxScaler()

iris_x_scaled=scaler.fit_transform(iris_x)

In [7]:
def clasification(model,parameters,x,y):
    
    classification_df=pd.DataFrame({"Model":[],
                                    "Random State":[],
                                    "Best Parameters":[],
                                    "Best Score":[],
                                    "Execution Time":[],
                                    "f1 score":[]})
    for i in range(1,11):
        start_time = time.time()
        cv_inner = KFold(n_splits=5, shuffle=True, random_state=i)
        score = ['accuracy' ,'f1_macro']
        grid_search = GridSearchCV(model, parameters, cv=cv_inner, scoring=score, refit='accuracy', n_jobs = -1)
        grid_result = grid_search.fit(x,y)
        f1_score = grid_result.cv_results_['mean_test_f1_macro']
        execution_time = time.time() - start_time
        print("Best: %f using %s and Execution time is : %f"% (grid_result.best_score_, grid_result.best_params_,execution_time))
        classification_df=classification_df.append({"Model":model,
                                                    "Random State":i,
                                                    "Best Parameters":grid_result.best_params_,
                                                    "Best Score":grid_result.best_score_,
                                                    "Execution Time":execution_time,
                                                    "f1 score":f1_score},ignore_index=True)
         
       
    if model == KNC:
        classification_df.to_csv("Iris_KNC.csv")
    elif model == LDA:
        classification_df.to_csv("Iris_LDA.csv")
    elif model == GNB:
        classification_df.to_csv("Iris_GNB.csv")
    elif model == s_v_c:
        classification_df.to_csv("Iris_SVC.csv")
    elif model == LR:
        classification_df.to_csv("Iris_LR.csv")
    elif model == RF:
        classification_df.to_csv("Iris_RF.csv")


    return classification_df
    

## K-nearest neighbour classification

In [10]:
KNC = KNeighborsClassifier()

KNC_grid = {"n_neighbors":range(1, 21),
           "weights":['uniform', 'distance'],
           "metric":['manhattan','euclidean', 'minkowski']}

In [11]:
clasification(KNC,KNC_grid,iris_x_scaled,iris_y)

Best: 0.973333 using {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'} and Execution time is : 0.962328
Best: 0.966667 using {'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'} and Execution time is : 0.998794
Best: 0.966667 using {'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'} and Execution time is : 0.952480
Best: 0.973333 using {'metric': 'euclidean', 'n_neighbors': 14, 'weights': 'uniform'} and Execution time is : 0.959081
Best: 0.973333 using {'metric': 'euclidean', 'n_neighbors': 14, 'weights': 'uniform'} and Execution time is : 0.964186
Best: 0.966667 using {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'} and Execution time is : 0.887399
Best: 0.966667 using {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'uniform'} and Execution time is : 0.978752
Best: 0.966667 using {'metric': 'euclidean', 'n_neighbors': 6, 'weights': 'uniform'} and Execution time is : 0.942374
Best: 0.966667 using {'metric': 'manhattan', 'n_neighb

Unnamed: 0,Model,Random State,Best Parameters,Best Score,Execution Time,f1 score
0,KNeighborsClassifier(),1.0,"{'metric': 'euclidean', 'n_neighbors': 9, 'wei...",0.973333,0.962328,"[0.9221037336254728, 0.9221037336254728, 0.928..."
1,KNeighborsClassifier(),2.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966667,0.998794,"[0.9401705761969593, 0.9401705761969593, 0.931..."
2,KNeighborsClassifier(),3.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966667,0.95248,"[0.9384541910331384, 0.9384541910331384, 0.938..."
3,KNeighborsClassifier(),4.0,"{'metric': 'euclidean', 'n_neighbors': 14, 'we...",0.973333,0.959081,"[0.9308426681677456, 0.9308426681677456, 0.942..."
4,KNeighborsClassifier(),5.0,"{'metric': 'euclidean', 'n_neighbors': 14, 'we...",0.973333,0.964186,"[0.9471420500151151, 0.9471420500151151, 0.960..."
5,KNeighborsClassifier(),6.0,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.966667,0.887399,"[0.9403688419685189, 0.9403688419685189, 0.945..."
6,KNeighborsClassifier(),7.0,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",0.966667,0.978752,"[0.9416315383514199, 0.9416315383514199, 0.955..."
7,KNeighborsClassifier(),8.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.966667,0.942374,"[0.9409770067505967, 0.9409770067505967, 0.953..."
8,KNeighborsClassifier(),9.0,"{'metric': 'manhattan', 'n_neighbors': 18, 'we...",0.966667,1.108416,"[0.9388938790677919, 0.9388938790677919, 0.952..."
9,KNeighborsClassifier(),10.0,"{'metric': 'euclidean', 'n_neighbors': 6, 'wei...",0.96,1.018761,"[0.9305542939583861, 0.9305542939583861, 0.941..."


## Linear discriminant analysis 

In [None]:
LDA = LinearDiscriminantAnalysis()

LDA_grid = {"n_components": list(range(1,3,1)),
           "solver":['lsqr','eigen','svd']}

In [None]:
clasification(LDA,LDA_grid,iris_x_scaled,iris_y)

## Naïve Bayes classifier

In [None]:
GNB = GaussianNB()

GNB_grid={"var_smoothing":np.logspace(0,-9, num=100)}

In [None]:
clasification(GNB,GNB_grid,iris_x_scaled,iris_y)

## Support vector machine

In [None]:
s_v_c=SVC()

SVC_grid={"C":[0.01, 0.1, 1, 10, 100, 1000],
          "kernel":["linear","poly","rbf","sigmoid"],
          "gamma":[1, 0.1, 0.01, 0.001, 0.0001]}


In [None]:
clasification(s_v_c,SVC_grid,iris_x_scaled,iris_y)

## Logistic regression

In [None]:
LR=LogisticRegression()

LR_grid = {"solver": ['newton-cg', 'lbfgs', 'liblinear'],
           "penalty": ['l2'],
           "C":[100, 10, 1.0, 0.1, 0.01]}

In [None]:
clasification(LR,LR_grid,iris_x_scaled,iris_y)

## Random forests

In [16]:
RF=RandomForestClassifier()

RF_grid={'criterion' :['gini', 'entropy'],
         'max_depth' : [4,6,8],
         'n_estimators': [200,500]}

In [17]:
clasification(RF,RF_grid,iris_x_scaled,iris_y)

Best: 0.946667 using {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 200} and Execution time is : 17.017982
Best: 0.960000 using {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 200} and Execution time is : 15.839628
Best: 0.960000 using {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 500} and Execution time is : 15.313038
Best: 0.966667 using {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 200} and Execution time is : 14.468299
Best: 0.960000 using {'criterion': 'gini', 'max_depth': 8, 'n_estimators': 200} and Execution time is : 14.436383
Best: 0.946667 using {'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 200} and Execution time is : 14.054398
Best: 0.966667 using {'criterion': 'entropy', 'max_depth': 4, 'n_estimators': 200} and Execution time is : 13.738299
Best: 0.960000 using {'criterion': 'gini', 'max_depth': 8, 'n_estimators': 500} and Execution time is : 14.495225
Best: 0.960000 using {'criterion': 'gini', 'max_depth': 4, 'n_estimators': 200} an

NameError: name 'LDA' is not defined

In [None]:
'max_features': ['auto', 'sqrt', 'log2'],

In [None]:
# def clasification(model,parameters,x,y):
#     for i in range(1,10):
#         cv_inner = KFold(n_splits=5, shuffle=True, random_state=i)
#         score = ['accuracy' ,'f1_macro']

#         grid_search = GridSearchCV(model, parameters, scoring=score, refit="accuracy", error_score=0)
#         grid_result = grid_search.fit(x,y)
#         print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#         params = grid_result.cv_results_['params']
#         f1_score = grid_result.cv_results_['mean_test_f1_macro']

#     knc_df=pd.DataFrame({"Parameters":[],
#                          "f1_score":[]})

#     for x,y in zip(params,f1_score):
#         knc_df=knc_df.append({"Parameters":x,
#                               "f1_score":y},ignore_index = True)
        
#     return knc_df
    

In [None]:
KNC = KNeighborsClassifier(n_jobs=-1)

n_neighbors = range(1, 21)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
KNC_grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)



In [None]:
def clasification(model,parameters,x,y):
    cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)
    score = ['accuracy' ,'f1_macro']

    grid_search = GridSearchCV(model, parameters, scoring=score, refit="accuracy", error_score=0)
    grid_result = grid_search.fit(x,y)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    params = grid_result.cv_results_['params']
    f1_score = grid_result.cv_results_['mean_test_f1_macro']

    knc_df=pd.DataFrame({"Parameters":[],
                         "f1_score":[]})

    for x,y in zip(params,f1_score):
        knc_df=knc_df.append({"Parameters":x,
                              "f1_score":y},ignore_index = True)
        
    return knc_df

In [None]:
clasification(LDA,LDA_grid,iris_x_scaled,iris_y)

In [None]:
model = GaussianNB()

priors=range(1,5)
var_smoothing= np.logspace(0,-9, num=100)
parameters=dict(var_smoothing=var_smoothing)

cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)
score = ['accuracy' ,'f1_macro']

grid_search = GridSearchCV(model, parameters, scoring=score, refit="accuracy", error_score=0)
grid_result = grid_search.fit(iris_x_scaled,iris_y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

params = grid_result.cv_results_['params']
f1_score = grid_result.cv_results_['mean_test_f1_macro']

knc_df=pd.DataFrame({"Parameters":[],
                         "f1_score":[]})

for x,y in zip(params,f1_score):
    knc_df=knc_df.append({"Parameters":x,
                          "f1_score":y},ignore_index = True)

knc_df


In [None]:
wine = openml.datasets.get_dataset("wine")
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    target= wine.default_target_attribute, dataset_format="dataframe"
)
wine_df["class"]=wine_label
wine_x=wine_df.iloc[:,:13]
wine_df