In [1]:
import pandas as pd
import openml
from sklearn.preprocessing import LabelEncoder #For encoding categorical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [2]:
# Fetch the Iris dataset from OpenML and load it as a pandas DataFrame, with the
# dataset's default target attribute split out as the label column.
iris = openml.datasets.get_dataset("iris")
iris_df, iris_label, categorical_indicator, attribute_names = iris.get_data(
    dataset_format="dataframe",
    target=iris.default_target_attribute,
)

# Re-attach the labels as a "class" column so the displayed frame shows them.
iris_df["class"] = iris_label

# Feature matrix: the first four columns (everything before "class").
iris_x = iris_df.iloc[:, :4]

iris_df

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Fetch the Wine dataset from OpenML and load it as a pandas DataFrame, with the
# dataset's default target attribute split out as the label column.
wine = openml.datasets.get_dataset("wine")
wine_df, wine_label, categorical_indicator, attribute_names = wine.get_data(
    dataset_format="dataframe",
    target=wine.default_target_attribute,
)

# Re-attach the labels as a "class" column so the displayed frame shows them.
wine_df["class"] = wine_label

# Feature matrix: the first thirteen columns (everything before "class").
wine_x = wine_df.iloc[:, :13]

wine_df

Unnamed: 0,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline,class
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [4]:
# Quick sanity check of the Iris frame: column dtypes and non-null counts.
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    float64 
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
 4   class        150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [5]:
# Integer-encode the class labels of both datasets.
# One encoder instance is shared and every fit_transform call refits it, so
# after this cell `le` holds the mapping learned from the wine labels only.
le = LabelEncoder()

iris_y = le.fit_transform(iris_label)
wine_y = le.fit_transform(wine_label)

In [6]:
# Rescale the feature matrices with MinMaxScaler (default settings).
# One scaler instance is shared and every fit_transform call refits it, so
# after this cell `scaler` holds the statistics of the wine features only.
# NOTE(review): the scaler is fitted on each full dataset before the
# cross-validated search, so validation folds influence the scaling; consider
# moving the scaler into a Pipeline passed to the grid search.
scaler = MinMaxScaler()

iris_x_scaled = scaler.fit_transform(iris_x)
wine_x_scaled = scaler.fit_transform(wine_x)

In [7]:
# Display names of the datasets; index 0 ("Iris") is used to tag result rows.
Dataset = ["Iris","Wine"]

In [25]:
def clasification(model,parameters,x,y):
    """Grid-search ``model`` over ``parameters`` and report the best configuration.

    The search uses 5-fold cross-validation (shuffled, ``random_state=1``) and
    scores each candidate with macro-averaged F1. Candidates whose fit raises
    an error are scored 0 (``error_score=0``) instead of aborting the search.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    parameters : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    x : array-like
        Feature matrix.
    y : array-like
        Target labels.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can inspect ``cv_results_``,
        ``best_params_`` and ``best_estimator_``. Previously nothing was
        returned and the results were lost after printing. In a notebook, end
        the call with ``;`` if the returned object's repr should not be shown.
    """
    cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)
    f1 = make_scorer(f1_score, average='macro')
    grid_search = GridSearchCV(model, parameters, cv=cv_inner, scoring=f1, error_score=0)
    grid_result = grid_search.fit(x, y)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result

In [26]:
# Base k-nearest-neighbours estimator; the grid below supplies the values of
# the hyper-parameters being tuned.
KNC = KNeighborsClassifier()

# Search space: odd neighbourhood sizes 1..19, both weighting schemes and
# three distance metrics.
# NOTE(review): with the estimator's default Minkowski power, 'minkowski' is
# presumably equivalent to 'euclidean', making a third of the grid redundant -
# confirm before trimming.
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
KNC_grid = {
    "n_neighbors": n_neighbors,
    "weights": weights,
    "metric": metric,
}

# Tune KNN on the scaled wine features.
clasification(KNC, KNC_grid, wine_x_scaled, wine_y)


Best: 0.979433 using {'metric': 'euclidean', 'n_neighbors': 17, 'weights': 'uniform'}


In [27]:
# Same KNN grid search, this time on the scaled Iris features.
clasification(KNC,KNC_grid,iris_x_scaled,iris_y)

Best: 0.971534 using {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}


In [None]:
# Run the KNN grid search on the scaled Iris data and keep the fitted search
# object in this cell, because the per-configuration scores are tabulated
# below. The settings are 5-fold shuffled CV (random_state=1), macro-averaged
# F1, and error_score=0.
cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)
f1 = make_scorer(f1_score, average='macro')
grid_result = GridSearchCV(
    KNC, KNC_grid, cv=cv_inner, scoring=f1, error_score=0
).fit(iris_x_scaled, iris_y)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Per-configuration cross-validation statistics.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

# Build the summary table in one step: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic anyway.
knc_df = pd.DataFrame({
    "Dataset": Dataset[0],          # scalar is broadcast to every row
    "Parameters": params,           # one parameter dict per configuration
    "mean_test_score": means,
    "std_test_score": stds,
})

knc_df