In [181]:
#import Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut

In [220]:
#get DataSet
fruits = pd.read_csv("fruit_data_with_colours.csv")

In [221]:
fruits.head(5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [222]:
print(min(fruits['width']))
Fruit_name =  dict(zip(fruits.fruit_label.unique(),fruits.fruit_name.unique() ))

5.8


In [223]:
print(type(Fruit_name))
print(Fruit_name)

<class 'dict'>
{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}


In [224]:
#get independent_variable (Features)
x = fruits[['mass','width','height']]

In [225]:
x.head(5)

Unnamed: 0,mass,width,height
0,192,8.4,7.3
1,180,8.0,6.8
2,176,7.4,7.2
3,86,6.2,4.7
4,84,6.0,4.6


In [126]:
#get dependent_variable (Response)
y = fruits['fruit_label']
y.head()

0    1
1    1
2    1
3    2
4    2
Name: fruit_label, dtype: int64

In [127]:
#Split data 
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42)

# KFold: before pre-processing

In [128]:
def get_score(x_train,x_test,y_train,y_test):
    knn = KNeighborsClassifier(n_neighbors=3)
    #train_data
    knn.fit(x_train,y_train)
    #test_data
    return knn.score(x_test,y_test)

In [129]:
mean_scores={}
for k in range(5,10):
    kf = KFold(n_splits=k,shuffle=True)
    scores=[]
    for train_index, test_index in kf.split(x):
        x_train,x_test,y_train,y_test=x.iloc[train_index],x.iloc[test_index]\
        ,y.iloc[train_index],y.iloc[test_index]
        scores.append(get_score(x_train,x_test,y_train,y_test))
    print(scores)
    
    mean_scores[k]=(np.min(scores)+np.max(scores))/2
max_k_score = max(mean_scores,key=mean_scores.get)
print('K with max score: ',max_k_score)
print('max score: ',mean_scores[max_k_score])

[0.8333333333333334, 0.5, 0.5833333333333334, 0.8333333333333334, 0.6363636363636364]
[0.5, 0.5, 0.6, 0.8, 0.8, 1.0]
[0.7777777777777778, 0.4444444444444444, 0.6666666666666666, 0.75, 0.75, 0.625, 0.75]
[0.625, 0.5, 0.625, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.2857142857142857, 0.8571428571428571]
[0.2857142857142857, 1.0, 0.5714285714285714, 0.42857142857142855, 0.7142857142857143, 1.0, 0.6666666666666666, 0.5, 0.6666666666666666]
K with max score:  6
max score:  0.75


# pre-processing data

In [226]:
cols = list(x.columns)
cols

['mass', 'width', 'height']

In [227]:
#calculate the mean
mean = x.mean(axis=0)
mean

mass      163.118644
width       7.105085
height      7.693220
dtype: float64

In [228]:
# calculate the variance
var_x = pd.DataFrame(columns=cols)
for col in cols:
    var_x[col]=(x[col]-mean[col])**2
var_x=var_x.mean()
var_x

mass      2975.765585
width        0.656076
height       1.820971
dtype: float64

In [229]:
# calculate std
std = np.sqrt(var_x)
std

mass      54.550578
width      0.809985
height     1.349434
dtype: float64

In [230]:
new_x = pd.DataFrame(columns=cols)
for col in cols:
    new_x[col]=(x[col]-mean[col])/std[col]
new_x.head()

Unnamed: 0,mass,width,height
0,0.529442,1.59869,-0.291397
1,0.309462,1.104854,-0.661922
2,0.236136,0.3641,-0.365502
3,-1.413709,-1.117409,-2.218131
4,-1.450372,-1.364327,-2.292236


# Kfold: after pre-processing

In [248]:
mean_scores={}
for k in range(5,10):
    kf = KFold(n_splits=k,shuffle=True)
    scores=[]
    for train_index, test_index in kf.split(new_x):
        x_train,x_test,y_train,y_test=new_x.iloc[train_index],new_x.iloc[test_index]\
        ,y.iloc[train_index],y.iloc[test_index]
        scores.append(get_score(x_train,x_test,y_train,y_test))
    print(scores)
    
    mean_scores[k]=(np.min(scores)+np.max(scores))/2
max_k_score = max(mean_scores,key=mean_scores.get)
print('K with max score: ',max_k_score)
print('max score: ',mean_scores[max_k_score])

[0.9166666666666666, 0.75, 0.9166666666666666, 0.8333333333333334, 0.9090909090909091]
[0.8, 1.0, 0.9, 0.8, 0.8, 0.8888888888888888]
[0.7777777777777778, 1.0, 1.0, 0.75, 0.875, 0.75, 0.75]
[0.875, 0.875, 0.75, 0.5714285714285714, 0.5714285714285714, 1.0, 0.7142857142857143, 0.8571428571428571]
[1.0, 0.7142857142857143, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 1.0, 0.8333333333333334, 0.8333333333333334, 0.6666666666666666]
K with max score:  6
max score:  0.9
