In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Read the tab-separated fruit measurements into a DataFrame and preview the first rows

dataset = pd.read_csv(
    filepath_or_buffer='fruit_data_with_colors.txt',
    delimiter='\t',
)
dataset.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [14]:
# Create a dictionary mapping each numeric fruit label to its fruit name.
#
# The pairs are taken row-wise from the data, so a label is always matched with
# the name that appears on the same row. Zipping two independent `.unique()`
# arrays only lines up when label and name happen to be in one-to-one
# correspondence; otherwise zip would silently misalign or truncate.

label_name_pairs = dataset[['fruit_label', 'fruit_name']].drop_duplicates()

# Each label must identify exactly one fruit name for the lookup to be meaningful.
assert label_name_pairs['fruit_label'].is_unique, "a fruit_label maps to more than one fruit_name"

dict_fruits = dict(zip(label_name_pairs['fruit_label'], label_name_pairs['fruit_name']))
dict_fruits

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

Unnamed: 0,fruit_label,mass,width,height,color_score
count,59.0,59.0,59.0,59.0,59.0
mean,2.542373,163.118644,7.105085,7.69322,0.762881
std,1.208048,55.018832,0.816938,1.361017,0.076857
min,1.0,76.0,5.8,4.0,0.55
25%,1.0,140.0,6.6,7.2,0.72
50%,3.0,158.0,7.2,7.6,0.75
75%,4.0,177.0,7.5,8.2,0.81
max,4.0,362.0,9.6,10.5,0.93


In [None]:
# Pairwise scatter plots of the dataset's columns, with points coloured by fruit name
sns.pairplot(dataset, hue='fruit_name')

In [4]:
# Create the training and testing sets

from sklearn.model_selection import train_test_split

# Select the features and the target by column name instead of by position
# (these are the same columns as iloc[:, 3:6] and iloc[:, 0]), so the selection
# cannot silently shift if the column layout of the input file changes.
X = dataset[['mass', 'width', 'height']]
y = dataset['fruit_label']

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and every score computed from it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
# Check which container type holds the feature matrix
type(X)

pandas.core.frame.DataFrame

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Build an (as yet unfitted) k-nearest-neighbours classifier that uses 5 neighbours
knn = KNeighborsClassifier(n_neighbors=5)

In [7]:
# Fit the classifier on the training split
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [8]:
# Score the fitted classifier on the held-out test split
# (for scikit-learn classifiers, score() is the mean accuracy)
knn.score(X_test, y_test)

0.6666666666666666

In [9]:
# Check which container type train_test_split returned for the training features
type(X_train)

pandas.core.frame.DataFrame

In [None]:
# Plotting the decision boundaries of the K-NN Classifier

import graphviz  # NOTE(review): not referenced in this cell — confirm whether it is still needed
from adspy_shared_utilities import plot_fruit_knn

# NOTE(review): the positional arguments are presumably
# (features, labels, number of neighbours, weighting scheme) — confirm against the helper
plot_fruit_knn(X_train, y_train, 5, 'uniform')

In [None]:
# Plotting the test accuracy with different number of K neighbors
# (KNeighborsClassifier.score returns mean classification accuracy, not R^2)

k_values = range(1, 30)

# Fit a throwaway classifier for each K so that the `knn` model fitted in an
# earlier cell is not silently replaced by the last model of this sweep.
score = [
    KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    for k in k_values
]

plt.figure(figsize=(12,8))
plt.plot(k_values, score, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.xlabel('Number of K Neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Plotting accuracy vs. Size of Training set

t = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]   # training-set proportions to evaluate
n_repeats = 1000                           # random splits averaged per proportion
knn = KNeighborsClassifier(n_neighbors=5)

accuracy = []   # mean test accuracy, one entry per proportion in t
for s in t:
    # Start a fresh list for every proportion; accumulating across proportions
    # would make each plotted point a running mean over all earlier sizes.
    scores = []
    for seed in range(n_repeats):
        # Seed each split so the curve is reproducible, and use cell-local
        # names so the X_train/X_test/y_train/y_test split created earlier in
        # the notebook is left untouched.
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=1-s, random_state=seed)
        knn.fit(X_tr, y_tr)
        scores.append(knn.score(X_te, y_te))
    accuracy.append(np.mean(scores))

# Draw all points in a single call so the dashed line connecting them is rendered.
plt.plot(t, accuracy, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.xlabel('Training Set Proportion %')
plt.ylabel('Accuracy')
plt.show()