In [0]:
import numpy as np
import sklearn 
import pandas as pd
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

np.random.seed(10)  # seed the global NumPy RNG so the synthetic data is repeatable

N_SAMPLES_PER_CLASS = 200  # number of synthetic readings generated for each class
mu, sigma = 30, 5.3  # mean / std of the first sensor column for the unripe class

# One (mean, std) pair per sensor column. The column order is the one later
# labelled 'MQ-135', 'MQ-3', 'MQ-2' when the DataFrame is built.
UNRIPE_SENSOR_PARAMS = [(mu, sigma), (57.3, 3.06), (23.67, 3.79)]
RIPE_SENSOR_PARAMS = [(35.61, 5.53), (245.29, 51.38), (40, 5.81)]
ROTTEN_SENSOR_PARAMS = [(38.7, 5.26), (344.46, 58.64), (48.4, 6.24)]


def generate_class_readings(sensor_params, label, n_samples=N_SAMPLES_PER_CLASS):
  """Draw normally distributed synthetic sensor readings for one class.

  Parameters
  ----------
  sensor_params : sequence of (mean, std) pairs
      One pair per sensor; readings for each sensor are drawn, in order,
      from the global NumPy RNG with ``np.random.normal``.
  label : int
      Class label stored in the final row of the result.
  n_samples : int, optional
      Number of readings to draw per sensor.

  Returns
  -------
  numpy.ndarray
      Array of shape ``(len(sensor_params) + 1, n_samples)``: one row per
      sensor followed by a row filled with ``label``.
  """
  rows = [np.random.normal(mean, std, n_samples) for mean, std in sensor_params]
  rows.append(np.full((n_samples,), label, dtype=int))
  return np.array(rows)


# NOTE: the three classes must be generated in this order (unripe, ripe, rotten);
# they all consume the same seeded global RNG, so reordering changes the data.

#generate unripe data (class 0)
unripe_data = generate_class_readings(UNRIPE_SENSOR_PARAMS, 0)
unripe_transposed = unripe_data.transpose()  # one row per sample

#generate ripe data (class 1)
ripe_data = generate_class_readings(RIPE_SENSOR_PARAMS, 1)
ripe_transposed = ripe_data.transpose()

#generate rotten data (class 2)
rotten_data = generate_class_readings(ROTTEN_SENSOR_PARAMS, 2)
rotten_transposed = rotten_data.transpose()

#merge data: stack the per-class sample blocks on top of each other
total_data = np.vstack((unripe_transposed, ripe_transposed, rotten_transposed))

# Explicit sanity check (replaces mid-cell expressions whose values were never shown):
# three sensor columns plus one label column, N_SAMPLES_PER_CLASS rows per class.
assert total_data.shape == (3 * N_SAMPLES_PER_CLASS, 4)
total_data.shape

(600, 4)

In [0]:
# Display the merged array; rows are still grouped by class at this point
# (all label-0 rows first, label-2 rows last), as it has not been shuffled yet.
total_data

array([[ 37.05740847,  57.70740074,  25.42244124,   0.        ],
       [ 33.79097856,  60.98039625,  19.04674753,   0.        ],
       [ 21.80937845,  54.16425592,  24.39985106,   0.        ],
       ...,
       [ 42.73420602, 269.96875715,  49.88749687,   2.        ],
       [ 40.60310314, 322.95280019,  48.47711512,   2.        ],
       [ 41.87325782, 260.52469161,  44.99097768,   2.        ]])

In [0]:
# Shuffle the rows of the dataset. A dedicated RandomState with a fixed seed is
# used, so the resulting row order is reproducible and does not depend on how
# much of the global NumPy random stream has been consumed so far.
row_shuffler = np.random.RandomState(seed=42)
total_data_mixed = row_shuffler.permutation(total_data)
total_data_mixed

array([[ 31.55258584,  65.50052554,  22.44824236,   0.        ],
       [ 35.66513613, 313.19473102,  54.71417327,   2.        ],
       [ 38.27772415, 285.60861945,  41.49400562,   2.        ],
       ...,
       [ 28.4811404 , 190.9043477 ,  42.79870512,   1.        ],
       [ 39.25830545, 434.85551737,  43.10904701,   2.        ],
       [ 25.1085809 ,  56.56573373,  27.16180443,   0.        ]])

In [0]:
# Wrap the shuffled array in a labelled DataFrame:
# three sensor-reading columns followed by the class label column.
column_names = ['MQ-135', 'MQ-3', 'MQ-2', 'Class']
df = pd.DataFrame(data=total_data_mixed, columns=column_names)
df.head(n=10)  # preview the first 10 rows of the dataframe

Unnamed: 0,MQ-135,MQ-3,MQ-2,Class
0,31.552586,65.500526,22.448242,0.0
1,35.665136,313.194731,54.714173,2.0
2,38.277724,285.608619,41.494006,2.0
3,33.289184,54.331195,20.797826,0.0
4,26.401582,59.86147,16.976235,0.0
5,42.643168,269.412715,36.679642,1.0
6,32.295039,56.071493,23.385944,0.0
7,48.31323,436.418057,61.506378,2.0
8,24.460798,61.024576,27.338001,0.0
9,30.43442,270.577579,46.882263,1.0


In [0]:
# The last column of the dataframe ('Class') is the target column holding the labels.
# Labels must not be part of the feature data X, so that column is excluded
# from the inputs given to the KNN classifier.
feature_data = df[['MQ-135', 'MQ-3', 'MQ-2']]
target_data = df[['Class']]

# Flatten the single-column label frame into a 1-D array of labels
target_data_converted = target_data.to_numpy().ravel()

TEST_FRACTION = 0.3   # share of the data held out as the test set
CV_FOLDS = 10         # folds used for every cross-validation run below
MAX_NEIGHBOURS = 100  # largest k tried during the search

#Split 30% of data as test data (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    feature_data, target_data_converted, test_size=TEST_FRACTION, random_state=0)

#Baseline: try 3 neighbours as a test
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
score = cross_val_score(knn, X_train, y_train, cv=CV_FOLDS)
print("Mean cross validation accuracy for k=3 is %f (+/- %0.2f)" % (score.mean(), score.std() * 2))

#Find the k in 1..MAX_NEIGHBOURS with the highest mean cross-validation score.
#Only the training split is used here, so the test set stays untouched until the end.
best_k = None
best_cv_accuracy = float('-inf')  # sentinel: any real score beats it, so best_k gets set
for k in range(1, MAX_NEIGHBOURS + 1):
  candidate = neighbors.KNeighborsClassifier(n_neighbors=k)
  mean_cv_accuracy = cross_val_score(candidate, X_train, y_train, cv=CV_FOLDS).mean()
  # strict '>' keeps the smallest k when several k values tie for the best score
  if mean_cv_accuracy > best_cv_accuracy:
    best_k, best_cv_accuracy = k, mean_cv_accuracy

if best_k is None:
  # Only reachable when every cross-validation score was NaN.
  raise RuntimeError('Cross-validation produced no usable score for any k')

# Original variable names, kept as aliases in case other cells still refer to them:
# 'bestk' is the best mean CV score (not a k value) and 'index' is the chosen k.
bestk = best_cv_accuracy
index = best_k

#Use model on test set: refit the best k on the full training split, then score once
knn = neighbors.KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
accuracy = accuracy_score(y_test, knn.predict(X_test))
print('Best accuracy of', best_cv_accuracy, 'is given by k =', best_k, 'when up to', MAX_NEIGHBOURS, 'nearest neighbours are considered \nTest set accuracy is:', accuracy*100, '%')

Mean cross validation accuracy for k=3 is 0.897619 (+/- 0.06)
Best accuracy of 0.9166666666666666 is given by k = 32 when up to 100 nearest neighbours are considered 
Test set accuracy is: 90.55555555555556 %


In [0]:
#Enter actual data to predict in the actual_data array
#with format [MQ-135 reading, MQ-3 reading, MQ-2 reading]
actual_data = np.array([[34,58,28],[35,222,40],[38,351,48]])

# The classifier was fitted on a DataFrame with named columns. Passing a bare
# ndarray to predict() makes recent scikit-learn versions warn that
# "X does not have valid feature names", so the new readings are given the
# same column labels as the training features before predicting.
actual_features = pd.DataFrame(actual_data, columns=feature_data.columns)
predictions = knn.predict(actual_features)

# Text printed for each class label (0 = unripe, 1 = ripe, 2 = rotten)
CLASS_MESSAGES = {
  0: 'is an unripe banana',
  1: 'is a ripe banana',
  2: 'is a rotten banana',
}

for entry_number, predicted_class in enumerate(predictions, start=1):
  message = CLASS_MESSAGES.get(predicted_class)
  if message is not None:  # labels outside 0/1/2 print nothing, as before
    print('Entry number', entry_number, message)

Entry number 1 is an unripe banana
Entry number 2 is a ripe banana
Entry number 3 is a rotten banana
