Data Preparation

In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn import model_selection
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [138]:
# Load data from csv file
data = pd.read_csv("Coffee-modified.csv")
selected_data = data.loc[:,['Total.Cup.Points',
                            'Species',
                            'Country.of.Origin',
                            'Processing.Method',
                            'Aroma',
                            'Flavor',
                            'Aftertaste', 
                            'Acidity',
                            'Body',
                            'Balance',
                            'Uniformity',
                            'Moisture',
                            'altitude_mean_meters']]
selected_data

Unnamed: 0,Total.Cup.Points,Species,Country.of.Origin,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Moisture,altitude_mean_meters
0,90.58,Arabica,Ethiopia,Washed / Wet,8.67,8.83,8.67,8.75,8.50,8.42,10.00,0.12,2075.00
1,89.92,Arabica,Ethiopia,Washed / Wet,8.75,8.67,8.50,8.58,8.42,8.42,10.00,0.12,2075.00
2,89.75,Arabica,Guatemala,,8.42,8.50,8.42,8.42,8.33,8.42,10.00,0.00,1700.00
3,89.00,Arabica,Ethiopia,Natural / Dry,8.17,8.58,8.42,8.42,8.50,8.25,10.00,0.11,2000.00
4,88.83,Arabica,Ethiopia,Washed / Wet,8.25,8.50,8.25,8.50,8.42,8.33,10.00,0.12,2075.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1306,68.33,Arabica,Mexico,Washed / Wet,7.08,6.83,6.25,7.42,7.25,6.75,10.00,0.11,900.00
1307,67.92,Arabica,Haiti,Natural / Dry,6.75,6.58,6.42,6.67,7.08,6.67,9.33,0.14,350.00
1308,63.08,Arabica,Nicaragua,Other,7.25,6.58,6.33,6.25,6.42,6.08,6.00,0.13,1100.00
1309,59.83,Arabica,Guatemala,Washed / Wet,7.50,6.67,6.67,7.67,7.33,6.67,8.00,0.10,1417.32


In [139]:
#Data Preparation
selected_data.dropna(inplace=True)
selected_data.describe()

Unnamed: 0,Total.Cup.Points,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Moisture,altitude_mean_meters
count,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0,1003.0
mean,82.126949,7.567488,7.513779,7.385633,7.526221,7.504287,7.499302,9.870219,0.094985,1816.792017
std,2.63161,0.301361,0.324487,0.332735,0.307827,0.272824,0.340801,0.445371,0.043468,9112.47293
min,59.83,5.08,6.17,6.17,5.25,6.33,6.08,6.0,0.0,1.0
25%,81.17,7.42,7.33,7.17,7.33,7.33,7.33,10.0,0.1,1100.0
50%,82.42,7.58,7.5,7.42,7.5,7.5,7.5,10.0,0.11,1310.64
75%,83.58,7.75,7.67,7.58,7.67,7.67,7.67,10.0,0.12,1600.0
max,90.58,8.75,8.83,8.67,8.75,8.5,8.58,10.0,0.17,190164.0


In [140]:
# Assign X, Y (drop datetime index)
Y = selected_data.iloc[:,0:1]
Y = Y.reset_index(drop=True)
X = selected_data.iloc[:,1:]
X = X.reset_index(drop=True)
X

Unnamed: 0,Species,Country.of.Origin,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Moisture,altitude_mean_meters
0,Arabica,Ethiopia,Washed / Wet,8.67,8.83,8.67,8.75,8.50,8.42,10.00,0.12,2075.00
1,Arabica,Ethiopia,Washed / Wet,8.75,8.67,8.50,8.58,8.42,8.42,10.00,0.12,2075.00
2,Arabica,Ethiopia,Natural / Dry,8.17,8.58,8.42,8.42,8.50,8.25,10.00,0.11,2000.00
3,Arabica,Ethiopia,Washed / Wet,8.25,8.50,8.25,8.50,8.42,8.33,10.00,0.12,2075.00
4,Arabica,Ethiopia,Natural / Dry,8.08,8.58,8.50,8.50,7.67,8.42,10.00,0.10,1822.50
...,...,...,...,...,...,...,...,...,...,...,...,...
998,Arabica,Honduras,Washed / Wet,7.00,6.33,6.17,6.50,6.67,6.17,8.00,0.10,1450.00
999,Arabica,Mexico,Washed / Wet,7.08,6.83,6.25,7.42,7.25,6.75,10.00,0.11,900.00
1000,Arabica,Haiti,Natural / Dry,6.75,6.58,6.42,6.67,7.08,6.67,9.33,0.14,350.00
1001,Arabica,Nicaragua,Other,7.25,6.58,6.33,6.25,6.42,6.08,6.00,0.13,1100.00


In [141]:
# Prepare Y
# Process Y from values to Coffee Bean Grade
bean_grade = [0,0,0]
bean_amount = []
rating_pctile = np.percentile(Y,[75,90])
for i in range(len(Y.index)):
    #75 percentile
    if Y['Total.Cup.Points'][i] < rating_pctile [0]:
        bean_grade[0] +=  1
        bean_amount.append(1)
    # 90 percentile
    if rating_pctile [0] <= Y['Total.Cup.Points'][i] < rating_pctile [1]:
        bean_grade[1] +=  1
        bean_amount.append(2)
    # 100 percentile
    if Y['Total.Cup.Points'][i] >= rating_pctile[1]:
        bean_grade[2] +=  1
        bean_amount.append(3)
bean_amount = pd.DataFrame(data={'bean_grade':bean_amount})
bean_amount

Unnamed: 0,bean_grade
0,3
1,3
2,3
3,3
4,3
...,...
998,1
999,1
1000,1
1001,1


In [142]:
# Visualize Bar Graph of Number of Samples for each Bean Grade
# ตัวอย่างการลองใช้ plotly express library 

fig = px.bar( x = [1,2,3], y = bean_grade, color=bean_grade, range_y=[0.0,1000])
fig.show()

In [143]:
# Standardized data ( X [ numerical feature columns ])
x_string = X.select_dtypes(exclude='float64')
x_continuous = X.select_dtypes(include='float64')
standard_scaler = preprocessing.StandardScaler()
x_std = standard_scaler.fit_transform(x_continuous)
x_std

array([[ 3.66027395,  4.05834234,  3.86195047, ...,  0.29154442,
         0.57576249,  0.0283498 ],
       [ 3.92586906,  3.56500983,  3.35077911, ...,  0.29154442,
         0.57576249,  0.0283498 ],
       [ 2.00030453,  3.28751029,  3.11022788, ...,  0.29154442,
         0.34559518,  0.02011522],
       ...,
       [-2.71400863, -2.87914607, -2.90355288, ..., -1.21357018,
         1.03609711, -0.16104561],
       [-1.05403921, -2.87914607, -3.17417302, ..., -8.69421438,
         0.8059298 , -0.07869978],
       [-0.2240545 , -2.60164653, -2.15183029, ..., -4.20133498,
         0.11542787, -0.04385981]])

In [144]:
# feature selection (correlation)
x_std_df = pd.DataFrame(x_std,columns=x_continuous.columns)
# Calculate correlation between variables for only continuous data columns
corr_data = x_std_df.corr()
# Reduce Corr() to Lower Matrix
lower_tri = corr_data.where(np.tril(np.ones(corr_data.shape),k=-1).astype(np.bool))
lower_tri.fillna(0, inplace=True)
# Drop columns if |correlation value| > 0.8
to_drop = [column for column in lower_tri.columns if any(lower_tri[column] > 0.9)]

x_std_df.drop(columns=to_drop, inplace=True)
x_std_df


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



Unnamed: 0,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Moisture,altitude_mean_meters
0,3.660274,4.058342,3.861950,3.977528,3.651469,2.702919,0.291544,0.575762,0.028350
1,3.925869,3.565010,3.350779,3.424993,3.358094,2.702919,0.291544,0.575762,0.028350
2,2.000305,3.287510,3.110228,2.904961,3.651469,2.203845,0.291544,0.345595,0.020115
3,2.265900,3.040844,2.599057,3.164977,3.358094,2.438704,0.291544,0.575762,0.028350
4,1.701510,3.287510,3.350779,3.164977,0.607701,2.702919,0.291544,0.115428,0.000627
...,...,...,...,...,...,...,...,...,...
998,-1.884024,-3.649978,-3.655275,-3.335427,-3.059490,-3.902470,-4.201335,0.115428,-0.040272
999,-1.618429,-2.108314,-3.414724,-0.345241,-0.932520,-2.199748,0.291544,0.345595,-0.100659
1000,-2.714009,-2.879146,-2.903553,-2.782892,-1.555942,-2.434606,-1.213570,1.036097,-0.161046
1001,-1.054039,-2.879146,-3.174173,-4.147977,-3.976288,-4.166686,-8.694214,0.805930,-0.078700


In [145]:
# One hot encoding for Categorical feature columns
x_string_df = pd.get_dummies(x_string, columns = x_string.columns, drop_first=True )


In [146]:
# Prepare X train, Xtest , Y train, Ytest
x_all = pd.concat([x_std_df,x_string_df], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_all, bean_amount, test_size=0.33, random_state=42)

Model Preparation

In [148]:
#KNN
# KNN parameter
k = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 25, 35] # try at least 3 values

for i in range(3):
    k = (4*(i+1))+1
    print('---------------------------------------------------------')
    print('K = '+ str(k))
    
    # Model Training
    modelKNN = KNeighborsClassifier(n_neighbors=k, p=2)
    modelKNN.fit(x_train,y_train)

    # Model Testing
    y_pred= modelKNN.predict(x_test)
    KNN_Score = accuracy_score(y_test, y_pred)

    # Print Confusion Matrix and Classification Report for best k
    print('Confusion Matrix: ')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report: ')
    print(classification_report(y_test, y_pred))
    print('---------------------------------------------------------')

# Visualize compare accuracy of selected k values (at least 3 values of k)
fig = px.bar(x = k, y = KNN_Score, color=KNN_Score, range_y=[0.7,1.0])
fig.show()

---------------------------------------------------------
K = 5
Confusion Matrix: 
[[237  13   2]
 [ 12  29   1]
 [  1  14  22]]
Classification Report: 
              precision    recall  f1-score   support

           1       0.95      0.94      0.94       252
           2       0.52      0.69      0.59        42
           3       0.88      0.59      0.71        37

    accuracy                           0.87       331
   macro avg       0.78      0.74      0.75       331
weighted avg       0.89      0.87      0.87       331

---------------------------------------------------------



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



ValueError: String or int arguments are only possible when a DataFrame or an array is provided in the `data_frame` argument. No DataFrame was provided, but argument 'x' is of type str or int.