#### KNN

In [159]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [160]:
data=pd.read_csv('C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/cleaned_data.csv')
data.drop("Unnamed: 0",axis=1,inplace=True)

In [161]:
Y=data["Total.Cup.Points"]
X=data.drop("Total.Cup.Points",axis=1)
feature_names=X.columns
X.head()

Unnamed: 0,Species,Country.of.Origin,Region,Harvest.Year,Variety,Processing.Method,Category.One.Defects,Quakers,Color,Category.Two.Defects,altitude_mean_meters
0,0,8,115,5,5,4,0,0.0,2,0,2075.0
1,0,8,115,5,15,4,0,0.0,2,1,2075.0
2,0,9,122,3,2,4,0,0.0,2,0,1700.0
3,0,8,226,5,5,0,0,0.0,2,2,2000.0
4,0,8,115,5,15,4,0,0.0,2,2,2075.0


In [162]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, stratify=Y, random_state=42)
print(X_train.shape, X_test.shape)

(888, 11) (438, 11)


Normalizacija podataka

In [163]:
#Analiza outlier-a preko IQR-a
def IQR(data, feature_names):
    
    iqr = pd.DataFrame(0, index=feature_names, columns=['lower', 'min', 'num_lower', 'upper', 'max', 'num_upper', 'percantage'])
    for name in feature_names:

        (Q1, Q3) = X[name].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        upper = Q3 + (1.5 * IQR)
        lower = Q1 - (1.5 * IQR)
        
        iqr.loc[name, 'upper'] = upper
        iqr.loc[name, 'lower'] = lower

        num_of_out_lower = (X[name] < lower).sum()
        num_of_out_upper = (X[name] > upper).sum()

        percentage = round((num_of_out_lower + num_of_out_upper) / X.shape[0] * 100)

        iqr.loc[name, 'num_lower'], iqr.loc[name, 'num_upper'], iqr.loc[name, 'percantage']  = num_of_out_lower, num_of_out_upper, percentage
        iqr.loc[name, 'min'], iqr.loc[name, 'max'] = min(X[name]), max(X[name])    
    return iqr

In [164]:
IQR(X_train,feature_names)

Unnamed: 0,lower,min,num_lower,upper,max,num_upper,percantage
Species,0.0,0,0,0.0,1,28,2
Country.of.Origin,-21.875,0,0,45.125,35,0,0
Region,-90.5,0,0,473.5,355,0,0
Harvest.Year,-1.5,0,0,10.5,9,0,0
Variety,-10.0,0,0,30.0,28,0,0
Processing.Method,1.5,0,281,5.5,4,0,21
Category.One.Defects,0.0,0,0,0.0,63,199,15
Quakers,0.0,0,0,0.0,11,93,7
Color,2.0,0,198,2.0,2,0,15
Category.Two.Defects,-6.0,0,0,10.0,47,90,7


Najvise outliera se javlja u Processing Method koloni koja je kategorickog tipa, i ovi elementi van granica predstavljaju druge metode za procesuiranje zrna kafe - nisu toliko dominantne
Prvo cemo probati sa ne obradjenim outlierima

In [165]:
#MinMax normalizacija
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [166]:
#Pomocna fja 
from sklearn.metrics import classification_report
from termcolor import colored

def report(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred), columns=[4,5,6,7,8,9], index=[4,5,6,7,8,9]))
    print("---------------------------------------------------------------------------------")

In [167]:
#KNN
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors=10)

In [168]:
knn.fit(X_train, Y_train)

In [169]:
report(knn,X_train,Y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       1.00      0.27      0.42        15
         5.0       0.50      0.25      0.33        32
         6.0       0.52      0.36      0.43       194
         7.0       0.69      0.90      0.78       532
         8.0       0.54      0.20      0.29       104
         9.0       1.00      0.00      0.00        11

    accuracy                           0.66       888
   macro avg       0.71      0.33      0.38       888
weighted avg       0.64      0.66      0.62       888

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
   4  5   6    7   8  9
4  4  0   3    8   0  0
5  0  8  

In [170]:
report(knn, X_test, Y_test, "test")

[32mClassification report for model KNeighborsClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       0.00      0.00      1.00         8
         5.0       0.23      0.19      0.21        16
         6.0       0.27      0.19      0.22        95
         7.0       0.62      0.81      0.70       263
         8.0       0.50      0.12      0.19        51
         9.0       1.00      0.00      0.00         5

    accuracy                           0.55       438
   macro avg       0.44      0.22      0.39       438
weighted avg       0.51      0.55      0.52       438

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on test data [0m
---------------------------------------------------------------------------------
   4  5   6    7  8  9
4  0  3   1    4  0  0
5  0  3   3   10  0

Nije bas sjajno

In [171]:
#Namestanje parametara
from sklearn.model_selection import GridSearchCV
params_grid = {'n_neighbors': range(5, 20),
               'weights': ['uniform', 'distance'],
                'p': [1, 2],
                'algorithm': ['auto' ,'kd_tree']}

In [172]:
estimator = GridSearchCV(KNeighborsClassifier(), param_grid=params_grid, cv=3, verbose=4)

In [173]:
estimator.fit(X_train, Y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits
[CV 1/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.547 total time=   0.0s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.510 total time=   0.0s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.571 total time=   0.0s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.541 total time=   0.0s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.527 total time=   0.0s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.547 total time=   0.0s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.530 total time=   0.0s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.534 total time=   0.0s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.541 total time=   0.0s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=2, wei

[CV 2/3] END algorithm=auto, n_neighbors=6, p=2, weights=distance;, score=0.524 total time=   0.0s
[CV 3/3] END algorithm=auto, n_neighbors=6, p=2, weights=distance;, score=0.537 total time=   0.0s
[CV 1/3] END algorithm=auto, n_neighbors=7, p=1, weights=uniform;, score=0.551 total time=   0.0s
[CV 2/3] END algorithm=auto, n_neighbors=7, p=1, weights=uniform;, score=0.541 total time=   0.0s
[CV 3/3] END algorithm=auto, n_neighbors=7, p=1, weights=uniform;, score=0.568 total time=   0.0s
[CV 1/3] END algorithm=auto, n_neighbors=7, p=1, weights=distance;, score=0.557 total time=   0.0s
[CV 2/3] END algorithm=auto, n_neighbors=7, p=1, weights=distance;, score=0.544 total time=   0.0s
[CV 3/3] END algorithm=auto, n_neighbors=7, p=1, weights=distance;, score=0.541 total time=   0.0s
[CV 1/3] END algorithm=auto, n_neighbors=7, p=2, weights=uniform;, score=0.557 total time=   0.0s
[CV 2/3] END algorithm=auto, n_neighbors=7, p=2, weights=uniform;, score=0.547 total time=   0.0s
[CV 3/3] END al

In [174]:
print(estimator.best_params_)
print(estimator.best_score_)

{'algorithm': 'auto', 'n_neighbors': 19, 'p': 1, 'weights': 'uniform'}
0.579954954954955


In [175]:
report(estimator.best_estimator_,X_train,Y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       1.00      0.00      0.00        15
         5.0       1.00      0.00      0.00        32
         6.0       0.53      0.24      0.33       194
         7.0       0.63      0.94      0.75       532
         8.0       0.00      0.00      1.00       104
         9.0       1.00      0.00      0.00        11

    accuracy                           0.62       888
   macro avg       0.69      0.20      0.35       888
weighted avg       0.56      0.62      0.64       888

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
   4  5   6    7  8  9
4  0  0   1   14  0  0
5  0  0   5

In [176]:
report(estimator.best_estimator_,X_test,Y_test)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       1.00      0.00      0.00         8
         5.0       0.00      0.00      1.00        16
         6.0       0.38      0.20      0.26        95
         7.0       0.62      0.90      0.73       263
         8.0       0.50      0.02      0.04        51
         9.0       1.00      0.00      0.00         5

    accuracy                           0.59       438
   macro avg       0.58      0.19      0.34       438
weighted avg       0.54      0.59      0.54       438

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
   4  5   6    7  8  9
4  0  0   2    6  0  0
5  0  0   4

In [177]:
#Slicni rezultati

In [178]:
#Probajmo da balansiramo podatke

In [179]:
from collections import Counter
from sklearn.decomposition import PCA

def resample(sampler, X, Y):
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    print(colored("Number of instances before resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(Y).items()), "green"))
    print(colored("Number of instances after  resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(y_resampled).items()), "blue"))
    return X_resampled, y_resampled
    

In [180]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(k_neighbors=10, random_state=42)

In [181]:
X_resampled, Y_resampled = resample(smote, X_train, Y_train)

[32mNumber of instances before resampling with SMOTE : dict_items([(8.0, 104), (7.0, 532), (6.0, 194), (4.0, 15), (9.0, 11), (5.0, 32)]). [0m
[34mNumber of instances after  resampling with SMOTE : dict_items([(8.0, 532), (7.0, 532), (6.0, 532), (4.0, 532), (9.0, 532), (5.0, 532)]). [0m


In [182]:
knn_balanced = KNeighborsClassifier(n_neighbors=10)

In [183]:
knn_balanced.fit(X_resampled,Y_resampled)

In [184]:
report(knn_balanced,X_resampled,Y_resampled)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       0.88      0.95      0.91       532
         5.0       0.82      0.92      0.87       532
         6.0       0.67      0.69      0.68       532
         7.0       0.77      0.46      0.58       532
         8.0       0.77      0.80      0.79       532
         9.0       0.89      0.99      0.94       532

    accuracy                           0.80      3192
   macro avg       0.80      0.80      0.79      3192
weighted avg       0.80      0.80      0.79      3192

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
     4    5    6    7    8    9
4  508    3   10    1    

In [185]:
report(knn_balanced,X_test,Y_test)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       0.02      0.12      0.04         8
         5.0       0.12      0.38      0.19        16
         6.0       0.26      0.32      0.28        95
         7.0       0.71      0.32      0.44       263
         8.0       0.24      0.41      0.30        51
         9.0       0.08      0.40      0.13         5

    accuracy                           0.33       438
   macro avg       0.24      0.32      0.23       438
weighted avg       0.52      0.33      0.37       438

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
    4   5   6   7   8   9
4   1   6   0   1   0   0
5   1