## SVM

In [166]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from termcolor import colored
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
from joblib import load

In [167]:
data=pd.read_csv("C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/cleaned_data.csv")

In [168]:
data.drop("Unnamed: 0",axis=1,inplace=True)

In [169]:
Y=data["Total.Cup.Points"]
X=data.drop("Total.Cup.Points",axis=1)
feature_names=X.columns
X.head()

Unnamed: 0,Species,Country.of.Origin,Harvest.Year,Variety,Processing.Method,Category.One.Defects,Quakers,Color,Category.Two.Defects,altitude_mean_meters
0,0,8,5,5,4,0,0.0,2,0,2075.0
1,0,8,5,15,4,0,0.0,2,1,2075.0
2,0,9,3,2,4,0,0.0,2,0,1700.0
3,0,8,5,5,0,0,0.0,2,2,2000.0
4,0,8,5,15,4,0,0.0,2,2,2075.0


In [170]:
X_train, X_test, Y_train, Y_test = load('C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/classification_data.joblib')
print(X_train.shape, X_test.shape)

(888, 10) (438, 10)


In [171]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [172]:
def resample(sampler, X, Y):
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    print(colored("Number of instances before resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(Y).items()), "green"))
    print(colored("Number of instances after  resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(y_resampled).items()), "blue"))
    return X_resampled, y_resampled
    

In [173]:
def report_imbalanced(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report_imbalanced(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred),columns=[4,5,6,7,8,9],index=[4,5,6,7,8,9]))
    print("---------------------------------------------------------------------------------")

In [174]:
from sklearn.metrics import classification_report
#from termcolor import colored

def report(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred), columns=[4,5,6,7,8,9], index=[4,5,6,7,8,9]))
    print("---------------------------------------------------------------------------------")

In [175]:
#Balansiranje klasa koristeci SMOTE
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = resample(smoteenn, X_train, Y_train)

[32mNumber of instances before resampling with SMOTEENN : dict_items([(8.0, 104), (7.0, 532), (6.0, 194), (4.0, 15), (9.0, 11), (5.0, 32)]). [0m
[34mNumber of instances after  resampling with SMOTEENN : dict_items([(4.0, 467), (5.0, 425), (6.0, 192), (7.0, 123), (8.0, 296), (9.0, 488)]). [0m


In [176]:
params = [
            {
                'kernel': ['linear'],
                'C': [0.01, 0.1, 1, 10,100],
            },
            {
                'kernel': ['rbf'],
                'C': [0.01, 0.1, 1, 10,100],
                'gamma': [0.01, 0.1, 1, 10],
            },
         ]

In [177]:
gs = GridSearchCV(SVC(probability=True), params, cv=3, verbose=5)

In [178]:
gs.fit(X_resampled, y_resampled)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[CV 1/3] END .............C=0.01, kernel=linear;, score=0.462 total time=   0.4s
[CV 2/3] END .............C=0.01, kernel=linear;, score=0.393 total time=   0.4s
[CV 3/3] END .............C=0.01, kernel=linear;, score=0.406 total time=   0.5s
[CV 1/3] END ..............C=0.1, kernel=linear;, score=0.492 total time=   0.3s
[CV 2/3] END ..............C=0.1, kernel=linear;, score=0.492 total time=   0.4s
[CV 3/3] END ..............C=0.1, kernel=linear;, score=0.490 total time=   0.4s
[CV 1/3] END ................C=1, kernel=linear;, score=0.627 total time=   0.4s
[CV 2/3] END ................C=1, kernel=linear;, score=0.627 total time=   0.4s
[CV 3/3] END ................C=1, kernel=linear;, score=0.615 total time=   0.4s
[CV 1/3] END ...............C=10, kernel=linear;, score=0.679 total time=   0.6s
[CV 2/3] END ...............C=10, kernel=linear;, score=0.678 total time=   0.5s
[CV 3/3] END ...............C=10, kernel=linear;, score=0.682 total time=   0.5s
[CV 1/3] END ..............C

In [179]:
gs.best_params_

{'C': 100, 'gamma': 10, 'kernel': 'rbf'}

In [180]:
gs.best_score_

0.9668523263491372

In [181]:
report(gs.best_estimator_,X_resampled,y_resampled)

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       1.00      1.00      1.00       467
         5.0       1.00      1.00      1.00       425
         6.0       1.00      0.99      1.00       192
         7.0       0.99      0.99      0.99       123
         8.0       0.99      0.98      0.98       296
         9.0       0.99      1.00      0.99       488

    accuracy                           0.99      1991
   macro avg       1.00      0.99      0.99      1991
weighted avg       0.99      0.99      0.99      1991

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
---------------------------------------------------------------------------------
     4    5    6    7    8    9
4  467    0    0    0    0    0
5    1  424    0    0    0 

In [182]:
report(gs.best_estimator_,X_test,Y_test,"test")

[32mClassification report for model SVC on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       0.03      0.12      0.05         8
         5.0       0.15      0.50      0.23        16
         6.0       0.23      0.20      0.21        95
         7.0       0.68      0.38      0.49       263
         8.0       0.22      0.39      0.28        51
         9.0       0.04      0.20      0.06         5

    accuracy                           0.34       438
   macro avg       0.22      0.30      0.22       438
weighted avg       0.49      0.34      0.38       438

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on test data [0m
---------------------------------------------------------------------------------
    4   5   6    7   8   9
4   1   4   1    2   0   0
5   1   8   1    4   2   0
6  15  13  19   28

In [183]:
import joblib
gs.best_estimator_.fit(X_resampled,y_resampled)
joblib.dump(gs.best_estimator_, "C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/models/classification/svm.pkl")

['C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/models/classification/svm.pkl']

In [184]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
hotencoder=OneHotEncoder(feature_name_combiner="concat")
encoder_df=pd.DataFrame(hotencoder.fit_transform(X[["Country.of.Origin","Variety","Processing.Method","Color"]]).toarray(),columns=hotencoder.get_feature_names_out())

In [185]:
X=X.join(encoder_df)
X.drop(labels=["Country.of.Origin","Variety","Processing.Method","Color"],axis=1,inplace=True)

In [186]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, Y, test_size=0.33, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train1)
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)

In [187]:

X_resampled1, y_resampled1 = resample(smoteenn, X_train1, y_train1)

[32mNumber of instances before resampling with SMOTEENN : dict_items([(6.0, 197), (7.0, 517), (5.0, 33), (9.0, 15), (8.0, 109), (4.0, 17)]). [0m
[34mNumber of instances after  resampling with SMOTEENN : dict_items([(4.0, 467), (5.0, 425), (6.0, 206), (7.0, 133), (8.0, 300), (9.0, 491)]). [0m


In [188]:
X_resampled1

array([[-1.43838990e-01, -8.43889511e-01,  2.96282776e+00, ...,
        -2.61900855e-01, -2.85670781e-01,  4.02869290e-01],
       [-1.43838990e-01, -8.43889511e-01, -1.80948057e-01, ...,
        -2.61900855e-01, -2.85670781e-01,  4.02869290e-01],
       [-1.43838990e-01, -8.43889511e-01,  7.85314569e+00, ...,
        -2.61900855e-01,  3.50053301e+00, -2.48219466e+00],
       ...,
       [-1.43838990e-01,  2.49289024e-01, -1.80948057e-01, ...,
        -2.61900855e-01, -2.85670781e-01,  4.02869290e-01],
       [-1.43838990e-01, -7.79285452e-03, -1.80948057e-01, ...,
        -2.61900855e-01,  1.49512595e+00, -9.54086844e-01],
       [-1.43838990e-01, -6.41710982e-01, -1.80948057e-01, ...,
        -2.61900855e-01, -2.85670781e-01,  4.02869290e-01]])

In [189]:
from sklearn.decomposition import PCA
pca = PCA(2)
pca.fit(X_resampled1)
X_train1=pca.transform(X_resampled1)
X_test1=pca.transform(X_test1)

In [190]:
X_test1.shape

(438, 2)

In [191]:
estimator_pca = GridSearchCV(SVC(probability=True), params, cv=3, verbose=5)

In [192]:
estimator_pca.fit(X_train1, y_resampled1)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[CV 1/3] END .............C=0.01, kernel=linear;, score=0.567 total time=   0.3s
[CV 2/3] END .............C=0.01, kernel=linear;, score=0.555 total time=   0.3s
[CV 3/3] END .............C=0.01, kernel=linear;, score=0.564 total time=   0.2s
[CV 1/3] END ..............C=0.1, kernel=linear;, score=0.564 total time=   0.2s
[CV 2/3] END ..............C=0.1, kernel=linear;, score=0.546 total time=   0.2s
[CV 3/3] END ..............C=0.1, kernel=linear;, score=0.568 total time=   0.3s
[CV 1/3] END ................C=1, kernel=linear;, score=0.583 total time=   0.4s
[CV 2/3] END ................C=1, kernel=linear;, score=0.588 total time=   0.3s
[CV 3/3] END ................C=1, kernel=linear;, score=0.604 total time=   0.3s
[CV 1/3] END ...............C=10, kernel=linear;, score=0.585 total time=   0.6s
[CV 2/3] END ...............C=10, kernel=linear;, score=0.591 total time=   0.6s
[CV 3/3] END ...............C=10, kernel=linear;, score=0.607 total time=   0.6s
[CV 1/3] END ..............C

In [193]:
print(estimator_pca.best_params_)
print(estimator_pca.best_score_)

{'C': 100, 'gamma': 10, 'kernel': 'rbf'}
0.7334322453016816


In [194]:
#Pomocna fja 
from sklearn.metrics import classification_report
from termcolor import colored

def report(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred), columns=[4,5,6,7,8,9], index=[4,5,6,7,8,9]))
    print("---------------------------------------------------------------------------------")

In [198]:
report(estimator_pca.best_estimator_,X_train1,y_resampled1,)

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       0.89      0.88      0.89       467
         5.0       0.83      0.77      0.80       425
         6.0       0.75      0.59      0.66       206
         7.0       0.78      0.47      0.58       133
         8.0       0.64      0.93      0.76       300
         9.0       0.97      0.97      0.97       491

    accuracy                           0.83      2022
   macro avg       0.81      0.77      0.78      2022
weighted avg       0.84      0.83      0.83      2022

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
---------------------------------------------------------------------------------
     4    5    6   7    8    9
4  413   20    7   2   25    0
5   29  327   14   3   47    

In [196]:
X_test1.shape

(438, 2)

In [197]:
report(estimator_pca.best_estimator_,X_test1,y_test1,"test")

[32mClassification report for model SVC on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

         4.0       0.00      0.00      1.00         6
         5.0       0.05      0.33      0.09        15
         6.0       0.22      0.14      0.17        92
         7.0       0.60      0.12      0.20       278
         8.0       0.15      0.54      0.23        46
         9.0       0.05      1.00      0.09         1

    accuracy                           0.18       438
   macro avg       0.18      0.36      0.30       438
weighted avg       0.44      0.18      0.20       438

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on test data [0m
---------------------------------------------------------------------------------
    4   5   6   7    8  9
4   0   5   0   0    1  0
5   4   5   2   0    3  1
6  16  21  13  18   2