## SVM

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from termcolor import colored
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN

In [2]:
data=pd.read_csv("C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/cleaned_data.csv")

In [3]:
data.drop("Unnamed: 0",axis=1,inplace=True)

In [4]:
Y=data["Total.Cup.Points"]
X=data.drop("Total.Cup.Points",axis=1)
feature_names=X.columns
X.head()

Unnamed: 0,Species,Country.of.Origin,Harvest.Year,Variety,Processing.Method,Category.One.Defects,Quakers,Color,Category.Two.Defects,altitude_mean_meters
0,0,8,5,5,4,0,0.0,2,0,2075.0
1,0,8,5,15,4,0,0.0,2,1,2075.0
2,0,9,3,2,4,0,0.0,2,0,1700.0
3,0,8,5,5,0,0,0.0,2,2,2000.0
4,0,8,5,15,4,0,0.0,2,2,2075.0


In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, stratify=Y, random_state=42)
print(X_train.shape, X_test.shape)

(928, 10) (398, 10)


In [6]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
scaler.transform(X_train)
scaler.transform(X_test)

array([[-0.16293763, -0.44826192,  1.82750509, ...,  0.37946902,
        -0.67817354, -0.0724196 ],
       [-0.16293763, -1.31711398, -0.87387229, ...,  0.37946902,
         1.32292358, -0.08905806],
       [-0.16293763,  0.51712926, -0.87387229, ...,  0.37946902,
        -0.0778444 , -0.06114119],
       ...,
       [-0.16293763,  0.51712926, -0.87387229, ...,  0.37946902,
        -0.27795411, -0.08905806],
       [-0.16293763, -0.44826192,  0.74695413, ...,  0.37946902,
        -0.47806383, -0.08913991],
       [-0.16293763, -1.31711398,  1.28722961, ...,  0.37946902,
        -0.67817354, -0.24246122]])

In [7]:
def resample(sampler, X, Y):
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    print(colored("Number of instances before resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(Y).items()), "green"))
    print(colored("Number of instances after  resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(y_resampled).items()), "blue"))
    return X_resampled, y_resampled
    

In [8]:
def report_imbalanced(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report_imbalanced(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred),columns=[4,5,6,7,8,9],index=[4,5,6,7,8,9]))
    print("---------------------------------------------------------------------------------")

In [10]:
#Balansiranje klasa koristeci SMOTE
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = resample(smoteenn, X_train, Y_train)

[32mNumber of instances before resampling with SMOTEENN : dict_items([(7.0, 556), (6.0, 202), (8.0, 109), (4.0, 16), (9.0, 11), (5.0, 34)]). [0m
[34mNumber of instances after  resampling with SMOTEENN : dict_items([(4.0, 441), (5.0, 392), (6.0, 241), (7.0, 126), (8.0, 302), (9.0, 469)]). [0m


In [11]:
params = [
            {
                'kernel': ['linear'],
                'C': [0.01, 0.1, 1, 10,100],
            },
            {
                'kernel': ['rbf'],
                'C': [0.01, 0.1, 1, 10,100],
                'gamma': [0.01, 0.1, 1, 10],
            },
         ]

In [12]:
gs = GridSearchCV(SVC(), params, scoring='accuracy', cv=3, verbose=5)

In [13]:
gs.fit(X_resampled, y_resampled)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV 1/3] END .............C=0.01, kernel=linear;, score=0.709 total time=   6.8s
[CV 2/3] END .............C=0.01, kernel=linear;, score=0.636 total time=   7.0s
[CV 3/3] END .............C=0.01, kernel=linear;, score=0.648 total time=  50.6s
[CV 1/3] END ..............C=0.1, kernel=linear;, score=0.702 total time=  51.6s
[CV 2/3] END ..............C=0.1, kernel=linear;, score=0.664 total time=  48.6s
[CV 3/3] END ..............C=0.1, kernel=linear;, score=0.686 total time= 6.5min
[CV 1/3] END ................C=1, kernel=linear;, score=0.648 total time= 3.5min
[CV 2/3] END ................C=1, kernel=linear;, score=0.688 total time= 3.9min
[CV 3/3] END ................C=1, kernel=linear;, score=0.668 total time= 6.4min
[CV 1/3] END ...............C=10, kernel=linear;, score=0.615 total time= 7.5min
[CV 2/3] END ...............C=10, kernel=linear;, score=0.676 total time=11.3min
[CV 3/3] END ...............C=10, kernel=linear;

In [14]:
gs.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [15]:
gs.best_score_

0.9690512430238458

In [16]:
report_imbalanced(gs.best_estimator_,X_resampled,y_resampled)

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

        4.0       1.00      1.00      1.00      1.00      1.00      1.00       441
        5.0       1.00      1.00      1.00      1.00      1.00      1.00       392
        6.0       1.00      1.00      1.00      1.00      1.00      1.00       241
        7.0       1.00      1.00      1.00      1.00      1.00      1.00       126
        8.0       1.00      1.00      1.00      1.00      1.00      1.00       302
        9.0       1.00      1.00      1.00      1.00      1.00      1.00       469

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      1971

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
-------------------------------------------------

In [17]:
report_imbalanced(gs.best_estimator_,X_test,Y_test)

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

        4.0       0.02      0.29      0.76      0.04      0.47      0.21         7
        5.0       0.10      0.36      0.88      0.16      0.56      0.30        14
        6.0       0.23      0.22      0.80      0.22      0.42      0.16        87
        7.0       0.69      0.23      0.85      0.34      0.44      0.18       239
        8.0       0.18      0.30      0.82      0.22      0.50      0.24        46
        9.0       0.31      0.80      0.98      0.44      0.88      0.77         5

avg / total       0.49      0.25      0.84      0.29      0.45      0.19       398

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
-------------------------------------------------