## SVM

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from termcolor import colored
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE

In [2]:
data=pd.read_csv("C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/cleaned_data.csv")

In [3]:
data.drop("Unnamed: 0",axis=1,inplace=True)

In [4]:
Y=data["Total.Cup.Points"]
X=data.drop("Total.Cup.Points",axis=1)
feature_names=X.columns
X.head()

Unnamed: 0,Species,Country.of.Origin,Region,Harvest.Year,Variety,Processing.Method,Category.One.Defects,Quakers,Color,Category.Two.Defects,altitude_mean_meters
0,0,8,115,5,5,4,0,0.0,2,0,2075.0
1,0,8,115,5,15,4,0,0.0,2,1,2075.0
2,0,9,122,3,2,4,0,0.0,2,0,1700.0
3,0,8,226,5,5,0,0,0.0,2,2,2000.0
4,0,8,115,5,15,4,0,0.0,2,2,2075.0


In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, stratify=Y, random_state=42)
print(X_train.shape, X_test.shape)

(928, 11) (398, 11)


In [6]:
def resample(sampler, X, Y):
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    print(colored("Number of instances before resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(Y).items()), "green"))
    print(colored("Number of instances after  resampling with {} : {}. ".format(type(sampler).__name__,\
                                                                           Counter(y_resampled).items()), "blue"))
    return X_resampled, y_resampled
    

In [7]:
def report_imbalanced(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report_imbalanced(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred),columns=[4,5,6,7,8,9],index=[4,5,6,7,8,9]))
    print("---------------------------------------------------------------------------------")

In [8]:
#Balansiranje klasa koristeci SMOTE
smote = SMOTE(k_neighbors=10, random_state=42)
X_resampled, y_resampled = resample(smote, X_train, Y_train)

[32mNumber of instances before resampling with SMOTE : dict_items([(7.0, 556), (6.0, 202), (8.0, 109), (4.0, 16), (9.0, 11), (5.0, 34)]). [0m
[34mNumber of instances after  resampling with SMOTE : dict_items([(7.0, 556), (6.0, 556), (8.0, 556), (4.0, 556), (9.0, 556), (5.0, 556)]). [0m


In [9]:
params = [
            {
                'kernel': ['linear'],
                'C': [0.01, 0.1, 1, 10],
            },
            {
                'kernel': ['rbf'],
                'C': [0.01, 0.1, 1, 10],
                'gamma': [0.01, 0.1, 1, 10],
            },
         ]

In [10]:
gs = GridSearchCV(SVC(), params, scoring='accuracy', cv=5, verbose=5)

In [11]:
gs.fit(X_resampled, y_resampled)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[CV 1/5] END .............C=0.01, kernel=linear;, score=0.493 total time= 2.8min
[CV 2/5] END .............C=0.01, kernel=linear;, score=0.546 total time= 3.5min
[CV 3/5] END .............C=0.01, kernel=linear;, score=0.513 total time=  53.7s
[CV 4/5] END .............C=0.01, kernel=linear;, score=0.544 total time= 2.0min
[CV 5/5] END .............C=0.01, kernel=linear;, score=0.543 total time= 1.5min
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.494 total time= 5.0min
[CV 2/5] END ..............C=0.1, kernel=linear;, score=0.534 total time= 4.8min
[CV 3/5] END ..............C=0.1, kernel=linear;, score=0.511 total time= 5.7min
[CV 4/5] END ..............C=0.1, kernel=linear;, score=0.537 total time= 6.4min
[CV 5/5] END ..............C=0.1, kernel=linear;, score=0.547 total time= 5.7min
[CV 1/5] END ................C=1, kernel=linear;, score=0.482 total time=16.8min
[CV 2/5] END ................C=1, kernel=linear;, score=0.510 total time=11.7min
[CV 3/5] END ...............

In [12]:
gs.best_params_

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}

In [13]:
gs.best_score_

0.6960409017048362

In [14]:
report_imbalanced(gs.best_estimator_,X_resampled,y_resampled)

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

        4.0       1.00      1.00      1.00      1.00      1.00      1.00       556
        5.0       0.95      0.99      0.99      0.97      0.99      0.99       556
        6.0       0.96      0.92      0.99      0.94      0.96      0.91       556
        7.0       0.93      0.94      0.99      0.94      0.96      0.93       556
        8.0       0.98      0.96      1.00      0.97      0.98      0.95       556
        9.0       0.99      1.00      1.00      0.99      1.00      1.00       556

avg / total       0.97      0.97      0.99      0.97      0.98      0.96      3336

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
-------------------------------------------------

In [15]:
report_imbalanced(gs.best_estimator_,X_test,Y_test)

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

        4.0       0.00      0.00      0.97      1.00      0.00      0.00         7
        5.0       0.15      0.21      0.96      0.18      0.45      0.19        14
        6.0       0.31      0.22      0.86      0.26      0.43      0.18        87
        7.0       0.67      0.69      0.48      0.68      0.58      0.34       239
        8.0       0.34      0.37      0.91      0.35      0.58      0.32        46
        9.0       0.43      0.60      0.99      0.50      0.77      0.57         5

avg / total       0.52      0.52      0.64      0.54      0.53      0.29       398

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
-------------------------------------------------