## SVM

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from termcolor import colored
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.combine import SMOTEENN
from joblib import load

In [2]:
# Load the cleaned data table from its CSV file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(
    "C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/cleaned_data.csv"
)

In [3]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV — confirm).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell re-runnable: executing it a second time no longer raises KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Predictors: every column except "Total.Cup.Points"; target: that column.
X = data.drop(columns=["Total.Cup.Points"])
Y = data["Total.Cup.Points"]
feature_names = X.columns
X.head()

Unnamed: 0,Species,Country.of.Origin,Harvest.Year,Variety,Processing.Method,Category.One.Defects,Quakers,Color,Category.Two.Defects,altitude_mean_meters
0,0,8,5,5,4,0,0.0,2,0,2075.0
1,0,8,5,15,4,0,0.0,2,1,2075.0
2,0,9,3,2,4,0,0.0,2,0,1700.0
3,0,8,5,5,0,0,0.0,2,2,2000.0
4,0,8,5,15,4,0,0.0,2,2,2075.0


In [5]:
# Load the persisted train/test split (a four-element sequence stored with joblib).
# NOTE(review): joblib files are pickles — only load artifacts from a trusted source.
classification_split = load(
    "C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/classification_data.joblib"
)
X_train, X_test, Y_train, Y_test = classification_split
print(X_train.shape, X_test.shape)

(888, 10) (438, 10)


In [6]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, then apply the same
# transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [7]:
def resample(sampler, X, Y):
    """Resample ``(X, Y)`` with ``sampler`` and print the class counts before and after.

    Parameters
    ----------
    sampler : object exposing ``fit_resample(X, Y)``
    X, Y : features and labels, passed unchanged to ``sampler.fit_resample``

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``sampler.fit_resample``.
    """
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    sampler_name = type(sampler).__name__

    # Class distribution of the labels that were passed in.
    before_msg = "Number of instances before resampling with {} : {}. ".format(
        sampler_name, Counter(Y).items()
    )
    print(colored(before_msg, "green"))

    # Class distribution of the labels produced by the sampler.
    after_msg = "Number of instances after  resampling with {} : {}. ".format(
        sampler_name, Counter(y_resampled).items()
    )
    print(colored(after_msg, "blue"))

    return X_resampled, y_resampled
    

In [8]:
def report_imbalanced(model, x, y, text = "training"):
    """Print the imbalanced-learn classification report and the confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    x : feature matrix passed to ``model.predict``
    y : true labels corresponding to ``x``
    text : str, default "training"
        Name of the data split; used only in the printed headings.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    y_pred = model.predict(x)
    separator = "---------------------------------------------------------------------------------"
    model_name = type(model).__name__

    print(colored("Classification report for model {} on {} data".format(model_name, text), "green"))
    print(separator)
    print(classification_report_imbalanced(y, y_pred, zero_division=True))
    print(separator)

    # Take the class labels from the data instead of hard-coding [4..9]: with a
    # fixed 6-label frame, pd.DataFrame raises a shape error as soon as one of
    # the classes is missing from both y and y_pred.
    labels = sorted(set(y) | set(y_pred))

    print(colored("Confusion matrix for model {} on {} data ".format(model_name, text), "green"))
    print(separator)
    print(pd.DataFrame(confusion_matrix(y, y_pred, labels=labels), columns=labels, index=labels))
    print(separator)

In [None]:
from sklearn.metrics import classification_report
#from termcolor import colored

def report(model, x, y, text = "training"):
    """Print the scikit-learn classification report and the confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    x : feature matrix passed to ``model.predict``
    y : true labels corresponding to ``x``
    text : str, default "training"
        Name of the data split; used only in the printed headings.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    y_pred = model.predict(x)
    separator = "---------------------------------------------------------------------------------"
    model_name = type(model).__name__

    print(colored("Classification report for model {} on {} data".format(model_name, text), "green"))
    print(separator)
    print(classification_report(y, y_pred, zero_division=True))
    print(separator)

    # Take the class labels from the data instead of hard-coding [4..9]: with a
    # fixed 6-label frame, pd.DataFrame raises a shape error as soon as one of
    # the classes is missing from both y and y_pred.
    labels = sorted(set(y) | set(y_pred))

    print(colored("Confusion matrix for model {} on {} data ".format(model_name, text), "green"))
    print(separator)
    print(pd.DataFrame(confusion_matrix(y, y_pred, labels=labels), columns=labels, index=labels))
    print(separator)

In [9]:
# Balance the training classes using SMOTEENN (fixed seed for reproducibility)
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = resample(sampler=smoteenn, X=X_train, Y=Y_train)

[32mNumber of instances before resampling with SMOTEENN : dict_items([(8.0, 104), (7.0, 532), (6.0, 194), (4.0, 15), (9.0, 11), (5.0, 32)]). [0m
[34mNumber of instances after  resampling with SMOTEENN : dict_items([(4.0, 467), (5.0, 425), (6.0, 192), (7.0, 123), (8.0, 296), (9.0, 488)]). [0m


In [10]:
# Hyper-parameter grid for the SVC search: one sub-grid per kernel.
# `gamma` is only listed for the RBF kernel.
params = [
    {"kernel": ["linear"], "C": [0.01, 0.1, 1, 10, 100]},
    {"kernel": ["rbf"], "C": [0.01, 0.1, 1, 10, 100], "gamma": [0.01, 0.1, 1, 10]},
]

In [11]:
# Grid search over `params` with 3-fold cross-validation.
# random_state pins the internal data shuffling SVC performs when
# probability=True, so the probability estimates of the fitted models are
# reproducible between runs (same seed as the SMOTEENN sampler).
gs = GridSearchCV(SVC(probability=True, random_state=42), params, cv=3, verbose=5)

In [12]:
# Run the grid search on the resampled training data.
# NOTE(review): the data was resampled before the cross-validation split, so the
# fold scores may be optimistic — confirm, and consider resampling inside a pipeline.
gs.fit(X=X_resampled, y=y_resampled)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[CV 1/3] END .............C=0.01, kernel=linear;, score=0.462 total time=   1.2s
[CV 2/3] END .............C=0.01, kernel=linear;, score=0.393 total time=   1.1s
[CV 3/3] END .............C=0.01, kernel=linear;, score=0.406 total time=   1.1s
[CV 1/3] END ..............C=0.1, kernel=linear;, score=0.492 total time=   1.0s
[CV 2/3] END ..............C=0.1, kernel=linear;, score=0.492 total time=   0.9s
[CV 3/3] END ..............C=0.1, kernel=linear;, score=0.490 total time=   1.0s
[CV 1/3] END ................C=1, kernel=linear;, score=0.627 total time=   0.8s
[CV 2/3] END ................C=1, kernel=linear;, score=0.627 total time=   0.6s
[CV 3/3] END ................C=1, kernel=linear;, score=0.615 total time=   0.4s
[CV 1/3] END ...............C=10, kernel=linear;, score=0.679 total time=   0.6s
[CV 2/3] END ...............C=10, kernel=linear;, score=0.678 total time=   0.6s
[CV 3/3] END ...............C=10, kernel=linear;, score=0.682 total time=   0.6s
[CV 1/3] END ..............C

In [13]:
# Hyper-parameter combination selected by the grid search
gs.best_params_

{'C': 100, 'gamma': 10, 'kernel': 'rbf'}

In [14]:
# Mean cross-validation score of the selected hyper-parameter combination
gs.best_score_

0.9668523263491372

In [15]:
# Evaluate the refit best model on the (resampled) data it was trained on
report(gs.best_estimator_, X_resampled, y_resampled, text="training")

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

        4.0       1.00      1.00      1.00      1.00      1.00      1.00       467
        5.0       1.00      1.00      1.00      1.00      1.00      1.00       425
        6.0       1.00      0.99      1.00      1.00      1.00      0.99       192
        7.0       0.99      0.99      1.00      0.99      1.00      0.99       123
        8.0       0.99      0.98      1.00      0.98      0.99      0.97       296
        9.0       0.99      1.00      1.00      0.99      1.00      1.00       488

avg / total       0.99      0.99      1.00      0.99      1.00      0.99      1991

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
-------------------------------------------------

In [16]:
# Evaluate the refit best model on the held-out test split
report(gs.best_estimator_, X_test, Y_test, text="test")

[32mClassification report for model SVC on training data[0m
---------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

        4.0       0.03      0.12      0.92      0.05      0.34      0.11         8
        5.0       0.15      0.50      0.89      0.23      0.67      0.43        16
        6.0       0.23      0.20      0.81      0.21      0.40      0.15        95
        7.0       0.68      0.38      0.73      0.49      0.53      0.27       263
        8.0       0.22      0.39      0.82      0.28      0.57      0.31        51
        9.0       0.04      0.20      0.94      0.06      0.43      0.17         5

avg / total       0.49      0.34      0.77      0.38      0.50      0.25       438

---------------------------------------------------------------------------------
[32mConfusion matrix for model SVC on training data [0m
-------------------------------------------------

In [17]:
import joblib

# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full data passed to gs.fit, so the model is persisted as-is; fitting it again
# on the same data only repeated the training.
joblib.dump(
    gs.best_estimator_,
    "C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/models/classification/svm.pkl",
)

['C:/Users/Administrator/2023_Data_Mining_Coffee_Quality_Dataset/models/classification/svm.pkl']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# One-hot encode the selected columns of X into a dense DataFrame.
categorical_columns = ["Country.of.Origin", "Variety", "Processing.Method", "Color"]
hotencoder = OneHotEncoder(feature_name_combiner="concat")
encoder_df = pd.DataFrame(
    hotencoder.fit_transform(X[categorical_columns]).toarray(),
    columns=hotencoder.get_feature_names_out(),
    # Reuse X's row labels: X.join(encoder_df) aligns on the index, so a fresh
    # 0..n-1 index would misalign rows whenever X's index is not a plain range.
    index=X.index,
)

In [None]:
# Append the one-hot columns, then remove the original columns they were built from.
X = X.join(encoder_df).drop(
    columns=["Country.of.Origin", "Variety", "Processing.Method", "Color"]
)

In [None]:
# Hold out 33% of the rows for testing. stratify=Y keeps the class proportions
# equal in both splits; the classes are heavily imbalanced (see the counts
# printed by the resampling cell), so an unstratified split can leave the
# rarest classes almost or entirely out of one side.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, Y, test_size=0.33, random_state=42, stratify=Y
)

# Standardize using statistics of the training split only.
scaler = StandardScaler().fit(X_train1)
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the training split and project both splits with it.
pca = PCA(n_components=2).fit(X_train1)
X_train1, X_test1 = pca.transform(X_train1), pca.transform(X_test1)

In [None]:
# Grid search over `params` with 3-fold cross-validation for the PCA-reduced data.
# random_state pins the internal data shuffling SVC performs when
# probability=True, so the probability estimates are reproducible between runs.
estimator_pca = GridSearchCV(SVC(probability=True, random_state=42), params, cv=3, verbose=5)

In [None]:
# Run the grid search on the PCA-projected training split
estimator_pca.fit(X=X_train1, y=y_train1)

In [None]:
# Show the selected hyper-parameters and their mean cross-validation score, one per line
print(estimator_pca.best_params_, estimator_pca.best_score_, sep="\n")

In [None]:
# Imbalanced-aware report for the PCA model on its training split
report_imbalanced(estimator_pca.best_estimator_, X_train1, y_train1, text="training")

In [None]:
# Imbalanced-aware report for the PCA model on its held-out test split
report_imbalanced(estimator_pca.best_estimator_, X_test1, y_test1, text="test")