In [60]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from imblearn.over_sampling import SMOTE

In [61]:
df = pd.read_csv("wine.csv", delimiter=";")
display(df)
del df["ash"]

#Norm
numeric = ["alcohol", "malic acid","alcalinity","magnesium","total phenols","flavanoids","nonflavanoid phenols","proanthocyanins", "color intensity", "hue", "od280/od315", "proline"]
for col in numeric:
    min = df[col].min()
    max = df[col].max()
    df[col] = (2*(df[col] - min) / (max - min)) -1
    
display(df)
X = df.loc[:, df.columns != "class"]
y = df["class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,od280/od315,proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


Unnamed: 0,class,alcohol,malic acid,alcalinity,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,od280/od315,proline
0,1,0.684211,-0.616601,-0.484536,0.239130,0.255172,0.147679,-0.433962,0.186120,-0.255973,-0.089431,0.941392,0.122682
1,1,0.142105,-0.588933,-0.938144,-0.347826,0.151724,0.021097,-0.509434,-0.451104,-0.470990,-0.073171,0.560440,0.101284
2,1,0.121053,-0.359684,-0.175258,-0.326087,0.255172,0.223629,-0.358491,0.514196,-0.249147,-0.105691,0.391941,0.293866
3,1,0.757895,-0.521739,-0.360825,-0.065217,0.979310,0.329114,-0.584906,0.116719,0.112628,-0.382114,0.597070,0.714693
4,1,0.163158,-0.268775,0.072165,0.043478,0.255172,-0.008439,-0.018868,-0.110410,-0.481229,-0.089431,0.216117,-0.348074
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,0.410526,0.940711,0.020619,-0.456522,-0.517241,-0.886076,0.471698,-0.589905,0.095563,-0.739837,-0.655678,-0.340942
174,3,0.247368,0.252964,0.278351,-0.304348,-0.434483,-0.827004,0.132075,-0.369085,0.027304,-0.642276,-0.787546,-0.326676
175,3,0.178947,0.399209,-0.030928,0.086957,-0.579310,-0.852321,0.132075,-0.406940,0.522184,-0.821138,-0.787546,-0.205421
176,3,0.126316,-0.268775,-0.030928,0.086957,-0.537931,-0.856540,0.509434,-0.337539,0.368601,-0.804878,-0.743590,-0.198288


In [62]:
c = [0.001,0.01,0.1,1,10]
gamma = [0.001,0.01,0.1,1,10]

scores_smote = pd.DataFrame({'C': [], 'gamma': [], 'accuracy': []})
for C in c:
    for g in gamma:
        acc = []
        clf = SVC(kernel='rbf', C = C, gamma = g)
        kf = StratifiedKFold(n_splits=5)
        for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train), 1):  
            y_train_fold = y_train.iloc[train_index]  
            X_train_fold = X_train.iloc[train_index]
            X_val_fold = X_train.iloc[val_index]
            y_val_fold = y_train.iloc[val_index]  
    
            sm = SMOTE()
            X_train_fold_oversampled, y_train_fold_oversampled = sm.fit_resample(X_train_fold, y_train_fold)
  
            clf.fit(X_train_fold_oversampled, y_train_fold_oversampled )  
            y_pred_fold = clf.predict(X_val_fold)
            acc.append(accuracy_score(y_val_fold, y_pred_fold))
        scores_smote = scores.append(pd.Series({'C': C, 'gamma': g, 'accuracy': np.mean(acc)}), ignore_index = True)
best_config = scores_smote.iloc[scores['accuracy'].idxmax()]
print(f"Best configuration on validation set:\n{best_config}")
best_c = best_config['C']
best_gamma = best_config['gamma']
    

Best configuration on validation set:
C           10.000000
gamma        1.000000
accuracy     0.975667
Name: 23, dtype: float64


In [63]:
best_clf = SVC(kernel='rbf', C=  best_config['C'], gamma = best_config['gamma'])

sm = SMOTE()
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
best_clf.fit(X_train_oversampled, y_train_oversampled)

y_pred = best_clf.predict(X_test)
print("EVALUATION WITH SMOTE")
print("Accuracy %f"%best_clf.score(X_test, y_test))
print("Precision %f"%precision_score(y_test, y_pred,average="weighted",labels=np.unique(y_pred)))
print("Recall %f"%recall_score(y_test, y_pred,average="weighted",labels=np.unique(y_pred)))
print("F1-Score %f"%f1_score(y_test, y_pred,average="weighted",labels=np.unique(y_pred)))

EVALUATION WITH SMOTE
Accuracy 0.981481
Precision 0.982407
Recall 0.981481
F1-Score 0.981316


In [64]:
c = [0.001,0.01,0.1,1,10]
gamma = [0.001,0.01,0.1,1,10]

scores = pd.DataFrame({'C': [], 'gamma': [], 'accuracy': []})
for C in c:
    for g in gamma:
        acc = []
        clf = SVC(kernel='rbf', C = C, gamma = g)
        kf = StratifiedKFold(n_splits=5)
        for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train), 1):  
            y_train_fold = y_train.iloc[train_index]  
            X_train_fold = X_train.iloc[train_index]
            X_val_fold = X_train.iloc[val_index]
            y_val_fold = y_train.iloc[val_index]  
  
            clf.fit(X_train_fold, y_train_fold )  
            y_pred_fold = clf.predict(X_val_fold)
            acc.append(accuracy_score(y_val_fold, y_pred_fold))
        scores = scores.append(pd.Series({'C': C, 'gamma': g, 'accuracy': np.mean(acc)}), ignore_index = True)
best_config = scores.iloc[scores['accuracy'].idxmax()]
print(f"Best configuration on validation set:\n{best_config}")
best_c = best_config['C']
best_gamma = best_config['gamma']

Best configuration on validation set:
C           10.000000
gamma        1.000000
accuracy     0.975667
Name: 23, dtype: float64


In [65]:
best_clf = SVC(kernel='rbf', C=  best_config['C'], gamma = best_config['gamma'])
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)
print("EVALUATION WITHOUT SMOTE")
print("Accuracy %f"%accuracy_score(y_test, y_pred))
print("Precision %f"%precision_score(y_test, y_pred,average="weighted",labels=np.unique(y_pred)))
print("Recall %f"%recall_score(y_test, y_pred,average="weighted",labels=np.unique(y_pred)))
print("F1-Score %f"%f1_score(y_test, y_pred,average="weighted",labels=np.unique(y_pred)))

EVALUATION WITHOUT SMOTE
Accuracy 0.981481
Precision 0.982407
Recall 0.981481
F1-Score 0.981316
