Importy

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


Wczytanie danych

In [None]:
# Column names for the headerless UCI `acute/diagnosis.data` file: six
# attribute columns followed by the two diagnosis columns.
column_names = [
    "Temperature",
    "NauseaOccurance",
    "LumbarPain",
    "UrinePushing",
    "MicturitionPains",
    "UrethraBurning",
    "UrinaryBladderInflammation",
    "Nephritis",
]

# The file is tab-separated, UTF-16 encoded and has no header row.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data',
    sep="\t",
    header=None,
    names=column_names,
    encoding="utf16",
)

data.head()

Zamiana wartości tekstowych na liczby całkowite (1 dla 'yes', 0 dla 'no') oraz konwersja temperatury zapisanej z przecinkiem dziesiętnym na typ float

In [None]:

# Encode the yes/no answers as 1/0. Only the non-temperature columns are
# touched, one at a time, with an explicit mapping: any value other than
# "yes"/"no" becomes NaN and makes the integer cast fail loudly instead of
# silently leaving strings (and an object dtype) behind.
yes_no_codes = {"yes": 1, "no": 0}
for column_name in data.columns.drop("Temperature"):
    data[column_name] = data[column_name].map(yes_no_codes).astype("int64")

# Temperatures are stored as text with a decimal comma; switch to a decimal
# point before parsing them as floats.
data["Temperature"] = (
    data["Temperature"].str.replace(",", ".", regex=False).astype(float)
)

data.head()

Aby prawidłowo sklasyfikować próbki, musimy rozpatrzyć 4 możliwości występowania chorób. W tym celu zmapowaliśmy ostatnie dwie kolumny dotyczące tych chorób na cztery wartości wg wzoru `2*wartość z kolumny UrinaryBladderInflammation + wartość z kolumny Nephritis`

- nie występuje żadna choroba -> 0
- zapalenie nerek -> 1
- zapalenie pęcherza -> 2
- obie choroby -> 3



In [None]:
# Hold out 20% of the samples for testing. The first six columns are the
# features; the last two columns are the per-disease labels.
X_train, X_test, col_train, col_test = train_test_split(
    data.iloc[:, :6], data.iloc[:, -2:], test_size=0.20, random_state=42
)

# Fold the two binary label columns into a single 4-class target:
# 2 * (second-to-last column) + (last column).
y_train = 2 * col_train.iloc[:, 0] + col_train.iloc[:, 1]
y_test = 2 * col_test.iloc[:, 0] + col_test.iloc[:, 1]

clf = SVC()


def _fit_predict(model, **params):
    """Apply ``params`` to ``model``, refit on the training split and predict the test split."""
    model.set_params(**params)
    model.fit(X_train, y_train)
    return model.predict(X_test)


# Manual sweep over C; for every C the linear kernel is evaluated once and
# the rbf and poly kernels are evaluated for each gamma setting.
for C in [1.0, 100.0, 0.01]:
    y_pred = _fit_predict(clf, C=C, kernel='linear')
    print("ACC [linear, C=", C, "] = ", accuracy_score(y_test, y_pred))

    for gamma in ['scale', 'auto', 1.0, 10.0, 0.1]:
        y_pred = _fit_predict(clf, C=C, kernel='rbf', gamma=gamma)
        print("ACC [rbf, C=", C, ", gamma=", gamma, "] = ", accuracy_score(y_test, y_pred))

        # gamma is passed explicitly rather than inherited from the rbf run.
        y_pred = _fit_predict(clf, C=C, kernel='poly', gamma=gamma)
        print("ACC [poly, C=", C, ", gamma=", gamma, "] = ", accuracy_score(y_test, y_pred))


Walidacja krzyżowa dla wybranych hiperparametrów i wyliczenie przeciętnej dokładności klasyfikatora:

In [None]:
# Shuffled 10-fold split; the fixed seed makes the reported mean accuracy
# reproducible between runs.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# The folds are selected positionally with .iloc, so the reset is not needed
# for the loop itself; it is kept so the frames stay as earlier runs left them.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# NOTE: despite its name this list collects per-fold validation accuracies.
errors_valid = []

for train_idx, valid_idx in kf.split(X_train):
    X_fit, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_fit, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # A fresh classifier per fold, using the hyperparameters chosen above.
    fold_clf = SVC(C=100, kernel='rbf', gamma=0.1)
    fold_clf.fit(X_fit, y_fit)
    errors_valid.append(accuracy_score(y_val, fold_clf.predict(X_val)))

print(np.mean(errors_valid))  # average accuracy across the folds


Znalezienie optymalnych hiperparametrów na siatce:

In [None]:
# Hyperparameter grid: every combination of C, kernel and gamma is evaluated.
parameters = {
    'C': [0.1, 0.5, 1.0, 5.0, 10.0],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

# The gamma given to the base estimator is overridden by the grid values.
svc_reg = SVC(gamma='auto')
gcv_reg = GridSearchCV(estimator=svc_reg, param_grid=parameters, scoring='accuracy')
gcv_reg.fit(X_train, y_train)
print(gcv_reg.best_params_)

# Accuracy of the selected configuration on the held-out test split, followed
# by the cross-validated score reported by the search.
test_predictions = gcv_reg.predict(X_test)
print("Accuracy test:  ", accuracy_score(y_test, test_predictions))
print(gcv_reg.best_score_)

Narysowanie macierzy pomyłek dla najlepszego zestawu parametrów:

In [None]:
# `plot_confusion_matrix` was removed from scikit-learn in 1.2, so the plot is
# built with ConfusionMatrixDisplay instead. It is imported locally so this
# cell is self-contained.
from sklearn.metrics import ConfusionMatrixDisplay

# Hard-coded hyperparameters; presumably the best set reported by the grid
# search -- TODO confirm against gcv_reg.best_params_.
clf1 = SVC(C=0.1, kernel='poly', gamma='auto')
clf1.fit(X_train, y_train)
y_pred1 = clf1.predict(X_test)

# Combined 4-class target for the full data set (not used further in this cell).
target = 2*data.iloc[:,-2] + data.iloc[:,-1]

# Pin the row/column order to the classifier's classes so the matrix and the
# axis labels agree even if a class is missing from the test split.
cm = confusion_matrix(y_test, y_pred1, labels=clf1.classes_)
print(cm)

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf1.classes_).plot()
plt.show()