In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
import numpy as np

path = "data/smoking.csv"
df = pd.read_csv(path)
df = df.replace({'gender':{'F':1,'M':0},'oral':{'Y':1,'N':0},'tartar':{'Y':1,'N':0}})
df.groupby("smoking").count()
df = df.groupby('smoking', group_keys=False).apply(pd.DataFrame.sample, frac=.05)
X, y = df.loc[:,df.columns!='smoking'].values, df['smoking'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75, random_state=0)
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1

In [2]:
lrgrid = {'C':[1],'fit_intercept':[True, False],'penalty':['elasticnet'],'solver':["saga"], "l1_ratio":[i/10 for i in range(0, 10+1)]}
lrgs = GridSearchCV(LogisticRegression(), lrgrid, cv=5).fit(X_train_scaled,y_train)
print("Geriausi parametrai: {}".format(lrgs.best_params_))
print("Rizika: ", 1 - lrgs.score(X_test_scaled, y_test))



Geriausi parametrai: {'C': 1, 'fit_intercept': True, 'l1_ratio': 0.4, 'penalty': 'elasticnet', 'solver': 'saga'}
Rizika:  0.2568149210903874




In [3]:
knngrid = {'n_neighbors':list(range(1, 10+1))}
knngs = GridSearchCV(KNeighborsClassifier(), knngrid, cv=5).fit(X_train_scaled,y_train)
print("Geriausi parametrai: {}".format(knngs.best_params_))
print("Rizika: ", 1 - knngs.score(X_test_scaled, y_test))

Geriausi parametrai: {'n_neighbors': 9}
Rizika:  0.29985652797704443


In [4]:
svcgrid = {'kernel':['rbf'],'gamma':[10**i for i in range(-3, 3+1)]}
svcgs = GridSearchCV(SVC(), svcgrid, cv=5).fit(X_train_scaled,y_train)
print("Geriausi parametrai: {}".format(svcgs.best_params_))
print("Rizika: ", 1 - svcgs.score(X_test_scaled, y_test))

Geriausi parametrai: {'gamma': 1, 'kernel': 'rbf'}
Rizika:  0.2453371592539455


In [5]:
### b)
print("Vid uztruko: ", knngs.cv_results_['mean_score_time'][9]) 

Vid uztruko:  0.018335771560668946


# 2

In [6]:
lr2grid = {'C':[1],'fit_intercept':[False],'penalty':['none','l2'],'solver':["saga"], "l1_ratio":[i/10 for i in range(0, 10+1)]}

In [7]:
one = LogisticRegression(penalty="none", fit_intercept = False, max_iter=10**6).fit(X_train_scaled, y_train)
one.score(X_test_scaled, y_test)

0.7489239598278336

In [8]:
two = LogisticRegression(C = 10**20,fit_intercept = False, max_iter=10**6).fit(X_train_scaled, y_train)
two.score(X_test_scaled, y_test)

0.7489239598278336

# 3

In [9]:
def kernel(xi, xj):
    xi = np.array(xi)
    xj = np.array(xj)
    step1 = np.sum(xi - xj)
    step2 = step1 ** 2
    step3 = 1 + step2
    step4 = 1 / step3
    return step4

def kmatrix(X1,X2):
    m = len(X1)
    n = len(X2)
    kmat = np.ndarray(shape=(m,n))
    for i in range(m):
        for j in range(n):
            kmat[i,j]=kernel(X1[i], X2[j])
    return kmat

In [10]:
kmat_train = kmatrix(X1=X_train_scaled,X2=X_train_scaled)

In [11]:
k_clf = SVC(C=0.9, kernel='precomputed').fit(kmat_train, y_train)

In [12]:
kmat_test = kmatrix(X1=X_test_scaled,X2=X_train_scaled)
ypred = k_clf.predict(kmat_test)

In [13]:
# Prognozes
ypred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [16]:
print("Score: ", sum(ypred == y_test) / len(y_test))

Score:  0.629842180774749
