In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import sklearn.model_selection
from HW1.logit import Logit

In [2]:
def read_dataset(path):
    """Load a CSV file and split it into a feature matrix and a label vector.

    Every column except the last one is returned as the feature matrix.
    The last column is treated as the class label: the value 'P' becomes
    1 and any other value becomes -1.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the feature columns and ``y`` the
        +1/-1 encoded labels, both as NumPy arrays.
    """
    frame = pd.read_csv(path)
    feature_columns = frame.iloc[:, :-1]
    label_column = frame.iloc[:, -1]
    # Anything that is not exactly 'P' is treated as the negative class.
    encoded = label_column.map(lambda label: -1 if label != 'P' else 1)
    return feature_columns.values, encoded.values

In [3]:
def calc_f_score(X, y, alpha, n_splits=5, random_state=None):
    """Estimate the mean F1 score of ``Logit(alpha)`` with shuffled K-fold CV.

    For every fold a fresh ``Logit(alpha)`` is fitted on the training part
    and used to predict the held-out part. The per-fold F1 scores are
    summed and divided by ``n_splits``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows; must support indexing with integer index arrays.
    y : numpy.ndarray
        Labels aligned with ``X``. The value ``1`` is the positive class;
        every other value counts as negative.
    alpha
        Passed unchanged to the ``Logit`` constructor.
        NOTE(review): presumably the regularisation strength -- confirm
        against ``HW1.logit``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the fold shuffle. ``None`` keeps the previous behaviour
        (a different split on every call); pass an integer to make the
        score reproducible.

    Returns
    -------
    float
        Mean F1 score over the folds. A fold without any true positive
        contributes 0 to the mean.
    """
    cv = sklearn.model_selection.KFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )
    f_score_total = 0.0
    for train_indexes, test_indexes in cv.split(X):
        X_train = X[train_indexes]
        X_test = X[test_indexes]
        y_train = y[train_indexes]
        y_test = y[test_indexes]

        classifier = Logit(alpha)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        # Confusion-matrix cells needed for F1 (true negatives are not used).
        tp = np.sum((y_pred == 1) & (y_test == 1))
        fp = np.sum((y_pred == 1) & (y_test != 1))
        fn = np.sum((y_pred != 1) & (y_test == 1))

        # With tp == 0 precision/recall are 0 or undefined (0/0); the fold's
        # F1 is taken as 0, so nothing is added to the running total.
        if tp != 0:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f_score_total += 2 * precision * recall / (precision + recall)
    return f_score_total / n_splits

In [4]:
# Load the geyser data once, then report the cross-validated F-score for
# each candidate alpha passed to calc_f_score.
X, y = read_dataset('data/geyser.csv')
for alpha in (0.01, 0.1, 1.0, 10.0, 100.0):
    score = calc_f_score(X, y, alpha)
    print('alpha =', alpha, 'f-score =', score)

iters = 10000 0.42895790385121346 [ 0.06377694 -1.18175993  2.74974704] [ 5.31782697e-05  2.03249846e-03 -8.39659197e-03] 0.008639249653297884
iters = 20000 0.42728472241191157 [ 0.06170442 -1.26425345  3.08955294] [ 5.59071574e-06  2.31879626e-04 -9.52558494e-04] 0.0009803913001323592
iters = 30000 0.4272626347044096 [ 0.0614753  -1.2738014   3.12876433] [ 6.50459898e-07  2.72358000e-05 -1.11819059e-04] 0.00011509002548190285
iters = 40000 0.42726232941999887 [ 0.06144849 -1.27492484  3.13337655] [ 7.65834576e-08  3.21026110e-06 -1.31791258e-05] 1.356469674104025e-05
iters = 50000 0.4272623251777109 [ 0.06144533 -1.27505729  3.13392028] [ 9.02937729e-09  3.78547568e-07 -1.55404374e-06] 1.5995098430201826e-06
iters = 60000 0.4272623251187219 [ 0.06144496 -1.2750729   3.1339844 ] [ 1.06476119e-09  4.46397528e-08 -1.83258509e-07] 1.8862004762126393e-07
iters = 70000 0.4272623251179017 [ 0.06144491 -1.27507475  3.13399196] [ 1.25562159e-10  5.26411847e-09 -2.16106547e-08] 2.22429113210391

finish 0.6608818193950241 [-0.01503926 -0.05057506 -0.0026336 ] [ 3.71369602e-14  2.39525511e-10 -8.76801253e-10]
finish 0.6641081061814383 [-0.00857227 -0.05626045 -0.00416424] [ 1.44653456e-11  1.97261429e-10 -9.59048008e-10]
finish 0.6705295649075635 [-0.00801956 -0.04897909 -0.00270901] [ 1.73614900e-10 -6.36464381e-10 -6.02311954e-10]
finish 0.6574554871752493 [-0.01671461 -0.0503649  -0.00266384] [ 9.98978678e-12  2.03772665e-10 -9.21690953e-10]
alpha = 10.0 f-score = 0.0
finish 0.6843447246352091 [-0.00831056 -0.00735023 -0.00083625] [-5.55783308e-10 -1.19139476e-10 -3.28668065e-11]
finish 0.6875660388092047 [-0.00648105 -0.00598625 -0.00050396] [7.39749817e-10 1.55822577e-10 4.29464381e-11]
finish 0.6858892331809251 [-0.00710368 -0.00726926 -0.0008925 ] [9.68221947e-10 2.06569983e-10 5.67334513e-11]
finish 0.6869729526180386 [-0.00658475 -0.00671743 -0.00069177] [-7.73217157e-10 -1.67034608e-10 -4.58709321e-11]
finish 0.6849366310212748 [-0.00810931 -0.00704241 -0.00081774] [-5