In [1]:
# get logistic regression module
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import Snippets.log_regression as lr

# other libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Spambase-Data set
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data', sep=',', header=None)

In [14]:
# Split dataframe into X_train, X_test, y_train, y_test
X = df.iloc[:, :-1] 
y = df.iloc[:, -1] 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [16]:
# Standardize
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [5]:
def count_FP(y_true, y_predict):
    FP = 0;
    for i, elem in enumerate(y_true):
        if elem == 0 and y_predict[i] == 1:
            FP += 1
    return FP

In [6]:
def count_FN(y_true, y_predict):
    FN = 0;
    for i, elem in enumerate(y_true):
        if elem == 1 and y_predict[i] == 0:
            FN += 1
    return FN

In [7]:
# Calculate the Fß measure for ß = (.5, 1)
def get_F_measure(beta, y_true, y_predict):
    TP = len(y_true) - sum(abs(y_true-y_predict))
    FP = count_FP(y_true, y_predict)
    FN = count_FN(y_true, y_predict)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F = (1+beta**2) * (precision * recall) / (beta**2 * precision + recall)
    return F

In [8]:
beta = lr.get_coefficients(X_train, y_train, steps=50000, gamma=5e-5, method='gradient')

In [13]:
y_predict = lr.predict(X_test, beta)
print("F-Measure (beta = 0.5): {}".format(get_F_measure(0.5, y_test, y_predict)))
print("F-Measure (beta = 1): {}".format(get_F_measure(1, y_test, y_predict)))

F-Measure (beta = 0.5): 0.957205504309693
F-Measure (beta = 1): 0.9565545901020023


### Logistic ridge regression

In [91]:
# def basis(X, mu, lambd, d):
#     X_transform = []
#     for col in pd.DataFrame(X).columns:
#         kappa_values = []
#         for i in range(d):
#             kap = kappa(X[col], mu[i], lambd)
#             kappa_values.append(kap)
#         X_transform.append(kappa_values)
#     print(np.asarray(X_transform).shape)
#     return np.asarray(X_transform)

In [92]:
# def kappa(X, mu, lambd):
#     L2 = np.linalg.norm(X-mu)
#     kap = np.exp(-1/lambd*L2)
     
#     return kap

In [8]:
def ridge_get_coefficients(X_train, y_train, delta):
    #pad by one (I should really make a function for that)
    X_train = np.hstack((np.ones((X_train.shape[0],1)), X_train)) 
    
    # (X^tX + delta^2*I)^-1 * X^tY
    
    XtX = np.dot(X_train.T, X_train)
    deltaI = delta**2 * np.identity(X_train.shape[1])
    xy = np.dot(X_train.T, y_train)
    beta = np.dot(np.linalg.inv(XtX + deltaI), xy)
    
    return beta
    

In [9]:
def predict_RR(X_test, X_train, y_train, delta):
    beta = ridge_get_coefficients(X_train, y_train, delta)
    # pad also X_test
    X_test = np.hstack((np.ones((X_test.shape[0],1)), X_test)) 
    y_predict = np.dot(X_test, beta)
    return y_predict

In [10]:
# Sigmoid function to map results between 0-1
def sigmoid(result):
    result = np.exp(result) / (1 + np.exp(result))
    return result

In [11]:
# Use sklearns k fold helper
from sklearn.model_selection import KFold
kf = KFold(n_splits = 5)

max_score = 0
max_delta = 0

for i, (train, validate) in enumerate(kf.split(X_train)):
    X_tr, X_val = X_train[train], X_train[validate]
    y_tr, y_val = y_train.values[train], y_train.values[validate]
    
    delta = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
    for _ in range(len(delta)):
        y_predict = predict_RR(X_val, X_tr, y_tr, delta[_])
        y_predict = np.round(sigmoid(y_predict))
        
        score = get_F_measure(0.5, y_val, y_predict)
        if score > max_score:
            max_score = score
            max_delta = delta[_]
        print('Fold {}-{}: FScore = {}, delta = {}'.format(i+1, _+1, score, delta[_]))

print('Max Score: {} with delta: {}'.format( max_score, max_delta))

Fold 1-1: FScore = 0.5540214014497757, delta = 0.0001
Fold 1-2: FScore = 0.5540214014497757, delta = 0.001
Fold 1-3: FScore = 0.5540214014497757, delta = 0.01
Fold 1-4: FScore = 0.5540214014497757, delta = 0.1
Fold 1-5: FScore = 0.5540214014497757, delta = 1.0
Fold 1-6: FScore = 0.5570886512590547, delta = 10.0
Fold 1-7: FScore = 0.7592838196286472, delta = 100.0
Fold 1-8: FScore = 0.7936772046589019, delta = 1000.0
Fold 1-9: FScore = 0.7936772046589019, delta = 10000.0
Fold 2-1: FScore = 0.5775180474389825, delta = 0.0001
Fold 2-2: FScore = 0.5775180474389825, delta = 0.001
Fold 2-3: FScore = 0.5775180474389825, delta = 0.01
Fold 2-4: FScore = 0.5775180474389825, delta = 0.1
Fold 2-5: FScore = 0.5775180474389825, delta = 1.0
Fold 2-6: FScore = 0.5851063829787235, delta = 10.0
Fold 2-7: FScore = 0.7751164337990686, delta = 100.0
Fold 2-8: FScore = 0.8273026315789473, delta = 1000.0
Fold 2-9: FScore = 0.8273026315789473, delta = 10000.0
Fold 3-1: FScore = 0.5168813087365123, delta = 0.0

In [12]:
# It appears that delta = 1000 yields the best F-Scores on the validation sets
# predict on X_test with delta = 1000
y_predict = predict_RR(X_test=X_test, X_train=X_train, y_train=y_train, delta=1000)
y_predict = np.round(sigmoid(y_predict))
print("F-Measure (beta = 0.5): {}".format(get_F_measure(0.5, y_test, y_predict)))
print("F-Measure (beta = 1): {}".format(get_F_measure(1, y_test, y_predict)))

F-Measure (beta = 0.5): 0.8348595848595848
F-Measure (beta = 1): 0.8840404040404041
