In [1]:
import math
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
def read_file(filename):
    read_data = pd.read_csv(filename, header=None)
    return read_data

In [5]:
df = read_file("../data/spam.csv")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [6]:
def get_x(df):
    return df.iloc[:, 0:-1].values

def get_y(df):
    return df.iloc[:, -1].values

def standardize(df):
    x = get_x(df)
    df_y = df.iloc[:, 57]

    standard_scaler = preprocessing.StandardScaler()
    x_scaled = standard_scaler.fit_transform(x)

    df_x = pd.DataFrame(x_scaled)
    df_scaled = df_x.join(df_y)

    return df_scaled

In [7]:
df = standardize(df)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,-0.342434,0.330885,0.712859,-0.0469,0.011565,-0.350266,-0.291794,-0.262562,-0.323302,-0.371364,...,-0.158453,-0.514307,-0.155198,0.624007,-0.308355,-0.103048,-0.045247,0.045298,-0.008724,1
1,0.345359,0.051909,0.43513,-0.0469,-0.256117,0.672399,0.244743,-0.08801,-0.323302,1.086711,...,-0.158453,-0.026007,-0.155198,0.126203,0.423783,0.008763,-0.002443,0.250563,1.228324,1
2,-0.145921,-0.165072,0.851723,-0.0469,1.364846,0.343685,0.193644,0.03667,1.974017,0.016422,...,-0.117376,0.014684,-0.155198,0.008496,0.440053,-0.079754,0.145921,2.221106,3.258733,1
3,-0.342434,-0.165072,-0.556761,-0.0469,0.472573,-0.350266,0.500237,1.308402,0.789462,0.605857,...,-0.158453,-0.007511,-0.155198,-0.161934,-0.308355,-0.103048,-0.05215,-0.062466,-0.152222,1
4,-0.342434,-0.165072,-0.556761,-0.0469,0.472573,-0.350266,0.500237,1.308402,0.789462,0.605857,...,-0.158453,-0.01491,-0.155198,-0.164387,-0.308355,-0.103048,-0.05215,-0.062466,-0.152222,1


In [19]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def loss(p, y):
    EPSILON = 1e-10
    return -np.average(y * np.log(p + EPSILON) + (1 - y) * np.log(1 - p + EPSILON))

def predict_probability(x, w):
    z = np.dot(x, w)
    return sigmoid(z)

def accuracy(df, w, verbose=False):
    x = get_x(df)
    y = get_y(df)
    predictions = predict(x, w)

    acc = np.average(predictions == y)

    if verbose:
        print("Accuracy: %0.4f\n" % acc)
    return acc

In [9]:
def train(df, iterations=1500, learning_rate=0.1, lambda_reg=0.01):
    x = get_x(df)
    y = get_y(df)

    w_train = np.zeros(x.shape[1])

    for it in range(iterations):
        predictions = predict_probability(x, w_train)
        
        gradient = np.dot(x.T, (predictions - y)) / y.size
        regularization = lambda_reg * w_train / y.size

        w_train -= learning_rate * (gradient + regularization)

        if it % (iterations/5) == 0:
            print("It. %4d\t|\tLoss: %0.4f" % (it, loss(predictions, y)))
    return w_train

In [10]:
w = train(df)
w

It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2612
It.  600	|	Loss: 0.2480
It.  900	|	Loss: 0.2428
It. 1200	|	Loss: 0.2400


array([-6.57753052e-02, -1.27335937e-01,  8.77544688e-02,  4.41630764e-01,
        3.78545403e-01,  1.65242811e-01,  1.31971226e+00,  2.61207269e-01,
        1.69470312e-01,  7.07103315e-02,  5.85575100e-04, -1.36189211e-01,
       -2.75821697e-02,  2.06277506e-02,  4.82494393e-01,  5.33528097e-01,
        4.28965209e-01,  1.48938763e-01,  1.75268854e-01,  6.29714931e-01,
        2.59472629e-01,  4.80197190e-01,  1.38013904e+00,  4.35189903e-01,
       -1.05024465e+00, -4.70936548e-01, -7.25824652e-01,  1.78438478e-01,
       -1.85172424e-01, -1.58779913e-01, -6.34125952e-02,  6.45431277e-02,
       -3.84042448e-01,  4.24564165e-02, -2.26639056e-01,  2.25004476e-01,
       -5.50362870e-02, -6.99087197e-02, -1.88472434e-01, -1.09120234e-01,
       -2.71251729e-01, -5.50188297e-01, -1.16945942e-01, -4.05734132e-01,
       -5.28935370e-01, -5.88261349e-01, -1.41604856e-01, -3.33273091e-01,
       -3.57932829e-01, -6.67079307e-02, -1.07656158e-01,  5.29302056e-01,
        1.40364358e+00,  

In [11]:
def predict(x, w, threshold=0.5):
    prob = predict_probability(x, w)
    return prob >= threshold

def get_block_data(df, fold, tot_folds):
    fold_size = math.floor(df.shape[0] / tot_folds)

    start_index = fold_size * fold
    end_index = start_index + fold_size

    df_test = df.loc[start_index:end_index]
    df.drop(df.loc[start_index:end_index].index, inplace=True)

    return df, df_test

def shuffle(df):
    return df.sample(frac=1).reset_index(drop=True)

In [21]:
def cross_validation(df, learning_rate=0.1, lambda_reg=0.01, folds=10):
    avg_acc = 0

    for i in range(folds):
        print("\nFold number " + str(i+1))
        tr_data, test_data = get_block_data(df.copy(), i, folds)
        w = train(
            tr_data,
            learning_rate=learning_rate,
            lambda_reg=lambda_reg
        )
        avg_acc += accuracy(test_data, w)

        

    avg_acc /= folds

    print("\nAVG acc: %0.4f" % avg_acc)
    return avg_acc

In [22]:
df = shuffle(df)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,-0.342434,-0.165072,-0.556761,-0.0469,-0.464314,-0.350266,-0.291794,-0.262562,-0.323302,-0.371364,...,-0.158453,1.253929,-0.155198,-0.329912,-0.308355,-0.103048,-0.12162,-0.247205,-0.421074,0
1,-0.342434,-0.165072,-0.556761,-0.0469,-0.464314,-0.350266,-0.291794,-0.262562,-0.323302,-0.371364,...,-0.158453,-0.514307,-0.155198,-0.329912,-0.308355,-0.103048,-0.126884,-0.252336,-0.444165,0
2,1.360673,-0.165072,4.164636,-0.0469,-0.077663,-0.350266,0.37249,-0.262562,1.543269,-0.371364,...,-0.158453,1.912393,-0.155198,0.050184,-0.308355,-0.103048,0.011268,0.096614,0.466302,1
3,0.738384,-0.165072,-0.556761,-0.0469,-0.464314,-0.350266,-0.291794,-0.262562,-0.323302,-0.371364,...,-0.158453,0.13306,0.375053,-0.187683,-0.308355,-0.103048,-0.123574,-0.242073,-0.343552,0
4,-0.342434,-0.165072,-0.556761,-0.0469,0.963323,-0.350266,-0.291794,-0.262562,-0.323302,0.373185,...,0.757565,-0.514307,-0.155198,-0.329912,-0.308355,-0.103048,-0.120296,-0.242073,-0.37654,0


In [23]:
cross_validation(df)


Fold number 1
It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2610
It.  600	|	Loss: 0.2477
It.  900	|	Loss: 0.2424
It. 1200	|	Loss: 0.2395

Fold number 2
It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2633
It.  600	|	Loss: 0.2498
It.  900	|	Loss: 0.2443
It. 1200	|	Loss: 0.2413

Fold number 3
It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2510
It.  600	|	Loss: 0.2372
It.  900	|	Loss: 0.2322
It. 1200	|	Loss: 0.2296

Fold number 4
It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2623
It.  600	|	Loss: 0.2491
It.  900	|	Loss: 0.2438
It. 1200	|	Loss: 0.2409

Fold number 5
It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2592
It.  600	|	Loss: 0.2457
It.  900	|	Loss: 0.2403
It. 1200	|	Loss: 0.2373

Fold number 6
It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2592
It.  600	|	Loss: 0.2452
It.  900	|	Loss: 0.2395
It. 1200	|	Loss: 0.2364

Fold number 7
It.    0	|	Loss: 0.6931
It.  300	|	Loss: 0.2634
It.  600	|	Loss: 0.2505
It.  900	|	Loss: 0.2453
It. 1200	|	Loss: 0.2424

Fold number 8
It.    0	|	Loss: 0.6931
It.  300	|	Loss:

0.9206073752711494