In [1]:
import math
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
def read_file(filename):
    read_data = pd.read_csv(filename, header=None)
    return read_data

In [3]:
df = read_file("../data/spam.csv")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
def get_x(df):
    return df.iloc[:, 0:-1].values

def get_y(df):
    return df.iloc[:, -1].values

def standardize(df):
    x = get_x(df)
    df_y = df.iloc[:, 57]

    standard_scaler = preprocessing.StandardScaler()
    x_scaled = standard_scaler.fit_transform(x)

    df_x = pd.DataFrame(x_scaled)
    df_scaled = df_x.join(df_y)

    return df_scaled

In [5]:
df = standardize(df)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,-0.342434,0.330885,0.712859,-0.0469,0.011565,-0.350266,-0.291794,-0.262562,-0.323302,-0.371364,...,-0.158453,-0.514307,-0.155198,0.624007,-0.308355,-0.103048,-0.045247,0.045298,-0.008724,1
1,0.345359,0.051909,0.43513,-0.0469,-0.256117,0.672399,0.244743,-0.08801,-0.323302,1.086711,...,-0.158453,-0.026007,-0.155198,0.126203,0.423783,0.008763,-0.002443,0.250563,1.228324,1
2,-0.145921,-0.165072,0.851723,-0.0469,1.364846,0.343685,0.193644,0.03667,1.974017,0.016422,...,-0.117376,0.014684,-0.155198,0.008496,0.440053,-0.079754,0.145921,2.221106,3.258733,1
3,-0.342434,-0.165072,-0.556761,-0.0469,0.472573,-0.350266,0.500237,1.308402,0.789462,0.605857,...,-0.158453,-0.007511,-0.155198,-0.161934,-0.308355,-0.103048,-0.05215,-0.062466,-0.152222,1
4,-0.342434,-0.165072,-0.556761,-0.0469,0.472573,-0.350266,0.500237,1.308402,0.789462,0.605857,...,-0.158453,-0.01491,-0.155198,-0.164387,-0.308355,-0.103048,-0.05215,-0.062466,-0.152222,1


In [6]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def loss(p, y, w, lambda_reg):
    epsilon = 1e-10
    
    error_loss = -np.average(
        y * np.log(p + epsilon) + (1 - y) * np.log(1 - p + epsilon)
    )
    reg_loss = lambda_reg * np.sum(np.square(w)) / (2 * y.size)
    
    return error_loss + reg_loss

def predict_probability(x, w, b):
    z = np.dot(x, w) + b
    return sigmoid(z)

def accuracy(df, w, b, verbose=False):
    x = get_x(df)
    y = get_y(df)
    predictions = predict(x, w, b)

    acc = np.average(predictions == y)

    if verbose:
        print("Accuracy: %0.4f\n" % acc)
    return acc

In [7]:
def train(df, iterations=1500, learning_rate=0.1, lambda_reg=0.01):
    x = get_x(df)
    y = get_y(df)

    w_train = np.zeros(x.shape[1])
    b = 0
    
    loss_array = []

    for it in range(iterations):
        predictions = predict_probability(x, w_train, b)
        
        gradient_w = np.dot(x.T, (predictions - y))
        gradient_b = np.average(predictions - y)
        regularization = lambda_reg * w_train

        w_train -= learning_rate * (gradient_w + regularization) / y.size
        b -= learning_rate * gradient_b

        if it % (iterations/5) == 0 or it+1 == iterations:
            print("It. %4d\t|\tLoss: %0.4f" % 
                  (it, loss(predictions, y, w_train, lambda_reg))
                 )
    
        if it % 10 == 0:
            loss_array.append(
                loss(predictions, y, w_train, lambda_reg)
            )
            
    return w_train, b

In [8]:
def predict(x, w, b, threshold=0.5):
    prob = predict_probability(x, w, b)
    return prob >= threshold

def get_block_data(df, fold, tot_folds):
    fold_size = math.floor(df.shape[0] / tot_folds)

    start_index = fold_size * fold
    end_index = start_index + fold_size

    df_test = df.loc[start_index:end_index]
    df.drop(df.loc[start_index:end_index].index, inplace=True)

    return df, df_test

def shuffle(df):
    return df.sample(frac=1).reset_index(drop=True)

In [30]:
def cross_validation(df, iterations= 10, learning_rate=10, 
                     lambda_reg=1, folds=10):
    avg_acc = 0
    df = shuffle(df)

    for i in range(folds):
        print("\nFold number " + str(i+1))
        tr_data, test_data = get_block_data(df.copy(), i, folds)
        w, b = train(
            tr_data,
            learning_rate=learning_rate,
            lambda_reg=lambda_reg
        )
        avg_acc += accuracy(test_data, w, b)

    avg_acc /= folds

    print("\nAVG acc: %0.4f" % avg_acc)
    return avg_acc

In [31]:
cross_validation(df)


Fold number 1
It.    0	|	Loss: 0.6987
It.  300	|	Loss: 0.2103
It.  600	|	Loss: 0.2102
It.  900	|	Loss: 0.2102
It. 1200	|	Loss: 0.2102
It. 1499	|	Loss: 0.2102

Fold number 2
It.    0	|	Loss: 0.6986
It.  300	|	Loss: 0.2082
It.  600	|	Loss: 0.2081
It.  900	|	Loss: 0.2081
It. 1200	|	Loss: 0.2081
It. 1499	|	Loss: 0.2081

Fold number 3
It.    0	|	Loss: 0.6986
It.  300	|	Loss: 0.2129
It.  600	|	Loss: 0.2129
It.  900	|	Loss: 0.2129
It. 1200	|	Loss: 0.2129
It. 1499	|	Loss: 0.2129

Fold number 4
It.    0	|	Loss: 0.6986
It.  300	|	Loss: 0.2119
It.  600	|	Loss: 0.2118
It.  900	|	Loss: 0.2118
It. 1200	|	Loss: 0.2118
It. 1499	|	Loss: 0.2118

Fold number 5
It.    0	|	Loss: 0.6986
It.  300	|	Loss: 0.2144
It.  600	|	Loss: 0.2144
It.  900	|	Loss: 0.2144
It. 1200	|	Loss: 0.2144
It. 1499	|	Loss: 0.2144

Fold number 6
It.    0	|	Loss: 0.6986
It.  300	|	Loss: 0.2092
It.  600	|	Loss: 0.2092
It.  900	|	Loss: 0.2092
It. 1200	|	Loss: 0.2092
It. 1499	|	Loss: 0.2092

Fold number 7
It.    0	|	Loss: 0.6986
It.  30

0.9249457700650758