# TEST 3 - Logistic regression on Spambase data set 
## This code is a sligth modification of the one written by Luis Vasquez
___

We shall load the data, preprocess this data following separately:

* Standardize the columns so they all have mean 0 and unit variance
* Transform the features using $log(x_{ij} + 0.1)$
* Binarize the features using $I(x_{ij} > 0)$

and search the optimum of the regularization strength parameter in each data version.

### **1. Loading the data from spamData.mat**

In [1]:
from scipy.io import loadmat
import pandas as pd
import numpy as np

In [2]:
def load_data(data_path, features_path):
    spam_data = loadmat(data_path)
    x_train = spam_data['Xtrain']
    x_test = spam_data['Xtest']
    y_train = spam_data['ytrain']
    y_test = spam_data['ytest']
    
    with open(features_path, 'r') as file:
        feature_names = file.read()
        feature_names = feature_names.split("\n")
    
    train_df = pd.DataFrame(np.c_[x_train, y_train], columns=feature_names+['target'])
    test_df = pd.DataFrame(np.c_[x_test, y_test], columns=feature_names+['target'])
    
    return train_df, test_df
    
train_df, test_df = load_data('spamData.mat', 'spamFeatures.txt')

In [3]:
train_df.sample(3)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
2126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.142,5.0,15.0,0.0
2113,0.0,14.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.8,5.0,9.0,0.0
1201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.763,21.428,62.0,150.0,1.0


### **2. Preprocessing/Transforming the data**

#### 2.1 Standardize the columns so they all have mean 0 and unit variance

In [4]:
def normalize(df):
    sub_df = df.loc[:,df.columns != 'target']
    x = (sub_df - sub_df.mean())/sub_df.std()
    x['target'] = df['target']
    return x

stnd_train_df = normalize(train_df)
stnd_test_df = normalize(test_df)

stnd_train_df.sample(3)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
814,-0.338634,-0.166313,4.598254,-0.047176,-0.478906,-0.367423,-0.303635,-0.258522,-0.336761,-0.352659,...,-0.161163,0.590144,-0.149441,0.738576,-0.304666,-0.102887,-0.075008,-0.18893,-0.398745,1.0
1775,-0.338634,-0.166313,1.354877,-0.047176,-0.011736,-0.367423,-0.303635,-0.258522,0.837239,-0.352659,...,0.068134,-0.108979,-0.149441,-0.242704,-0.304666,0.001113,-0.079486,-0.17533,-0.225882,0.0
2558,-0.338634,-0.166313,-0.566358,-0.047176,8.939838,-0.367423,-0.303635,-0.258522,-0.336761,-0.352659,...,-0.161163,-0.496185,-0.149441,-0.30468,-0.304666,-0.102887,-0.124815,-0.229731,-0.459659,0.0


#### 2.2 Transform the features using $log(x_{ij} + 0.1)$

In [5]:
def transform_log(df):
    x_values = df.loc[:, df.columns != 'target']
    x_values = x_values.applymap(lambda x:np.log(x + 0.1))
    x_values['target'] = df['target']
    df = x_values
    return df

log_train_df = transform_log(train_df)
log_test_df = transform_log(test_df)

log_train_df.sample(3)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
564,-2.302585,-2.302585,-2.302585,-2.302585,0.908259,-2.302585,-2.302585,0.908259,0.908259,0.908259,...,-2.302585,-2.302585,-2.302585,-0.701179,-2.302585,-2.302585,3.127374,5.004617,5.069533,1.0
1468,-2.302585,-2.302585,0.470004,-2.302585,-2.302585,-0.162519,-2.302585,-2.302585,-2.302585,-2.302585,...,-2.302585,-1.532477,-2.302585,-2.302585,-2.302585,-2.302585,0.659073,2.091864,3.095578,0.0
1291,0.09531,-2.302585,0.741937,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,...,-1.309333,-2.302585,-2.302585,-1.309333,-2.302585,-2.302585,0.583332,1.410987,3.095578,0.0


#### 2.3 Binarize the features using $I(x_{ij} > 0)$

In [6]:
def transform_binary(df):
    x_values = df.loc[:, df.columns != 'target']
    x_values = x_values.applymap(lambda x: int(x > 0))
    x_values['target'] = df['target']
    df = x_values
    return df

binary_train_df = transform_binary(train_df)
binary_test_df = transform_binary(test_df)

binary_test_df.sample(3)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
432,1,0,1,0,1,1,1,0,1,1,...,0,1,0,1,1,0,1,1,1,1.0
1249,0,1,0,0,0,1,0,1,0,0,...,1,1,0,1,0,0,1,1,1,0.0
706,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0.0


### **3. Optimizing via Cross Validation**

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [8]:
param_grid = {
    "C" : np.arange(0.01,8,.03)
}

data = {
    "stnd": [stnd_train_df, stnd_test_df],
    "log": [log_train_df, log_test_df],
    "binary": [binary_train_df, binary_test_df]
}

# We shall gather the mean error rates in a DataFrame
mer = pd.DataFrame(index=data.keys(), columns=['train', 'test'])

for name, df_list in data.items():
    print("-"*15)
    for part, df in zip(mer.columns, df_list):
        x = (df.loc[:, df.columns != 'target']).values
        y = (df.loc[:, 'target']).values
        gs = GridSearchCV(LogisticRegression(), param_grid=param_grid, cv=10)
        gs.fit(x,y)
        print(name + "_" + part + "opt. C value: ", gs.best_params_['C'])
        
        # Now we calculate the mean error rate performed by the optimum cross validation
        mer.loc[name,part] = 1 - gs.best_score_

---------------
stnd_trainopt. C value:  6.52
stnd_testopt. C value:  6.76
---------------
log_trainopt. C value:  0.04
log_testopt. C value:  0.13
---------------
binary_trainopt. C value:  0.22
binary_testopt. C value:  1.51


In [9]:
mer

Unnamed: 0,train,test
stnd,0.0861338,0.0833333
log,0.0600326,0.0657552
binary,0.0698206,0.0761719


### **4. Conclusion**

From the DataFrame mer above, we conclude that the log data version has the best performance since it has the lowest mean error rates.