# Using regularized logistic regression to classify email

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from sklearn import linear_model
#import sklearn.cross_validation
from sklearn import model_selection
#from sklearn.cross_validation import KFold
import scipy.io

# Render matplotlib figures inline in the notebook output
# rather than in a separate window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size for every plot in this notebook
# Image display defaults: no interpolation between pixels, grayscale colormap.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Automatically reload imported modules before each cell runs, so edits to
# external python files (e.g. utils.py) are picked up without a kernel restart;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
# No modifications in this cell
# complete the functions in utils.py; then run the cell

# Load the spam data: training/test feature matrices and their labels,
# as returned by utils.load_spam_data().
Xtrain,Xtest,ytrain,ytest = utils.load_spam_data()


# Preprocess the data: build three alternative representations of the
# training features. Going by the helper names these are standardized,
# log-transformed and binarized features -- the exact transforms are
# implemented in utils.py.

Xtrain_std,mu,sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# Apply the same preprocessing to the test set. The standardized version
# reuses the mu and sigma returned for the training set instead of
# recomputing them from the test data.
# NOTE(review): this divides by sigma directly -- presumably no feature has
# zero spread on the training set; confirm in utils.std_features.
Xtest_std = (Xtest - mu)/sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,typea,penalty):
    """Pick lambda by cross-validation, fit logistic regression, report test accuracy.

    The regularization strength is chosen with utils.select_lambda_crossval on
    the training data, a LogisticRegression with C = 1/lambda is fit on the
    full training set, and the fitted parameters and held-out accuracy are
    printed.

    Parameters
    ----------
    X : training feature matrix (one of the preprocessed variants).
    ytrain : training labels.
    Xt : test feature matrix, preprocessed the same way as X.
    ytest : test labels.
    typea : short name of the preprocessing variant; only used in the
        printed accuracy line.
    penalty : "l1" or "l2"; passed to both the lambda search and the model.

    Returns
    -------
    None. All results are printed.
    """
    # The three numbers are forwarded positionally to the lambda search;
    # presumably (low, high, step) of the lambda grid -- confirm in utils.py.
    best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty)
    print("best_lambda = %.3f" %best_lambda)

    # Train a classifier with the selected lambda. Only the solver differs
    # between penalties: lbfgs for l2, liblinear otherwise (liblinear is
    # needed for l1, which lbfgs does not support).
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(
        penalty=penalty,
        C=1.0/best_lambda,  # sklearn's C is the inverse of the regularization strength
        solver=solver,
        fit_intercept=True,
        max_iter=1000,
    )
    lreg.fit(X, ytrain)

    # Label intercept and weights separately. Wrapping each array in a
    # 1-tuple makes the %-formatting explicit instead of relying on `%`
    # binding tighter than the comma in the print call.
    print("Intercept = %s" % (lreg.intercept_,))
    print("Coefficients = %s" % (lreg.coef_,))

    predy = lreg.predict(Xt)
    # Flatten the labels so an (n, 1) column vector cannot broadcast against
    # the (n,) predictions into an (n, n) comparison and skew the accuracy.
    accuracy = np.mean(predy == np.ravel(ytest))
    print("Accuracy on set aside test set for %s = %.4f" %(typea, accuracy))

# Evaluate every feature representation under each penalty: all L2 runs
# first, then all L1 runs, with the representations in the same order.
feature_sets = (
    (Xtrain_std, Xtest_std, "std"),
    (Xtrain_logt, Xtest_logt, "logt"),
    (Xtrain_bin, Xtest_bin, "bin"),
)

for banner, penalty_kind in (
    ("L2 Penalty experiments -----------", "l2"),
    ("L1 Penalty experiments -----------", "l1"),
):
    print(banner)
    for train_feats, test_feats, label in feature_sets:
        run_dataset(train_feats, ytrain, test_feats, ytest, label, penalty_kind)

L2 Penalty experiments -----------
best_lambda = 0.100
l2

Coefficients = [-4.8600139] [[-2.74849489e-02 -2.25102956e-01  1.21909825e-01  2.27576317e+00
   2.70513880e-01  2.32896386e-01  9.28003099e-01  2.95213820e-01
   1.62436599e-01  6.78859726e-02 -8.32308921e-02 -1.60303840e-01
  -4.73303779e-02  1.09295555e-02  1.88629821e-01  8.20214946e-01
   5.10110961e-01  3.99081617e-02  2.67692614e-01  3.47612306e-01
   2.60420628e-01  3.63368920e-01  7.24494492e-01  1.96760499e-01
  -3.15921228e+00 -4.04133371e-01 -1.25631714e+01 -6.06044685e-02
  -1.55527248e+00 -5.61701735e-02 -3.27926430e-02  4.07387981e-01
  -3.68384977e-01 -1.39204855e+00 -5.81907790e-01  4.43910671e-01
   4.22445018e-02 -1.56975749e-01 -4.55689235e-01 -1.02193006e-01
  -3.52529217e+00 -1.73831925e+00 -4.36629010e-01 -1.06174294e+00
  -9.18491813e-01 -1.75102207e+00 -1.67557090e-01 -9.53957476e-01
  -3.65555990e-01 -1.36345044e-01 -6.58702266e-02  2.06736820e-01
   1.70656904e+00  1.22542721e+00 -3.33700159e-01  1.55