In [1]:
import numpy as np
import pandas as pd
import cvxopt
from cvxopt import matrix, solvers

In [2]:
train_data = pd.read_csv("spam_train.data",header = None)
test_data = pd.read_csv("spam_test.data",header = None)
Valid_data = pd.read_csv("spam_validation.data",header = None)

In [3]:
def targetvar(data):
    data.loc[data[57]==0,57]=-1
    return data

In [4]:
train_data = targetvar(train_data)
test_data = targetvar(test_data)
Valid_data = targetvar(Valid_data)

In [5]:
def split(data):
    n = len(data.columns)-2
    X = data.iloc[:,0:n+1]
    Y = data.iloc[:,[n+1]]
    return (X,Y)

In [6]:
train_x,train_y = split(train_data)
Valid_x,Valid_y = split(Valid_data)
test_x,test_y = split(test_data)

In [7]:
m,n = train_x.shape[0],train_x.shape[1]

In [8]:
def P_calc():
    P = np.zeros((m+n+1,m+n+1))
    for i in range(n):
        P[i][i]=1
    return matrix(P,tc='d')

In [9]:
def Q_calc(C):
    Q = np.zeros((m+n+1,1))
    for i in range(n,m+n):
        Q[i][0]=C
    return matrix(Q,tc='d')

In [10]:
def G_calc(X,Y):
    G = np.zeros((2*m,m+n+1))
    for i in range(m):
        for j in range(n):
            G[i][j]=-1*X[i][j]*Y[i][0]
        G[i][n+i]=-1
        G[i][m+n]=-1*Y[i]
        G[m+i][n+i]=-1
    return matrix(G,tc='d')

In [11]:
def H_calc():
    H = np.zeros((2*m,1))
    for i in range(m):
        H[i][0]=-1
    return matrix(H,tc='d')

In [12]:
P,G,H = P_calc(),G_calc(train_x.values,train_y.values),H_calc()

In [13]:
def Accuracy(X,Y,W,b):
#     print(W,len(W),X.values[0],len(X.values[0]))
#     X['pred']=X.apply(lambda x: 1 if (np.dot(np.array(W),x.values)+b)>=0 else -1)
    prod =X.dot(np.array(W))+b
    Y = Y[57].to_numpy()
    res = sum(Y*np.array(prod)>0)
    return (res/len(X)*100)

In [14]:
Valid_acc =[0]*9
train_acc = [0]*9
test_acc = 0
final_C = 0
final_b=0
final_w = []

In [15]:
for i in range(9):
    C = 10**i
    Q = Q_calc(C)
    result = cvxopt.solvers.qp(P, Q, G, H)
    coeff = result['x']
    W = []
    for j in range(n):
        W.append(coeff[j])
    b = coeff[m+n]
    train_Acc = Accuracy(train_x,train_y,W,b)
    train_acc[i]= train_Acc
    print("Training accuracy with C:",C," is", train_Acc)
    Valid_Acc = Accuracy(Valid_x,Valid_y,W,b)
    print("Validation accuracy with C:",C," is", Valid_Acc)
    if Valid_Acc>max(Valid_acc):
        final_C = C
        final_b = b
        final_w = W
    elif Valid_Acc==max(Valid_acc):
        if train_Acc>max(train_acc):
            final_C = C
            final_b = b 
            final_w = W
    Valid_acc[i] = Valid_Acc

     pcost       dcost       gap    pres   dres
 0: -1.8124e+03  1.0398e+04  8e+04  6e+00  4e+04
 1:  5.6137e+03 -7.3332e+03  2e+04  1e+00  7e+03
 2:  3.7452e+03 -2.4540e+03  7e+03  4e-01  2e+03
 3:  2.3041e+03 -9.9333e+02  4e+03  2e-01  1e+03
 4:  1.6237e+03 -3.8918e+02  2e+03  1e-01  6e+02
 5:  1.3035e+03 -9.5546e+01  2e+03  6e-02  4e+02
 6:  1.1640e+03  1.5815e+01  1e+03  4e-02  3e+02
 7:  1.1113e+03  8.9810e+01  1e+03  3e-02  2e+02
 8:  1.0293e+03  1.6990e+02  9e+02  2e-02  1e+02
 9:  8.2001e+02  2.9315e+02  5e+02  1e-02  6e+01
10:  7.1530e+02  3.5051e+02  4e+02  6e-03  4e+01
11:  6.2597e+02  3.9397e+02  2e+02  3e-03  2e+01
12:  5.6753e+02  4.2284e+02  1e+02  2e-03  1e+01
13:  5.3370e+02  4.3939e+02  1e+02  1e-03  6e+00
14:  5.1168e+02  4.5038e+02  6e+01  5e-04  3e+00
15:  4.8730e+02  4.6134e+02  3e+01  3e-05  2e-01
16:  4.7777e+02  4.6801e+02  1e+01  7e-06  4e-02
17:  4.7454e+02  4.7040e+02  4e+00  2e-06  1e-02
18:  4.7311e+02  4.7153e+02  2e+00  5e-07  3e-03
19:  4.7243e+02  4.72

26:  4.8314e+06  4.0545e+06  8e+05  9e-15  6e-12
27:  4.6116e+06  4.1517e+06  5e+05  7e-15  1e-12
28:  4.5093e+06  4.2011e+06  3e+05  5e-15  2e-12
29:  4.4244e+06  4.2498e+06  2e+05  5e-15  2e-12
30:  4.3873e+06  4.2755e+06  1e+05  5e-15  5e-12
31:  4.3612e+06  4.2914e+06  7e+04  6e-15  3e-12
32:  4.3391e+06  4.3102e+06  3e+04  5e-15  1e-12
33:  4.3265e+06  4.3211e+06  5e+03  5e-15  2e-12
34:  4.3245e+06  4.3229e+06  2e+03  5e-15  6e-12
35:  4.3238e+06  4.3235e+06  3e+02  5e-15  1e-11
36:  4.3237e+06  4.3237e+06  2e+01  6e-15  2e-11
37:  4.3237e+06  4.3237e+06  3e-01  5e-15  1e-11
Optimal solution found.
(3000,)
(3000,)
Training accuracy with C: 10000  is 94.83333333333334
(800,)
(800,)
Validation accuracy with C: 10000  is 93.75
     pcost       dcost       gap    pres   dres
 0: -2.4068e+13  3.1068e+13  1e+14  3e+05  1e+04
 1:  7.8561e+12 -1.1937e+13  4e+13  7e+04  3e+03
 2:  6.3005e+12 -6.1208e+12  2e+13  3e+04  1e+03
 3:  3.5902e+12 -2.7267e+12  9e+12  1e+04  5e+02
 4:  2.9488e+12 

11:  7.8018e+16 -1.2499e+15  8e+16  7e+03  3e-01
12:  9.9410e+15 -1.3151e+14  1e+16  8e-09  4e-12
13:  2.4769e+15 -2.7092e+13  3e+15  3e-09  2e-12
14:  2.3794e+15 -1.4531e+13  2e+15  3e-09  1e-12
15:  1.9526e+14 -2.5818e+11  2e+14  8e-10  6e-12
16:  2.3047e+12  2.2004e+10  2e+12  9e-11  2e-12
17:  2.3063e+11  2.2599e+10  2e+11  8e-12  6e-12
18:  1.6338e+11  2.5951e+10  1e+11  4e-12  2e-12
19:  1.1957e+11  2.8802e+10  9e+10  2e-12  2e-12
20:  9.5079e+10  3.1221e+10  6e+10  1e-12  1e-12
21:  8.4848e+10  3.2912e+10  5e+10  8e-13  2e-13
22:  7.9305e+10  3.3815e+10  5e+10  7e-13  4e-13
23:  7.1973e+10  3.4941e+10  4e+10  5e-13  1e-13
24:  6.4330e+10  3.6231e+10  3e+10  3e-13  6e-13
25:  5.9402e+10  3.7298e+10  2e+10  2e-13  2e-12
26:  5.6046e+10  3.8281e+10  2e+10  1e-13  1e-12
27:  5.3169e+10  3.9129e+10  1e+10  7e-14  1e-12
28:  5.0854e+10  3.9779e+10  1e+10  4e-14  1e-12
29:  4.9577e+10  4.0233e+10  9e+09  3e-14  3e-13
30:  4.6828e+10  4.1290e+10  6e+09  1e-14  7e-12
31:  4.5644e+10  4.1

In [16]:
Valid_acc

[93.5, 93.875, 93.875, 93.75, 93.75, 93.75, 93.75, 93.75, 93.75]

In [17]:
train_acc

[94.46666666666667,
 94.73333333333333,
 94.86666666666666,
 94.83333333333334,
 94.83333333333334,
 94.83333333333334,
 94.83333333333334,
 94.83333333333334,
 94.83333333333334]

In [18]:
test_acc = Accuracy(test_x,test_y,final_w,final_b)

(801,)
(801,)


In [20]:
final_b

-1.0201993846160402

In [19]:
test_acc

63.17103620474407

In [21]:
final_C

10

In [22]:
final_w

[-0.3115233217727882,
 -0.0012388995604349098,
 -0.05108246004132275,
 1.177842789857678,
 0.4286998368799059,
 1.269150822448888,
 1.506416870378792,
 0.4016812390469861,
 0.5602494662524062,
 0.05075664506314839,
 -0.16003030007110008,
 -0.15381780817122867,
 0.049482236467909724,
 0.16028413277612913,
 -0.16081118236654612,
 0.9323722180281494,
 0.6014505340264568,
 0.060479646533057575,
 0.14382037431252384,
 0.3914920717210573,
 0.09938797942370525,
 4.2000059223457455,
 2.8136072339302465,
 1.1986321587762958,
 -1.0768837211456743,
 -0.49285932481002676,
 -6.3129480590004485,
 0.1993815147492203,
 -0.6971241984842487,
 -0.3468730822954625,
 0.4824760718806834,
 0.29657575574666384,
 -0.3226382033605984,
 -1.6059416676300609,
 -1.683876979340976,
 0.6057449415039657,
 -0.3605391233116566,
 0.12459925515715015,
 -0.44754479463704516,
 -0.5363981629584565,
 -4.800004014283575,
 -1.459496960647851,
 -0.9851454040722055,
 -0.8544493740597742,
 -0.43029582817674594,
 0.8122835253777533