In [39]:
import numpy as np
import pandas as pd
import math
import time
from sklearn.linear_model import LinearRegression, SGDClassifier
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [40]:
df = pd.read_csv('spambase.csv', header = None)
df = df.sample(frac=1).reset_index(drop = True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.506,11,113,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.634,0.0,0.211,0.0,0.211,0.0,1.347,4,31,0
2,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.083,0.167,0.0,0.502,0.0,0.0,1.547,11,113,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.29,0.0,0.43,...,0.0,0.124,0.0,0.31,0.062,0.0,1.477,8,65,1
4,0.0,0.0,0.87,0.0,0.0,0.0,0.0,0.0,0.0,0.87,...,0.0,0.608,0.0,0.0,0.0,0.0,2.941,11,100,0


In [41]:
#normlize the data first 

In [42]:
feature_index = np.r_[0:57]
scaled_features = {}
for each in feature_index:
    mean, std = df[each].mean(), df[each].std()
    scaled_features[each] = [mean, std]
    df.loc[:, each] = (df[each] - mean)/std
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,-0.342396,-0.165054,-0.5567,-0.046894,-0.464264,-0.350228,-0.291762,-0.262533,-0.323267,-0.371324,...,-0.158436,-0.514251,-0.155181,-0.329876,-0.308321,-0.103037,-0.116154,-0.21126,-0.280844,0
1,-0.342396,-0.165054,-0.5567,-0.046894,-0.464264,-0.350228,-0.291762,-0.262533,-0.323267,-0.371324,...,2.445567,-0.514251,1.773624,-0.329876,0.549814,-0.103037,-0.121166,-0.247178,-0.41608,0
2,-0.342396,-0.165054,-0.5567,-0.046894,0.279216,-0.350228,-0.291762,-0.262533,-0.323267,-0.371324,...,0.182466,0.103455,-0.155181,0.285567,-0.308321,-0.103037,-0.114862,-0.21126,-0.280844,0
3,-0.342396,-0.165054,-0.5567,-0.046894,-0.464264,-0.350228,-0.291762,2.953851,-0.323267,0.295596,...,-0.158436,-0.055595,-0.155181,0.050178,-0.056168,-0.103037,-0.117068,-0.226653,-0.360007,1
4,-0.342396,-0.165054,1.169001,-0.046894,-0.464264,-0.350228,-0.291762,-0.262533,-0.323267,0.978025,...,-0.158436,1.734641,-0.155181,-0.329876,-0.308321,-0.103037,-0.070928,-0.21126,-0.302284,0


In [43]:
#1/3 data for testing
test_data = df[-df.shape[0]//3:]
test_data_feature = test_data[np.r_[0:57]]
test_data_label = test_data[57]
#2/3 data for training
train_data = df[:-df.shape[0]//3]
train_data_feature = train_data[np.r_[0:57]]
train_data_label = train_data[57]

In [44]:
def sigmoid(x):
    return 1.0/(1 + math.exp(-x))

In [45]:
def logRegression(train_x, train_y):
    iteration_time = 300
    delta = np.zeros((3,1))
    numSamples, numFeatures = np.shape(train_data)
    alpha = 0.01 #step size for iterartion
    for k in range(iteration_time):
        alpha = 4.0 / (1.0 + k) + 0.01
        i = np.random.randint(0, numSamples)
        h = sigmoid(train_x[i, :]*delta)
        error = train_y[i, 0] - h
        delta += alpha * train_x[i, :].transpose()*error
    return delta

In [46]:
def calAccuracyRat(train_x, train_y, delta):
    count = 0;
    numSamples, numFeatures = np.shape(train_x) #get size of training sample
    for i in range(numSamples):
        h = sigmoid(train_x[i,:] * delta)
        if h >= 0.5 and int (train_y[i, 0]) == 1:
            count += 1
        elif h <0.5 and train_y[i, 0] == 0 :
            count += 1
        return count

In [47]:
def showGraph(train_x, train_y, delta):
    numSamples, numFeatures = np.shape(train_x)
    for i in range(numSamples):
        if int(train_y[i, 0]) == 0:
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
        elif int(train_y[i, 0] == 1):
            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
            
    min_x = min(train_x[:,1])[0,0] - 1
    max_x = max(train_x[:,1])[0,0] + 1
    y_min_x = float(-delta[0,0] - delta[1,0] * min_x) / delta[2,0]  
    y_max_x = float(-delta[0,0] - delta[1,0] * max_x) / delta[2,0]  
    plt.plot([min_x, max_x], [y_min_x, y_max_x], 'y')  
    plt.xlabel('X1'); plt.ylabel('X2')  
    plt.show()

In [48]:
def testingLogR():
    train_x = train_data_feature.values
    train_y = train_data_label.values
    maxx=0.0
    numBegin=20  #起点数量
    for i in range(numBegin):
        delta=logRegression(train_x,train_y)
        cur=calAccuracyRate(train_x,train_y,delta)
        if cur>maxx:
            maxx=cur
            ans=delta
    numSamples,numFeatures=ny.shape(train_x)
    print("样本准确率为：",maxx*100/numSamples,"%")


In [49]:
testingLogR()

TypeError: only length-1 arrays can be converted to Python scalars