In [261]:
import pandas as pd
import numpy as np  
import random
import matplotlib.pyplot as plt  
import time  
import math
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


In [427]:
# Fix the random state so the shuffle below (and later np.random draws made
# after this cell) give the same result on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# The CSV has no header row, so columns are addressed by integer position:
# columns 0-56 are used as features and column 57 as the label (see below).
df = pd.read_csv('spambase.csv', header = None)
# Shuffle the rows, because the train/test split below is purely positional.
df = df.sample(frac=1, random_state=SEED).reset_index(drop = True)

# Standardise every feature column to zero mean / unit standard deviation and
# remember the statistics that were used.
# NOTE: the statistics are computed over all rows, i.e. before the split.
feature_index = np.r_[0:57]
scaled_features = {}
for each in feature_index:
    mean, std = df[each].mean(), df[each].std()
    scaled_features[each] = [mean, std]
    # A constant column has std == 0; divide by 1 instead so it becomes all
    # zeros rather than NaN/inf.
    scale = std if std > 0 else 1.0
    # Assign a new column instead of writing through .loc: the scaled values
    # are floats, and writing them in place into an integer column relies on
    # implicit upcasting that newer pandas versions deprecate.
    df[each] = (df[each] - mean) / scale

# Number of rows held out for testing: ceil(n / 3), which is exactly what the
# slice bound `-n // 3` evaluates to.
test_size = -(-df.shape[0] // 3)

#1/3 data for testing
test_data = df[-test_size:]
test_data_feature = test_data[feature_index]
test_data_label = test_data[57]
#2/3 data for training
train_data = df[:-test_size]
train_data_feature = train_data[feature_index]
train_data_label = train_data[57]

In [None]:
# Show the training labels (column 57 of the training rows) as a NumPy array;
# being the last expression of the cell, its value is what the notebook renders.
train_data_label.values

In [None]:
def sigmoid(inX):
    """Element-wise logistic function, 1 / (1 + exp(-inX)).

    The value is computed as exp(min(x, 0)) / (1 + exp(-|x|)), which is
    algebraically the same expression but never exponentiates a positive
    number, so large-magnitude negative inputs no longer overflow inside
    np.exp (they simply evaluate to 0.0).

    Only NumPy ufuncs and arithmetic operators are used, so an np.matrix
    argument yields an np.matrix result of the same shape; scalars and plain
    ndarrays are handled as well.

    Parameters
    ----------
    inX : scalar or array-like
        Input value(s).

    Returns
    -------
    Same shape as ``inX`` with every element mapped into [0, 1].
    """
    # x >= 0: exp(0) / (1 + exp(-x))  ==  1 / (1 + exp(-x))
    # x <  0: exp(x) / (1 + exp(x))   ==  1 / (1 + exp(-x))
    return np.exp(np.minimum(inX, 0)) / (1 + np.exp(-np.abs(inX)))

In [421]:

def _logisticUpdate(weights, sample_x, sample_y, alpha):
    """Return weights after one logistic-regression step on a single sample.

    sample_x is a 1 x numFeatures np.matrix row and sample_y its scalar label.
    """
    output = sigmoid(sample_x * weights)
    error = sample_y - output
    return weights + alpha * sample_x.transpose() * error


def trainLogRegres(train_x, train_y, opts):
    """Fit logistic-regression weights with one of three update schemes.

    Parameters
    ----------
    train_x : array-like, shape (numSamples, numFeatures)
        Feature rows. Coerced with np.asmatrix, so an np.matrix or a 2-D
        ndarray are both accepted.
    train_y : array-like with numSamples entries
        One label per row of ``train_x``; reshaped to a column, so a flat
        sequence or an (numSamples, 1) matrix both work. The update uses
        ``label - sigmoid(...)``, so labels are presumably 0/1 - confirm
        against the data set.
    opts : dict
        'maxIter'      - number of outer iterations.
        'alpha'        - step size; only the 'stocGradDescent' mode uses it,
                         the other two modes compute their own decaying step.
        'optimizeType' - one of:
            'gradDescent'           one update per iteration on a single
                                    uniformly drawn sample, with step
                                    4 / (1 + k) + 0.01 (despite the name this
                                    is not a full-batch update);
            'stocGradDescent'       one pass over all samples in order per
                                    iteration;
            'smoothStocGradDescent' one pass per iteration visiting every
                                    sample exactly once in random order, with
                                    step 4 / (1 + k + i) + 0.01.

    Returns
    -------
    Weight column of shape (numFeatures, 1); the starting point is all ones.

    Raises
    ------
    NameError
        If ``opts['optimizeType']`` is not one of the names above (raised
        when the first iteration runs).
    """
    # calculate training time
    startTime = time.time()

    # The updates rely on np.matrix semantics (`*` is a matrix product and
    # row indexing keeps two dimensions), so coerce the inputs up front.
    train_x = np.asmatrix(train_x)
    train_y = np.asmatrix(train_y).reshape(-1, 1)

    numSamples, numFeatures = np.shape(train_x)
    alpha = opts['alpha']; maxIter = opts['maxIter']
    weights = np.ones((numFeatures, 1))

    for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':
            alpha = 4.0/(1.0 + k) + 0.01
            i = np.random.randint(0, numSamples)
            weights = _logisticUpdate(weights, train_x[i, :], train_y[i, 0], alpha)
        elif opts['optimizeType'] == 'stocGradDescent':
            for i in range(numSamples):
                weights = _logisticUpdate(weights, train_x[i, :], train_y[i, 0], alpha)
        elif opts['optimizeType'] == 'smoothStocGradDescent':
            # Draw samples without replacement within one pass. The pool must
            # be a list (a range object cannot have items removed), and the
            # sample id has to be looked up in the pool: the drawn position
            # only indexes what is still left.
            remaining = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                position = np.random.randint(0, len(remaining))
                sampleIndex = remaining.pop(position)
                weights = _logisticUpdate(
                    weights, train_x[sampleIndex, :], train_y[sampleIndex, 0], alpha)
        else:
            raise NameError('Not support optimize method type!')

    print ('training complete! Took %fs!' % (time.time() - startTime))

    return weights


In [422]:
def testLogRegres(weights, test_x, test_y):
    numSamples, numFeatures = np.shape(test_x)
    matchCount = 0
    for i in range(numSamples):
        predict = sigmoid(sum(test_x[i] * weights))
        if np.all(predict > 0.5):
            matchCount += 1
            if np.all(test_y[i,0] == 1):
                matchCount += 1
        #if predict == bool(test_y[i, 0]):
        #if predict == bool(test_y):
               # matchCount += 1
        
        else:
            #if np.all(test_y[i,0] == 0):
            matchCount += 1
    accuracy = float(matchCount*10) / (numSamples)
    return accuracy

In [435]:
def showLogRegres(weights, train_x, train_y, test_x=None, test_y=None,
                  iterations=70, opts=None):
    """Retrain the model repeatedly and plot train/test accuracy per run.

    Each run calls trainLogRegres on the training data and then scores the
    resulting weights with testLogRegres on both the training and the test
    data; the two score series are drawn in one figure.

    Parameters
    ----------
    weights :
        Not used - every run trains its own weights. Kept so existing calls
        of the form ``showLogRegres(weights, train_x, train_y)`` keep working.
    train_x, train_y :
        Training features (numSamples x numFeatures) and labels.
    test_x, test_y : optional
        Held-out features and labels. When omitted, the notebook-level
        ``test_data_feature`` / ``test_data_label`` frames are used.
    iterations : int, default 70
        Number of independent training runs.
    opts : dict, optional
        Options forwarded to trainLogRegres. Defaults to
        ``{'alpha': 0.01, 'maxIter': 200, 'optimizeType': 'gradDescent'}``.
    """
    if test_x is None:
        test_x = test_data_feature.values
    if test_y is None:
        test_y = test_data_label.values
    if opts is None:
        opts = {'alpha': 0.01, 'maxIter': 200, 'optimizeType': 'gradDescent'}

    # The training/testing helpers use np.matrix semantics; labels are shaped
    # as a column so that [i, 0] indexing works.
    train_x_matrix = np.asmatrix(train_x)
    train_y_matrix = np.asmatrix(train_y).reshape(-1, 1)
    test_x_matrix = np.asmatrix(test_x)
    test_y_matrix = np.asmatrix(test_y).reshape(-1, 1)

    accuracies = {'train': [], 'test': []}
    start = time.time()
    for _ in range(iterations):
        optimalWeights = trainLogRegres(train_x_matrix, train_y_matrix, opts)
        accuracies['train'].append(
            testLogRegres(optimalWeights, train_x_matrix, train_y_matrix))
        accuracies['test'].append(
            testLogRegres(optimalWeights, test_x_matrix, test_y_matrix))
    elapsed_time =float(time.time() - start)
    print('Run iterations', iterations, ', total time use{:.2f}'.format(elapsed_time),'s')

    fig, ax = plt.subplots()
    ax.plot(accuracies['train'], label = 'Training Acc')
    ax.plot(accuracies['test'], label = 'Testing Acc')
    ax.set_xlabel('Run')
    ax.set_ylabel('Accuracy')
    # The line labels are only visible once a legend is drawn.
    ax.legend()

In [436]:
# Wrap the feature/label arrays as np.matrix objects: trainLogRegres and
# testLogRegres use `*` as a matrix product and index labels as [i, 0].
# np.asmatrix is used because the np.mat alias no longer exists in NumPy 2.x;
# it produces the same object np.mat did.
train_x = train_data_feature.values
train_x_matrix = np.asmatrix(train_x)
train_y = train_data_label.values
# .values is one-dimensional, so asmatrix gives a single row; transpose it
# into a column with one label per sample.
train_y_matrix = np.asmatrix(train_y).transpose()
test_x = test_data_feature.values
test_x_matrix = np.asmatrix(test_x)
test_y = test_data_label.values
test_y_matrix = np.asmatrix(test_y).transpose()
opts = {'alpha': 0.01, 'maxIter': 200, 'optimizeType': 'gradDescent'}
optimalWeights = trainLogRegres(train_x_matrix, train_y_matrix, opts) ##y_matrix
# Last expression of the cell: the score on the held-out rows is displayed.
testLogRegres(optimalWeights, test_x_matrix, test_y_matrix)


training complete! Took 0.017464s!


  


13.31812255541069

In [438]:
#showLogRegres(optimalWeights, train_x_matrix, train_y_matrix)