In [4]:

# coding=UTF-8
import pandas as pd
import numpy as ny  
import matplotlib.pyplot as plt  
import time  
import math
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
 
df = pd.read_csv('spambase.csv', header = None)
df = df.sample(frac=1).reset_index(drop = True)
df.head()

feature_index = np.r_[0:57]
scaled_features = {}
for each in feature_index:
    mean, std = df[each].mean(), df[each].std()
    scaled_features[each] = [mean, std]
    df.loc[:, each] = (df[each] - mean)/std
df.head()

#1/3 data for testing
test_data = df[-df.shape[0]//3:]
test_data_feature = test_data[np.r_[0:57]]
test_data_label = test_data[57]
#2/3 data for training
train_data = df[:-df.shape[0]//3]
train_data_feature = train_data[np.r_[0:57]]
train_data_label = train_data[57]

def sigmoid(x):  #sigmoid函数
    return 1.0/(1+math.exp(-x))
 
def loadData():  #读取数据
    train_x = []  
    train_y = []  
    fileIn = open('testSet.txt')  
    for line in fileIn.readlines():  
        lineArr = line.strip().split()  
        train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])  #1.0代表x的0次项
        train_y.append(float(lineArr[2]))  
    return ny.mat(train_x), ny.mat(train_y).transpose()  
 
def logRegression(train_x,train_y):
    iteration_time = 600   #最大迭代次数
    delta=ny.zeros((3,1))   #初始化参数为0向量
    numSamples,numFeatures=ny.shape(train_x)  #获取训练样本的规模
    alpha=0.01  #迭代步长
    for k in range(iteration_time):
        alpha = 4.0 / (1.0 + k) + 0.01  
        i=ny.random.randint(0, numSamples)
        h=sigmoid(train_x[i,:]*delta)
        error=train_y[i,0]-h
        delta+=alpha*train_x[i,:].transpose()*error
    return delta
 
def calAccuracyRate(train_x,train_y,delta):
    count=0   #记录划分正确的样本数
    numSamples,numFeatures=ny.shape(train_x)  #获取训练样本的规模
    for i in range(numSamples):
        h=sigmoid(train_x[i,:]*delta)
        if h>=0.5 and int(train_y[i,0])==1 :
            count=count+1
        elif h<0.5 and train_y[i,0]==0 :
            count=count+1
    return count
 
def showGraph(train_x,train_y,delta):
    numSamples,numFeatures=ny.shape(train_x)  #获取训练样本的规模
    # 画出样本点
    plt.figure(figsize=(12,8))  #设置绘制尺寸
    for i in range(numSamples):  
        if int(train_y[i, 0]) == 0:  
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')  
        elif int(train_y[i, 0]) == 1:  
        plt.plot(train_x[i, 1], train_x[i, 2], 'ob')  
  
    # 绘制分割线
    min_x = min(train_x[:, 1])[0,0]-1  
    max_x = max(train_x[:, 1])[0,0]+1
    y_min_x = float(-delta[0,0] - delta[1,0] * min_x) / delta[2,0]  
    y_max_x = float(-delta[0,0] - delta[1,0] * max_x) / delta[2,0]  
    plt.plot([min_x, max_x], [y_min_x, y_max_x], 'y')  
    plt.xlabel('X1'); plt.ylabel('X2')  
    plt.show()  

 


IndentationError: expected an indented block (<ipython-input-4-900437c5582a>, line 77)

In [None]:
 
def testingLogR():
    train_x = train_data_feature.values
    train_y = train_data_label.values
    maxx=0.0
    numBegin=20  #起点数量
    for i in range(numBegin):
        delta=logRegression(train_x,train_y)
        cur=calAccuracyRate(train_x,train_y,delta)
        if cur>maxx:
            maxx=cur
            ans=delta
    numSamples,numFeatures=ny.shape(train_x)
    print("样本准确率为：",maxx*100/numSamples,"%")
    showGraph(train_x,train_y,ans)