In [3]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# 一.读取数据与数据预处理

In [4]:
data = pd.read_csv("D:/python/jupyter/ML-course-lab/train.csv")

In [4]:
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


### 把数据中price_range的0、1、2、3转化为对应的0(low)或1(high)

In [5]:
data.loc[data.price_range <= 1, 'price_range'] = 0
data.loc[data.price_range >= 2, 'price_range'] = 1

In [6]:
data['price_range']

0       0
1       1
2       1
3       1
4       0
       ..
1995    0
1996    1
1997    1
1998    0
1999    1
Name: price_range, Length: 2000, dtype: int64

### 标准化数据

In [7]:
x = data.drop('price_range', axis=1)
x_min = data.drop('price_range', axis=1).min()
x_max = data.drop('price_range', axis=1).max()
x = (x - x_min) / (x_max - x_min)
x

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,0.227789,0.0,0.68,0.0,0.052632,0.0,0.080645,0.555556,0.900000,0.142857,0.10,0.010204,0.170895,0.612774,0.285714,0.388889,0.944444,0.0,0.0,1.0
1,0.347361,1.0,0.00,1.0,0.000000,1.0,0.822581,0.666667,0.466667,0.285714,0.30,0.461735,0.993324,0.634687,0.857143,0.166667,0.277778,1.0,1.0,0.0
2,0.041416,1.0,0.00,1.0,0.105263,1.0,0.629032,0.888889,0.541667,0.571429,0.30,0.644388,0.811749,0.627205,0.428571,0.111111,0.388889,1.0,1.0,0.0
3,0.076152,1.0,0.80,0.0,0.000000,0.0,0.129032,0.777778,0.425000,0.714286,0.45,0.620408,0.858478,0.671566,0.785714,0.444444,0.500000,1.0,0.0,0.0
4,0.881764,1.0,0.28,0.0,0.684211,1.0,0.677419,0.555556,0.508333,0.142857,0.70,0.616327,0.475300,0.308658,0.214286,0.111111,0.722222,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.195725,1.0,0.00,1.0,0.000000,1.0,0.000000,0.777778,0.216667,0.714286,0.70,0.623469,0.927904,0.110102,0.571429,0.222222,0.944444,1.0,1.0,0.0
1996,0.977956,1.0,0.84,1.0,0.000000,0.0,0.596774,0.111111,0.891667,0.428571,0.15,0.466837,0.977971,0.474613,0.428571,0.555556,0.777778,1.0,1.0,1.0
1997,0.941884,0.0,0.16,1.0,0.052632,1.0,0.548387,0.666667,0.233333,1.000000,0.15,0.442857,0.755674,0.748530,0.285714,0.055556,0.166667,1.0,1.0,0.0
1998,0.675351,0.0,0.16,0.0,0.210526,1.0,0.709677,0.000000,0.541667,0.571429,0.25,0.171429,0.113485,0.163816,0.928571,0.555556,0.944444,1.0,1.0,1.0


### 获取需要分类的属性值

In [8]:
y = data['price_range']
y

0       0
1       1
2       1
3       1
4       0
       ..
1995    0
1996    1
1997    1
1998    0
1999    1
Name: price_range, Length: 2000, dtype: int64

### 随机拆分数据，分为train部分、valid部分和test部分，比例0.8：0.1：0.1

In [18]:
x_train, x_tmp, y_train, y_tmp = train_test_split(x, y, test_size=0.2, random_state=101, stratify=y)
x_valid, x_test, y_vaild, y_test = train_test_split(x_tmp, y_tmp, test_size=0.5, random_state=101, stratify=y_tmp)

print(x_train.shape)
print(x_valid.shape)
print(x_test.shape)

(1600, 20)
(200,)
(200, 20)
(200, 20)


# 二.朴素贝叶斯
## （1）函数部分：

### 1.datadiscretize(dataSet)
离散化数据的函数;  
输入为除了需要分类的属性外的整个数据集，按列离散化数据;  
由于之前已经标准化了，每列最小为0，最大为1，所以直接使用等隔划分为10个区间;  
使用从0到9的labels来区分不同区间。

In [10]:
def datadiscretize(dataSet):
    """离散化数据"""
    for (columnName, columnData) in dataSet.iteritems():
        # print(pd.cut(columnData.values, 10, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
        dataSet.loc[:, columnName] = pd.cut(columnData.values, 10, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### 2.getpa(y, x_column)
获取具体一个属性的条件概率;  
输入为：y：训练集中的目标属性的值；x_column：训练集中某一列的属性值;  
先用crosstab函数获取频数表，然后除以总数，即为概率

In [11]:
def getpa(y, x_column):
    """获取某个特征属性的条件概率"""
    crosstab = pd.crosstab(y, x_column, margins=True)
    m, n = np.shape(crosstab)
    for i in range(m - 1):
        crosstab.loc[i] = crosstab.loc[i] / crosstab.loc[i].max()
    return crosstab

### 3.getallpa(y, x)  
获取每个特征属性的条件概率，储存在一个list中  
输入为：y：训练集中的目标属性的值； x_column：训练集中的其他值  
遍历每列调用getpa

In [12]:
def getallpa(y, x):
    """获取每个特征属性的条件概率"""
    allpas = []
    for (columnName, columnData) in x.iteritems():
        allpas.append(getpa(y, x[columnName]))
    return allpas

### 4.getprepa(x, allpa)
输入预测属性值，返回预测分类结果  
输入为：x：预测的属性值；allpa：之前得到的所有属性的条件概率

In [13]:
def getprepa(x, allpa):
    """获取预测结果"""
    m, n = np.shape(x)
    y_pred = {}
    for index, row in x.iterrows():
        p1 = 1
        p0 = 1
        for j in range(n):
            p1 = p1 * allpa[j].loc[1, row[j]]
            p0 = p0 * allpa[j].loc[0, row[j]]
        if p1 / pc[1] >= p0 / pc[0]:
            result = 1
        else:
            result = 0
        y_pred[index] = result
    return pd.Series(y_pred)

## （2）分类部分
先把需要的数据复制一份，供朴素贝叶斯分类使用

In [14]:
mnb_x_train = x_train.copy()
mnb_y_train = y_train.copy()
mnb_x_test = x_test.copy()
mnb_y_test = y_test.copy()

离散化训练数据和测试数据

In [15]:
datadiscretize(mnb_x_train)
datadiscretize(mnb_x_test)
mnb_x_train

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1145,7,0,8,9,0,9,0,5,8,4,0,6,6,5,3,0,0,9,9,0
1792,8,0,6,9,0,0,2,1,1,4,3,2,2,3,3,4,4,9,9,0
88,1,0,0,9,0,0,0,0,6,5,6,3,4,6,2,2,3,9,9,9
345,1,9,3,0,0,9,8,6,9,4,1,8,9,2,8,2,6,9,0,0
721,2,0,0,9,0,0,0,0,9,4,0,3,9,9,7,4,5,9,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1775,0,9,9,0,0,9,4,8,3,1,1,0,5,2,0,1,9,9,0,0
1284,3,0,5,0,1,9,1,2,6,5,2,6,7,4,4,4,4,9,0,0
821,9,9,8,9,3,9,4,4,1,1,4,2,1,9,7,8,9,9,9,9
1652,0,9,9,9,1,9,1,7,2,1,4,3,8,2,0,0,1,9,0,9


计算两种分类的各自的概率,并获取所有属性的条件概率（学习过程）

In [16]:
mnb_start_train = time.clock()
pc = mnb_y_train.value_counts() / mnb_y_train.size
pas = getallpa(mnb_y_train, mnb_x_train)
mnb_end_train = time.clock()
mnb_time_train = mnb_end_train - mnb_start_train

print(pc)
print(mnb_time_train)

AttributeError: module 'time' has no attribute 'clock'

根据获取的条件概率与测试数据，做出预测分类(预测过程)

In [17]:
mnb_start_pred = time.clock()
mnb_y_pred = getprepa(mnb_x_test, pas)
mnb_end_pred = time.clock()
mnb_time_pred = mnb_end_pred - mnb_start_pred
print(mnb_time_pred)
print(mnb_y_pred)

AttributeError: module 'time' has no attribute 'clock'

与正确结果比较，得到正确率

In [22]:
mnb_score_pred = metrics.accuracy_score(mnb_y_test, mnb_y_pred)
mnb_score_pred

0.93

# 三.逻辑回归
## （1）函数部分：

### 1.getsigmoidy(x, w_b)
计算L函数的结果；  
输入为：  
x：除了需要分类的属性外其它属性，在最后增加一列全为1的列，方便和常数项添加常数项b；  
w_b：拟合的W和b参数；  
使用sigmoid函数 1 / (1 + e^-z)

In [23]:
def getsigmoidy(x, w_b):
    """1 / (1 + e^-z)"""
    r1 = np.dot(x, w_b)
    r2 = [1 / (1 + np.e ** (-i)) for i in r1.flatten()]
    return np.array(r2).reshape(-1, 1)

### 2.getcost(y,p)
计算损失  
输入为：  
y:正确的分类属性值；p:计算出来的分类属性值  
当y=1时，使用损失函数log(p)；  
当y=0时，使用损失函数log(1-p).  
为了防止p = 0 或 1-p = 0 导致log函数出错，每个log里加了10^-5

In [24]:
def getcost(y, p):
    """计算y和p的损失"""
    return -(1 - y) * np.log(p + 1e-5) - y * np.log(1 - p + 1e-5)

### 3.getmeancost(x, y, w_b, C)
根据所给训练数据计算平均损失  
输入：C：正则化系数

In [25]:
def getmeancost(x, y, w_b, C):
    """计算平均损失"""
    p = getsigmoidy(x.values, w_b)
    p_array = p.flatten()
    y_array = y.values
    #正则化
    regular_item = (C / len(y_array) * 2) * np.dot(w_b[:-1].T, w_b[:-1])

    #return sum(getcost(y_array[i], p_array[i]) for i in range(len(y_array))) / len(y_array)
    return sum(getcost(y_array[i], p_array[i]) for i in range(len(y_array))) / len(y_array) + float(regular_item)

### 4.getgradient(x, y, w_b)
计算梯度

In [26]:
def getgradient(x, y, w_b):
    """计算梯度"""
    p = getsigmoidy(x.values, w_b)
    y_array = np.array(y.values).reshape(-1, 1)
    return np.dot(x.values.T, p - y_array) / (len(y_array))

### 5.getpredict(x, w_b)
获取预测结果

In [27]:
def getpredict(x, w_b):
    """获取预测结果"""
    x['intercept'] = 1.0
    p = getsigmoidy(x.values, w_b)
    y_pred = list(map(lambda x: 1 if x >= 0.5 else 0, p))
    return np.array(y_pred).reshape(-1, 1)

### 6.score(x, y, w_b)
获取的预测正确率

In [42]:
def score(y, y_pred):
    """获取预测正确率"""
    y, y_pred = y.values.flatten(), y_pred.flatten()
    correct_number = sum([True for i in range(len(y)) if y[i] == y_pred[i]])

    return correct_number/len(y)

## （2）分类部分  
先把需要的数据复制一份，供逻辑回归使用

In [29]:
lr_x_train = x_train.copy()
lr_y_train = y_train.copy()
lr_x_test = x_test.copy()
lr_y_test = y_test.copy()

给x在最后添加全为1的一列

In [30]:
lr_x_train['intercept'] = 1.0
lr_x_train

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,intercept
1145,0.757515,0.0,0.84,1.0,0.000000,1.0,0.000000,0.555556,0.850000,0.428571,...,0.616837,0.608144,0.550241,0.357143,0.055556,0.000000,1.0,1.0,0.0,1.0
1792,0.852371,0.0,0.68,1.0,0.000000,0.0,0.209677,0.111111,0.166667,0.428571,...,0.282143,0.249666,0.346339,0.357143,0.444444,0.444444,1.0,1.0,0.0,1.0
88,0.111556,0.0,0.00,1.0,0.000000,0.0,0.016129,0.000000,0.625000,0.571429,...,0.339796,0.469292,0.665420,0.285714,0.277778,0.333333,1.0,1.0,1.0,1.0
345,0.102204,1.0,0.32,0.0,0.000000,1.0,0.806452,0.666667,0.966667,0.428571,...,0.875000,0.933244,0.208445,0.857143,0.222222,0.611111,1.0,0.0,0.0,1.0
721,0.205077,0.0,0.00,1.0,0.000000,0.0,0.096774,0.000000,0.933333,0.428571,...,0.315306,0.928571,0.936932,0.714286,0.500000,0.555556,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1775,0.048764,1.0,1.00,0.0,0.052632,1.0,0.451613,0.888889,0.383333,0.142857,...,0.047959,0.556742,0.243453,0.000000,0.166667,0.944444,1.0,0.0,0.0,1.0
1284,0.327321,0.0,0.60,0.0,0.105263,1.0,0.161290,0.222222,0.650000,0.571429,...,0.616837,0.786382,0.469802,0.428571,0.500000,0.444444,1.0,0.0,0.0,1.0
821,0.923180,1.0,0.88,1.0,0.315789,1.0,0.451613,0.444444,0.125000,0.142857,...,0.270918,0.194259,0.926777,0.785714,0.833333,1.000000,1.0,1.0,1.0,1.0
1652,0.083500,1.0,0.92,1.0,0.157895,1.0,0.145161,0.777778,0.241667,0.142857,...,0.376020,0.840454,0.219401,0.071429,0.055556,0.111111,1.0,0.0,1.0,1.0


逻辑回归需要的一些常数：  
diff：终止逻辑回归每次迭代前后损失差的阈值；  
regular_C：正则化系数C

In [31]:
diff = 1e-4
regular_C = 1

随机生成拟合函数的w和b参数  
计算损失值，用两个变量保存当前的损失值(cur_meancost)和之前的损失值(pre_meancost,初始值为0)  
开始拟合,直到前后损失差小于diff(学习过程)

In [35]:
lr_start_train = time.clock()
w_b_pred = np.random.randn(lr_x_train.shape[1], 1)

meancost = getmeancost(lr_x_train, lr_y_train, w_b_pred, regular_C)
pre_meancost = 0
cur_meancost = meancost

while abs(pre_meancost - cur_meancost) > diff:
    grad = getgradient(lr_x_train, lr_y_train, w_b_pred)

    #w_b_pred = w_b_pred - alpha * grad
    w_b_pred = w_b_pred * (1 - regular_C / lr_x_train.shape[0]) - grad

    meancost = getmeancost(lr_x_train, lr_y_train, w_b_pred, regular_C)

    pre_meancost = cur_meancost
    cur_meancost = meancost

lr_end_train = time.clock()
lr_time_train = lr_end_train - lr_start_train

print(lr_time_train)
print(meancost)
print(w_b_pred[:-1].flatten())
print(w_b_pred[-1].flatten())

94.60970700000001
3.346196506623079
[ 2.55312432 -0.06543612 -0.24614025 -0.20605761  0.04274361 -0.21319599
 -0.27946921 -0.59138104 -0.52878421  0.17171719 -0.29032137  1.61454356
  1.6205179  11.62778734 -0.36988378  0.14354952 -0.28472789  0.0230669
 -0.27642272 -0.230317  ]
[-6.71947081]


获取预测结果(预测过程）

In [39]:
lr_start_pred = time.clock()

lr_y_pred = getpredict(lr_x_test, w_b_pred)

lr_end_pred = time.clock()
lr_time_pred = lr_end_pred - lr_start_pred
print(lr_time_pred)
print(lr_y_pred.flatten())

0.0018769999999790343
[0 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 1 1 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 0
 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1
 1 1 0 1 1 1 0 1 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 1 0 1 0 0 1 0 1 1 1 0 0 0
 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0
 1 0 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0
 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0]


获取预测结果正确率

In [43]:
lr_score_test = score(lr_y_test, lr_y_pred)
lr_score_test

0.985

# 四.SVM
使用sklearn库中的LinearSVC()

In [45]:
sll_x_train = x_train.copy()
sll_y_train = y_train.copy()
sll_x_test = x_test.copy()
sll_y_test = y_test.copy()
sll = LinearSVC()

学习过程

In [47]:
sll_start_train_timer = time.clock()
sll.fit(sll_x_train, sll_y_train)
sll_end_train = time.clock()
sll_time_train = sll_end_train - sll_start_train

sll_time_train

0.01386200000001736

预测过程

In [48]:
sll_start_pred = time.clock()
sll_y_pred = sll.predict(sll_x_test)
sll_end_pred = time.clock()
sll_time_pred = sll_end_pred - sll_start_pred
print(sll_time_pred)
print(sll_y_pred)

0.0035110000000031505
[0 1 1 0 1 1 1 0 0 0 1 0 1 1 1 0 0 1 1 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 0
 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1
 1 1 0 1 1 1 0 1 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0
 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0
 1 0 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0
 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0]


In [49]:
confusion_matrix = metrics.confusion_matrix(sll_y_test, sll_y_pred)
confusion_matrix

array([[ 99,   1],
       [  0, 100]])

In [50]:
sll_score_test = metrics.accuracy_score(sll_y_test, sll_y_pred)
sll_score_test

0.995

# 五.总结
## （1）花费时间

In [54]:
alltimes = [["朴素贝叶斯分类",mnb_time_train, mnb_time_pred], 
            ["逻辑回归",lr_time_train, lr_time_pred],
            ["SVM",sll_time_train, sll_time_pred]]
timetable = pd.DataFrame(alltimes, columns=['类型','time_train(s)', 'time_pred(s)'])
timetable

Unnamed: 0,类型,time_train(s),time_pred(s)
0,朴素贝叶斯分类,1.362726,0.363276
1,逻辑回归,94.609707,0.001877
2,SVM,0.013862,0.003511
