# Regression

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pickle
import time


## Tool Function

### 准确率和混淆矩阵

In [2]:
'''
通过矩阵方法计算预测正确率和混淆矩阵
'''
# 通过矩阵计算正确率
def calPrecRate(predResult, origResult):
    predResult = np.matrix(predResult)
    origResult = np.matrix(origResult)
    m, _n = predResult.shape

    return float(sum(predResult == origResult)) / m

# 通过矩阵计算计算混淆矩阵
def calConfMatrix(predResult, origResult):
    predResult = np.matrix(predResult)
    origResult = np.matrix(origResult)
    m, _n = predResult.shape

    # 通过计算准确率的方式计算混淆矩阵的四个值，最终返回混淆矩阵（pd.DataFrame）和正确率
    zeroZero = calPrecRate(predResult + origResult, np.zeros((m, 1))) * m        # 0 + 0 = 0
    zeroOne = calPrecRate(predResult - origResult, np.ones((m, 1))) * m          # 1 - 0 = 1
    oneZero = calPrecRate(origResult - predResult, np.ones((m, 1))) * m          # 1 - 0 = 1
    oneOne = calPrecRate(predResult + origResult, 2 * np.ones((m, 1))) * m       # 1 + 1 = 2

    confMatrix = pd.DataFrame([[oneOne, oneZero], [zeroOne, zeroZero]], columns=['one', 'zero'], index=['one', 'zero'])
    accuracy = (zeroZero + oneOne) / (zeroZero + oneOne + zeroOne + oneZero)
    return confMatrix, accuracy

# 通过混淆矩阵计算准确率、召回率和F1-Score
def calPRF(confMatrix):
    precision = confMatrix.iloc[0, 0] / (confMatrix.iloc[0, 0] + confMatrix.iloc[1, 0])
    recall = confMatrix.iloc[0, 0] / (confMatrix.iloc[0, 0] + confMatrix.iloc[0, 1])
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

### Error Rate

In [3]:
'''
计算Error Rate
'''
def calErrorRate(errorArray):
    errorVec = np.array(errorArray).reshape((1, -1))
    errorVec = np.abs(errorVec)
    return errorVec.sum() / errorVec.shape[1]

### Smooth Line

In [4]:
'''
按照一定步长对lst进行采样
'''
def smoothLine(lst, step):
    result = []
    for i in range(len(lst) // step + 1):
        lstF = lst[i * step:(i + 1) * step]
        if len(lstF) > 0:
            av = np.average(lstF)
            var = np.var(lstF) ** 0.5
            result.append([i * step, av, av - var, av + var])
    return result

## Logistic Regression代码实现

### Sigmoid函数

In [5]:
def sigmoid(inX):
    return 1 / (1 + np.exp(-inX))

# 避免上溢报错问题
def sigmoid(inX):
    x_shape = inX.shape                                      # 获取输入矩阵形状
    x = np.array(inX).ravel()                                # 铺平
    y = []
    for i in range(len(x)):
        if  x[i] >= 0:
            y.append(1 / (1 + np.exp(-x[i])))
        else:
            y.append(np.exp(x[i]) / (1 + np.exp(x[i])))      # 当某一个元素小于0时，用另一个公式计算，解决上溢问题
    return np.matrix(np.array(y).reshape(inX.shape))

### 批量梯度上升算法

In [6]:
'''
批量Logistic Regression梯度上升算法
'''
def batchGradAscent(dataMatIn, classLabels, maxCycles=2500):
    # 将输入转化为np的matrix数据类型
    dataMatrix = np.matrix(dataMatIn, copy=True)
    labelMat = np.matrix(classLabels)

    m, n = dataMatrix.shape
    weights = np.ones((n, 1))                                  # 将权重初始化为1

    # 循环进行梯度上升，最多循环maxCycles次
    for k in range(maxCycles):
        alpha = 1 / (1 + k)
        h = sigmoid(dataMatrix * weights)                      # 计算当前weights下logistic regression分类结果
        error = (labelMat - h)                                 # 计算当前weights下的误差
        weights += alpha * dataMatrix.transpose() * error      # 梯度上升
    return weights

### 随机梯度上升算法

In [7]:
'''
随机梯度上升算法
'''
def stocGradAscent(dataMatIn, classLabels, alpha=0.01):
    # 将输入转化为np的matrix数据类型
    dataMatrix = np.matrix(dataMatIn, copy=True)
    labelMat = np.matrix(classLabels).transpose()

    m, n = dataMatrix.shape
    weights = np.ones((n, 1)) # 生成初始weights
    # 遍历训练集的每一行
    for i in range(m):
        test = dataMatrix[i]
        h = sigmoid(dataMatrix[i] * weights)
        error = float(labelMat[i] - h)
        weights += alpha * error * dataMatrix[i].reshape((n, 1))
    return weights


In [8]:
'''
改进的随机梯度上升算法
相比起原函数，设置了更新全样本的论数：numIter，即更新numIter轮，每一轮更新m次，一次根据一条数据进行更新
每次随机从数据集中抽取一条数据更新weights，且不可重复
'''
def stocGradAscentImproved(dataMatIn, classLabels, numIter=2500):
    # 将输入转化为np的matrix数据类型
    dataMatrix = np.matrix(dataMatIn, copy=True)
    labelMat = list(classLabels.ravel())
    m, n = dataMatrix.shape
    
    weights = np.ones((n, 1))                                                   # 生成初始weights
    # 外循环：numIter次
    for j in range(numIter):
        dataIndex = list(range(m))
        # 内循环：dataMatrix行数
        for i in range(m):
            alpha = 4 / (4.0 + j + i)                                           # 随着轮数的增加，更新的步幅变小
            randIndex = dataIndex.pop(random.randint(0, len(dataIndex) - 1))    # 获取dataMatrix中的随机一行
            # 更新weights                                                        #（不放回抽样）
            h = sigmoid(dataMatrix[randIndex] * weights)
            error = float(labelMat[randIndex] - h)
            weights += alpha * error * dataMatrix[randIndex].reshape((n, 1))
    return weights

### 小批量随机梯度上升

In [9]:
def checkGradThreshold(grad, threshold):
    '''
    当有一个元素比阈值大，就返回True
    '''
    check = np.abs(grad) - threshold
    if True in (check > 0).ravel():
        return True
    else:
        return False

def miniBatchGradAscent(dataMatIn, classLabels, batchSize=10, gradThreshold=1e-10, \
    jThreshold=0.9, maxCycles=2000):
    # 初始化参数
    m, n = dataMatIn.shape
    dataMatrix = np.matrix(dataMatIn)
    labelMat = np.matrix(np.array(classLabels).reshape((m, 1)))
    weights = np.matrix(np.ones((n, 1)))
    dataIndex = list(range(len(dataMatrix)))
    
    # 最多循环maxCycles次
    for i in range(maxCycles):
        dataIndex = list(range(m))
        random.shuffle(dataIndex)
        errorRate = 0
        # 获取随机小批量数据
        for j in range(math.floor(m / batchSize) + 1):
            randomIndex = dataIndex[batchSize * j:batchSize * j + batchSize]  # b，从数据中抽取batchSize行用于更新数据
            if len(randomIndex) > 0:
                alpha = 4 / (4 + i)                                           # 随着轮数的增加，更新的步幅变小
                batchData = dataMatrix[randomIndex]                           # b * n
                batchLabels = labelMat[randomIndex]                           # b * 1

                # 用小批量数据更新weights
                h = sigmoid(batchData * weights)                              # b * 1
                error = batchLabels - h                                       # b * 1
                errorRate += np.abs(error).sum()
                gradient = batchData.transpose() * error                      # n * 1
                if not checkGradThreshold(gradient, gradThreshold):
                    return weights
                weights += alpha * gradient
        
        if 1 - errorRate / m > jThreshold:
            return weights
    return weights

In [10]:
'''
分类函数，根据回归得到的weights计算输入的分类结果
'''
# 对行分类
def classifyVector(inX, weights):
    prob = float(sigmoid(inX.T * weights))
    if prob > 0.5:
        return 1
    else:
        return 0

# 对矩阵分类，用矩阵运算，比迭代调用classifyVector快很多
def classifyMatrix(inX, weights):
    inX = np.matrix(inX)
    weights = np.matrix(weights)
    probs = sigmoid(inX * weights)
    return np.round(probs)


In [11]:
'''
使用改进的随机梯度上升进行多次预测并取平均值
'''
def stocMultiTest(trainDataSet, trainLabels, testDataSet, testLabels, roundNum=10):
    precAvg = 0
    origResult = np.matrix(testLabels.values)
    confMatrixAvg = pd.DataFrame(columns=['one', 'zero'], index=['one', 'zero']).fillna(0)       
    for i in range(roundNum):
        trainWeights = stocGradAscentImproved(trainDataSet.values, trainLabels.values, numIter=500)      # 使用改进的随机梯度上升训练模型
        predResult = classifyMatrix(testDataSet.values, trainWeights)                                    # 使用logistic regression回归模型预测测试集

        confMatrix, accuracy = calConfMatrix(predResult, origResult)       
        confMatrixAvg += confMatrix
        precAvg += accuracy
    
    confMatrixAvg /= roundNum
    precAvg /= roundNum
    
    return confMatrixAvg, precAvg

In [12]:
'''
使用小批量随机梯度上升进行多次预测并取平均值
'''
def miniBatchMultiTest(trainDataSet, trainLabels, testDataSet, testLabels, roundNum=10, **miniBatchParameters):
    accuracyAvg = 0
    origResult = np.matrix(testLabels.values)
    confMatrixAvg = pd.DataFrame(columns=['one', 'zero'], index=['one', 'zero']).fillna(0)       
    for i in range(roundNum):
        trainWeights = miniBatchGradAscent(trainDataSet.values, trainLabels.values, **miniBatchParameters)    # 使用改进的随机梯度上升训练模型
        predResult = classifyMatrix(testDataSet.values, trainWeights)                                         # 使用logistic regression回归模型预测测试集
        
        confMatrix, accuracy = calConfMatrix(predResult, origResult)
        confMatrixAvg += confMatrix
        accuracyAvg += accuracy
    
    confMatrixAvg /= roundNum
    accuracyAvg /= roundNum
    
    return confMatrixAvg, accuracyAvg

## 重新清洗数据集并进行Logistic Regression

### 导入原始数据

In [13]:
trainDataSet = []
testDataSet = []
def cleanBlank(lst):
    while '' in lst:
        lst.remove('')
    return lst
def repUnknown(lst):
    while '?' in lst:
        id = lst.index('?')
        lst[id] = None
    return lst
with open('./Data/horse-colic.data', 'r') as f:
    for row in f.readlines():
        if row[-1] == '\n':
            row = row[:-1]
        row = row.split(' ')
        row = cleanBlank(row)
        row = repUnknown(row)
        trainDataSet.append(row)

with open('./Data/horse-colic.test', 'r') as f:
    for row in f.readlines():
        if row[-1] == '\n':
            row = row[:-1]
        row = row.split(' ')
        row = cleanBlank(row)
        row = repUnknown(row)
        testDataSet.append(row)

columns = ['surgery', 'age', 'hospital_number', 'rectal_temperature', 'pulse', \
    'respiratory_rate', 'temperature_extremities', 'peripheral_pulse', 'mucous_membranes', \
    'capllary_refill_time', 'pain', 'peristalsis', 'abdominal_distension', 'nasogastric_tube', \
    'nasogastric_reflux', 'nasogastric_reflux_PH', 'rectal_examination_feces', 'abdomen', \
    'packed_cell_volume', 'total_protein', 'abdominocentesis_appearance', \
    'abdomcentesis_total_protein', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2', \
    'lesion_3', 'cp_data']
trainDataSet = pd.DataFrame(trainDataSet, columns=columns).astype('float')
testDataSet = pd.DataFrame(testDataSet, columns=columns).astype('float')
trainDataSet.head(3)

Unnamed: 0,surgery,age,hospital_number,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,mucous_membranes,capllary_refill_time,...,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,2.0,1.0,530101.0,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2.0,11300.0,0.0,0.0,2.0
1,1.0,1.0,534817.0,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2.0,2208.0,0.0,0.0,2.0
2,2.0,1.0,530334.0,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2.0,0.0,0.0,0.0,1.0


### 无效数据删除、选取特征和顺序变换

In [14]:
'''
清理数据：
删除surgical_lesion列为空的数据

需要被删掉的属性： 
1. Hospital Number: 马的号码
2. outcome
3. lession 1
4. lession 2
5. lession 3
6. cp_data
需要改变顺序的属性：
1. temperature of extremities
2. peripheral pulse
3. rectal examination - feces
4. surgical_lesion
'''

# 删除surgical_lesion为空的数据
trainDataSet = trainDataSet[trainDataSet.outcome == trainDataSet.outcome] # 这里利用了np.nan的特性
testDataSet = testDataSet[testDataSet.outcome == testDataSet.outcome]

# 删除不必要的属性 
for column in ['hospital_number', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3', 'cp_data']:
    if column in columns:
        columns.remove(column)
trainDataSet = trainDataSet.loc[:, columns]
testDataSet = testDataSet.loc[:, columns]

# 改变部分离散属性的排列属性
tEMap = {
    4 : 1, 
    3 : 2, 
    1 : 3, 
    2 : 4, 
}
pPMap = {
    4 : 1, 
    3 : 2, 
    1 : 3, 
    2 : 4, 
}
rEFMap = {
    4 : 1, 
    3 : 2, 
    1 : 3, 
    2 : 4, 
}
oCMap = {
    1 : 1, 
    2 : 0, 
    3 : 0, 
}
def transformAttributes(dataframe):
    dataframe['temperature_extremities'] = dataframe['temperature_extremities'].map(lambda x:tEMap[x] if x == x else x)
    dataframe['peripheral_pulse'] = dataframe['peripheral_pulse'].map(lambda x:pPMap[x] if x == x else x)
    dataframe['rectal_examination_feces'] = dataframe['rectal_examination_feces'].map(lambda x:rEFMap[x] if x == x else x)
    dataframe['outcome'] = dataframe['outcome'].map(lambda x:oCMap[x] if x == x else x)
    return dataframe

trainDataSet = transformAttributes(trainDataSet)
testDataSet = transformAttributes(testDataSet)
trainDataSet.head(3)

Unnamed: 0,surgery,age,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,mucous_membranes,capllary_refill_time,pain,...,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_PH,rectal_examination_feces,abdomen,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome
0,2.0,1.0,38.5,66.0,28.0,2.0,2.0,,2.0,5.0,...,,,,2.0,5.0,45.0,8.4,,,0
1,1.0,1.0,39.2,88.0,20.0,,,4.0,1.0,3.0,...,,,,1.0,2.0,50.0,85.0,2.0,2.0,0
2,2.0,1.0,38.3,40.0,24.0,3.0,3.0,3.0,1.0,3.0,...,,,,3.0,1.0,33.0,6.7,,,1


### 缺省值处理（均值填充法）

In [15]:
class MeanImputer(object):
    def __init__(self, discreteAtts=[]):
        self.cLabels = set()
        self.meanValue = dict()
        self.strictMeanValue = dict()
        self.columns = list()
        self.classLabel = None
        self.discreteAtts = set(discreteAtts)
        self.discreteAttValues = {att:None for att in discreteAtts}
    
    @staticmethod
    def findNeighbour(value, valueLst):
        valueLst = np.abs(np.array(valueLst) - value)
        value = float(np.min(valueLst)) + value
        return value

    def fit(self, dataframe, classLabel=None, discreteIndex=None):
        if classLabel == None:
            classLabel = dataframe.columns[-1]
        self.classLabel = classLabel
        self.cLabels = set(dataframe[classLabel].unique().tolist())
        self.meanValue = {label:dict() for label in self.cLabels}
        self.columns = dataframe.columns.tolist()
        for label in self.cLabels:
            subSet = dataframe[dataframe[classLabel] == label]
            for i, column in enumerate(self.columns):
                if discreteIndex is not None and i in discreteIndex:
                    self.meanValue[label][column] = subSet[column].mean()
                else:
                    self.meanValue[label][column] = subSet[column].value_counts().idxmax()
        self.strictMeanValue = {column:dataframe.loc[:, column].mean() for column in columns}

        if len(self.discreteAtts) > 0:
            for column in self.discreteAtts:
                valueList = dataframe[column].dropna().astype('float').unique().tolist()
                self.discreteAttValues[column] = valueList
    
    def easeTransform(self, dataframe):
        dataframe = dataframe.copy()
        if len(self.meanValue) > 0:
            subSets = []
            for label in self.cLabels:
                subSet = dataframe[dataframe[self.classLabel] == label]
                for column in self.columns:
                    if column not in self.discreteAtts:
                        subSet[column] = subSet[column].fillna(self.meanValue[label][column])
                    else:
                        subSet[column] = subSet[column].fillna(self.findNeighbour(self.meanValue[label][column], self.discreteAttValues[column]))
                subSets.append(subSet)
            return pd.concat(subSets, axis=0)
        else:
            print('You have not fit the dataset yet')
    
    def strictTransform(self, dataframe):
        dataframe = dataframe.copy()
        if len(self.meanValue) > 0:
            for column in self.columns:
                if column not in self.discreteAtts:
                    dataframe[column] = dataframe[column].fillna(self.strictMeanValue[column])
                else:
                    dataframe[column] = dataframe[column].fillna(self.findNeighbour(self.strictMeanValue[column], self.discreteAttValues[column]))
            return dataframe
        else:
            print('You have not fit the dataset yet')

# meanImputer = MeanImputer(discreteAtts=['mucous_membranes', 'abdomen'])
meanImputer = MeanImputer()
discreteAttIndex = [0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 19, 21]
meanImputer.fit(trainDataSet, classLabel='outcome', discreteIndex=discreteAttIndex)
trainDataSet = meanImputer.easeTransform(trainDataSet)
testDataSet = meanImputer.easeTransform(testDataSet)
trainDataSet.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subSet[column] = subSet[column].fillna(self.meanValue[label][column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subSet[column] = subSet[column].fillna(self.meanValue[label][column])


Unnamed: 0,surgery,age,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,mucous_membranes,capllary_refill_time,pain,...,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_PH,rectal_examination_feces,abdomen,packed_cell_volume,total_protein,abdominocentesis_appearance,abdomcentesis_total_protein,outcome
0,2.0,1.0,38.5,66.0,28.0,2.0,2.0,3.72549,2.0,5.0,...,1.759494,1.817073,7.0,2.0,5.0,45.0,8.4,2.366667,2.0,0
1,1.0,1.0,39.2,88.0,20.0,2.072165,2.197802,4.0,1.0,3.0,...,1.759494,1.817073,7.0,1.0,2.0,50.0,85.0,2.0,2.0,0
3,1.0,9.0,39.1,164.0,84.0,1.0,3.0,6.0,2.0,2.0,...,1.0,2.0,5.0,2.0,4.236842,48.0,7.2,3.0,5.3,0


### 离散属性连续化

In [16]:
'''
离散属性连续化：
需要连续化的离散属性：
1. mucous membranes
2. abdomen
'''
def serialize(dataframe):
    dataframe['MM_fastCirculation'] = dataframe['mucous_membranes'].map(lambda x:1 if x == 1 or x == 2 else 0)
    dataframe['MM_seriousCirculation'] = dataframe['mucous_membranes'].map(lambda x:1 if x == 4 or x == 6 else 0)
    dataframe['MM_shock'] = dataframe['mucous_membranes'].map(lambda x:1 if x == 3 else 0)
    dataframe['MM_septicemia'] = dataframe['mucous_membranes'].map(lambda x:1 if x == 5 else 0)
    dataframe.drop('mucous_membranes', axis=1, inplace=True)

    dataframe['ABD_normal'] = dataframe['abdomen'].map(lambda x:1 if x == 1 else 0)
    dataframe['ABD_other'] = dataframe['abdomen'].map(lambda x:1 if x == 2 else 0)
    dataframe['ABD_fece'] = dataframe['abdomen'].map(lambda x:1 if x == 3 else 0)
    dataframe['ABD_surgicalLession'] = dataframe['abdomen'].map(lambda x:1 if x == 4 or x ==5 else 0)
    dataframe.drop('abdomen', axis=1, inplace=True)
    return dataframe

trainDataSet = serialize(trainDataSet)
testDataSet = serialize(testDataSet)
trainDataSet.head(5)

Unnamed: 0,surgery,age,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,capllary_refill_time,pain,peristalsis,...,abdomcentesis_total_protein,outcome,MM_fastCirculation,MM_seriousCirculation,MM_shock,MM_septicemia,ABD_normal,ABD_other,ABD_fece,ABD_surgicalLession
0,2.0,1.0,38.5,66.0,28.0,2.0,2.0,2.0,5.0,4.0,...,2.0,0,0,0,0,0,0,0,0,1
1,1.0,1.0,39.2,88.0,20.0,2.072165,2.197802,1.0,3.0,4.0,...,2.0,0,0,1,0,0,0,1,0,0
3,1.0,9.0,39.1,164.0,84.0,1.0,3.0,2.0,2.0,4.0,...,5.3,0,0,1,0,0,0,0,0,0
4,2.0,1.0,37.3,104.0,35.0,2.072165,2.197802,2.0,3.56,3.323232,...,2.0,0,0,1,0,0,0,0,0,0
7,1.0,1.0,38.0,60.0,24.0,2.0,2.197802,1.0,3.56,4.0,...,2.0,0,0,0,0,0,0,0,0,1


### 归一化处理

In [17]:
'''
归一化处理
'''
class MinMaxNormalization(object):
    def __init__(self):
        self.maxDict = dict()
        self.minDict = dict()
        self.columns = np.array([])

    def fit(self, dataDF):
        self.__init__()
        self.columns = dataDF.columns
        for column in self.columns:
            self.maxDict[column] = dataDF[column].max()
            self.minDict[column] = dataDF[column].min()
    
    def transform(self, dataDF):
        for column in dataDF:
            dataDF[column] = (dataDF[column] - self.minDict[column]) / (self.maxDict[column] - self.minDict[column])
        return dataDF
    
    def fitTransform(self, dataDF):
        self.fit(dataDF)
        return self.transform(dataDF)

minMaxNormalization = MinMaxNormalization()
trainDataSet = minMaxNormalization.fitTransform(trainDataSet)
testDataSet = minMaxNormalization.fitTransform(testDataSet)
testDataSet.head(3)

Unnamed: 0,surgery,age,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,capllary_refill_time,pain,peristalsis,...,abdomcentesis_total_protein,outcome,MM_fastCirculation,MM_seriousCirculation,MM_shock,MM_septicemia,ABD_normal,ABD_other,ABD_fece,ABD_surgicalLession
3,0.0,0.0,0.27027,0.189655,0.162791,0.333333,0.666667,1.0,0.75,1.0,...,0.15493,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.810811,0.431034,0.023256,0.333333,0.666667,0.0,0.25,1.0,...,0.183099,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
12,0.0,0.0,0.540541,0.913793,0.162791,0.357388,0.399267,1.0,1.0,1.0,...,0.15493,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


### 增加$x_0$列

In [18]:
trainDataSet['b'] = 1
testDataSet['b'] = 1
trainDataSet.head(3)

Unnamed: 0,surgery,age,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,capllary_refill_time,pain,peristalsis,...,outcome,MM_fastCirculation,MM_seriousCirculation,MM_shock,MM_septicemia,ABD_normal,ABD_other,ABD_fece,ABD_surgicalLession,b
0,1.0,0.0,0.574074,0.233766,0.227273,0.333333,0.333333,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,0.0,0.703704,0.376623,0.136364,0.357388,0.399267,0.0,0.5,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,0.0,1.0,0.685185,0.87013,0.863636,0.0,0.666667,0.5,0.25,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


### 分离训练集和测试集

In [19]:
'''
分离数据和Labels
'''
# 处理训练集
trainLabels = trainDataSet.loc[:,['outcome']].rename({'outcome':'Labels'}, axis=1)
trainDataSet.drop('outcome', axis=1, inplace=True)

# 处理测试集
testLabels = testDataSet.loc[:,['outcome']].rename({'outcome':'Labels'}, axis=1)
testDataSet.drop('outcome', axis=1, inplace=True)

trainDataSet.head(3)

Unnamed: 0,surgery,age,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,capllary_refill_time,pain,peristalsis,...,abdomcentesis_total_protein,MM_fastCirculation,MM_seriousCirculation,MM_shock,MM_septicemia,ABD_normal,ABD_other,ABD_fece,ABD_surgicalLession,b
0,1.0,0.0,0.574074,0.233766,0.227273,0.333333,0.333333,0.5,1.0,1.0,...,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,0.0,0.703704,0.376623,0.136364,0.357388,0.399267,0.0,0.5,1.0,...,0.19,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,0.0,1.0,0.685185,0.87013,0.863636,0.0,0.666667,0.5,0.25,1.0,...,0.52,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


### 导出和备份数据

In [20]:
'''
导出&备份数据
'''
with open('./Cache/data_outcome.pkl', 'wb+') as f:
    pickle.dump((trainDataSet.values, trainLabels.values, testDataSet.values, testLabels.values, trainDataSet.columns), f)

## 预测疝气病（重新处理的数据）

### 批量梯度上升

In [21]:
'''
使用批量梯度上升进行预测
'''
trainWeights = batchGradAscent(trainDataSet.values, trainLabels.values)      # 使用改进的随机梯度上升训练模型
predResult = classifyMatrix(testDataSet.values, trainWeights)                                    # 使用logistic regression回归模型预测测试集
origResult = np.matrix(testLabels.values)                                                        # 获取正确答案

confMatrix, precRate = calConfMatrix(predResult, origResult, )                                        # 计算混淆矩阵和正确率
print('使用改进的随机梯度上升进行Logistic Regression分类的准确率有{:.03f}%'.format(precRate * 100))

precision, recall, f1 = calPRF(confMatrix)
print('精确率：{:.03f}%'.format(precision * 100))
print('召回率：{:.03f}%'.format(recall * 100))
print('f1：{:.03f}'.format(f1))
confMatrix

使用改进的随机梯度上升进行Logistic Regression分类的准确率有89.552%
准确率：95.455%
召回率：89.362%
f1：0.923


Unnamed: 0,one,zero
one,42.0,5.0
zero,2.0,18.0


### 随机梯度上升

In [22]:
'''
使用改进的随机梯度上升进行多次预测并取平均值
'''
roundNum = 5
confMatrix, precRate = stocMultiTest(trainDataSet, trainLabels, testDataSet, testLabels, roundNum)
print('通过迭代运算{}次，使用改进的随机梯度上升进行Logistic Regression分类的准确率有{:.03f}%'.format(roundNum, precRate * 100))
precision, recall, f1 = calPRF(confMatrix)
print('精确率：{:.03f}%'.format(precision * 100))
print('召回率：{:.03f}%'.format(recall * 100))
print('f1：{:.03f}'.format(f1))
confMatrix

通过迭代运算5次，使用改进的随机梯度上升进行Logistic Regression分类的准确率有92.537%
准确率：97.727%
召回率：91.489%
f1：0.945


Unnamed: 0,one,zero
one,43.0,4.0
zero,1.0,19.0


### 小批量随机梯度上升

In [36]:
roundNum = 5
miniBatchParameters = {
    'batchSize':50, 
    'gradThreshold':-1,  
    'jThreshold':1.1, 
    'maxCycles':2500, 
}
confMatrix, precRate = miniBatchMultiTest(trainDataSet, trainLabels, testDataSet, testLabels, roundNum, **miniBatchParameters)
precision, recall, f1 = calPRF(confMatrix)
print('通过迭代运算{}次，使用改进的小批量梯度上升进行Logistic Regression分类的准确率有{:.03f}%'.format(roundNum, precRate * 100))
print('精确率：{:.03f}%'.format(precision * 100))
print('召回率：{:.03f}%'.format(recall * 100))
print('f1：{:.03f}'.format(f1))
confMatrix

通过迭代运算5次，使用改进的小批量梯度上升进行Logistic Regression分类的准确率有87.463%
准确率：93.274%
召回率：88.511%
f1：0.908


Unnamed: 0,one,zero
one,41.6,5.4
zero,3.0,17.0


### 交叉验证

In [24]:
wholeDataSet = pd.concat([trainDataSet, testDataSet], axis=0)
wholeLabels = pd.concat([trainLabels, testLabels], axis=0)
wholeDataSet.head(3)

Unnamed: 0,surgery,age,rectal_temperature,pulse,respiratory_rate,temperature_extremities,peripheral_pulse,capllary_refill_time,pain,peristalsis,...,abdomcentesis_total_protein,MM_fastCirculation,MM_seriousCirculation,MM_shock,MM_septicemia,ABD_normal,ABD_other,ABD_fece,ABD_surgicalLession,b
0,1.0,0.0,0.574074,0.233766,0.227273,0.333333,0.333333,0.5,1.0,1.0,...,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,0.0,0.703704,0.376623,0.136364,0.357388,0.399267,0.0,0.5,1.0,...,0.19,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,0.0,1.0,0.685185,0.87013,0.863636,0.0,0.666667,0.5,0.25,1.0,...,0.52,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [31]:
# kFold
def kFold(dataSet, labels, k, randomShuffle=True):
    dataSet = pd.concat([dataSet, labels], axis=1)
    labelName = dataSet.columns[-1]
    attNames = dataSet.columns.drop(labelName)
    index = set(range(len(dataSet)))
    dataSet.sort_values(by=labelName, inplace=True)

    if randomShuffle == True:
        labelSubSet = {label:dataSet[dataSet[labelName] == label] for label in dataSet[labelName].unique()}
        labelSubSet = [labelSubSet[label].iloc[np.random.permutation(len(labelSubSet[label]))] for label in labelSubSet]
        dataSet = pd.concat(labelSubSet, axis=0)

    for i in range(k):
        testIndex = set(range(i, len(dataSet), k))
        trainSubSet = dataSet.iloc[list(index - testIndex)]
        testSubSet = dataSet.iloc[list(testIndex)]
        yield trainSubSet[attNames], trainSubSet[[labelName]], testSubSet[attNames], testSubSet[[labelName]]

k = 10
miniBatchParameters = {
    'batchSize':50, 
    'gradThreshold':-1,  
    'jThreshold':1.1, 
    'maxCycles':2500, 
}
roundNum = 5
confMatrixAvg = pd.DataFrame(columns=['one', 'zero'], index=['one', 'zero']).fillna(0)
precRateAvg = 0
for trainSubData, trainSubLabels, testSubData, testSubLabels in kFold(wholeDataSet, wholeLabels, k):
    confMatrix, precRate = miniBatchMultiTest(trainSubData, trainSubLabels, testSubData, testSubLabels, roundNum, **miniBatchParameters)
    confMatrixAvg += confMatrix
    precRateAvg += precRate

confMatrixAvg /= k
precRateAvg = precRateAvg / k
print('经过{}-折交叉验证，发现小批量梯度上升的Logistic Regression对该数据集的预测准确率为：{:.02f}%'.format(k, precRateAvg * 100))
precision, recall, f1 = calPRF(confMatrixAvg)
print('精确率：{:.03f}%'.format(precision * 100))
print('召回率：{:.03f}%'.format(recall * 100))
print('f1：{:.03f}'.format(f1))
confMatrixAvg

经过10-折交叉验证，发现小批量梯度上升的Logistic Regression对该数据集的预测准确率为：90.07%
准确率：91.986%
召回率：91.822%
f1：0.919


Unnamed: 0,one,zero
one,20.66,1.84
zero,1.8,12.3


## Scikit-Learn 实现

### Scikit Learn实现Logistic Regression算法

In [26]:
'''
通过scikit-learn实现Logistic Regression模型
'''
from sklearn.linear_model import LogisticRegression
logiRegModel = LogisticRegression(random_state=0, tol=1e-5, C=1, fit_intercept=True, solver='liblinear')
logiRegModel.fit(trainDataSet.values, trainLabels.values.T[0])
predictResult = logiRegModel.predict(testDataSet.values)
confM, precision = calConfMatrix(predictResult.reshape(-1, 1), testLabels.values)
print('使用scikit-learn的算法进行Logistic Regression分类，准确率有{:.03f}%'.format(precision * 100))
confM

使用scikit-learn的算法进行Logistic Regression分类，准确率有92.537%


Unnamed: 0,one,zero
one,43.0,4.0
zero,1.0,19.0


## 集成算法的实现

In [27]:
from BaggingLR import BaggingLR

with open('./Cache/data_normalization.pkl', 'rb') as f:
    trainDataSetTE, trainLabelsTE, testDataSetTE, testLabelsTE, columnsTE = pickle.load(f)
    trainDataSetTE = pd.DataFrame(trainDataSetTE, columns=columnsTE)
    testDataSetTE = pd.DataFrame(testDataSetTE, columns=columnsTE)
    trainLabelsTE = pd.DataFrame(trainLabelsTE, columns=['Labels'])
    testLabelsTE = pd.DataFrame(testLabelsTE, columns=['Labels'])

bagNum = 10
attributeNum = 13
baggingLR = BaggingLR(bagNum, attributeNum)
startTime = time.time()
baggingLR.fit(trainDataSetTE, trainLabelsTE.values.T, jThreshold=0.95, batchSize=50)
predictResult = baggingLR.predict(testDataSetTE)
endTime = time.time()
confMatrix, accuracy = calConfMatrix(predictResult.reshape((-1, 1)), testLabelsTE.values)
print('通过random算法获取{}个子logistic regression分类器，每个分类器对{}个属性进行运算\n使用改进的小批量梯度上升进行Logistic Regression分类\n通过投票方式获取最终分类结果\n精确率有{:.03f}%\n共计耗时{:.03f}秒'.format(bagNum, attributeNum, accuracy * 100, endTime -  startTime))

通过random算法获取10个子logistic regression分类器，每个分类器对13个属性进行运算
使用改进的小批量梯度上升进行Logistic Regression分类
通过投票方式获取最终分类结果
精确率有92.647%
共计耗时4.759秒
