### 2021年春 模式识别与机器学习 项目 心跳信号分类预测
### 山东大学 计算机科学与技术学院 智能19 李阳

In [1]:
# 包含所需要的库
# 关于如何直接导出ipynb为pdf且支持中文的参考https://www.cnblogs.com/myfy/p/12829122.html
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split

#### 查看数据信息

In [2]:
train_data=pd.read_csv('./train.csv') # 训练集
testA_data=pd.read_csv('./testA.csv') # 测试集
print(train_data.shape)
train_data.head().append(train_data.tail()) # 查看数据的行列信息

(100000, 3)


Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0
99995,99995,"1.0,0.677705342021188,0.22239242747868546,0.25...",0.0
99996,99996,"0.9268571578157265,0.9063471198026871,0.636993...",2.0
99997,99997,"0.9258351628306013,0.5873839035878395,0.633226...",3.0
99998,99998,"1.0,0.9947621698382489,0.8297017704865509,0.45...",2.0
99999,99999,"0.9259994004527861,0.916476635326053,0.4042900...",0.0


In [3]:
train_data.info # 数据类型

<bound method DataFrame.info of           id                                  heartbeat_signals  label
0          0  0.9912297987616655,0.9435330436439665,0.764677...    0.0
1          1  0.9714822034884503,0.9289687459588268,0.572932...    0.0
2          2  1.0,0.9591487564065292,0.7013782792997189,0.23...    2.0
3          3  0.9757952826275774,0.9340884687738161,0.659636...    0.0
4          4  0.0,0.055816398940721094,0.26129357194994196,0...    2.0
...      ...                                                ...    ...
99995  99995  1.0,0.677705342021188,0.22239242747868546,0.25...    0.0
99996  99996  0.9268571578157265,0.9063471198026871,0.636993...    2.0
99997  99997  0.9258351628306013,0.5873839035878395,0.633226...    3.0
99998  99998  1.0,0.9947621698382489,0.8297017704865509,0.45...    2.0
99999  99999  0.9259994004527861,0.916476635326053,0.4042900...    0.0

[100000 rows x 3 columns]>

In [4]:
train_data.isnull().sum() # 检查是否有缺省值

id                   0
heartbeat_signals    0
label                0
dtype: int64

In [5]:
train_data['label'].value_counts() # 查看预测值的分布情况

0.0    64327
3.0    17912
2.0    14199
1.0     3562
Name: label, dtype: int64

#### 数据处理

In [6]:
# 修改数据类型节省内存，将category类代替object类
def dataProcess(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    return df

In [7]:
# 将题目中所给的heartbeat-signal信息重新整理，并对内存进行优化
train_list=[]
for items in train_data.values:
    train_list.append([items[0]]+[float(i) for i in items[1].split(',')]+[items[2]])
train_data=pd.DataFrame(np.array(train_list))
train_data.columns=['id']+['t'+str(i) for i in range(len(train_list[0])-2)]+['label']
train_data=dataProcess(train_data)

testA_list=[]
for items in testA_data.values:
    testA_list.append([items[0]]+[float(i) for i in items[1].split(',')])
testA_data=pd.DataFrame(np.array(testA_list))
testA_data.columns=['id']+['t'+str(i) for i in range(len(testA_list[0])-1)]
testA_data=dataProcess(testA_data)

train_data.head().append(train_data.tail())

Unnamed: 0,id,t0,t1,t2,t3,t4,t5,t6,t7,t8,...,t196,t197,t198,t199,t200,t201,t202,t203,t204,label
0,0.0,0.991211,0.943359,0.764648,0.618652,0.379639,0.190796,0.040222,0.026001,0.031708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.97168,0.929199,0.572754,0.178467,0.122986,0.132324,0.094421,0.0896,0.030487,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,1.0,0.958984,0.701172,0.231812,0.0,0.080688,0.128418,0.1875,0.280762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,3.0,0.975586,0.934082,0.659668,0.249878,0.237061,0.281494,0.249878,0.249878,0.241455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.055817,0.26123,0.359863,0.433105,0.453613,0.499023,0.542969,0.616699,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
99995,99995.0,1.0,0.677734,0.222412,0.25708,0.204712,0.054657,0.026154,0.118164,0.244873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,99996.0,0.926758,0.90625,0.637207,0.415039,0.374756,0.382568,0.358887,0.341309,0.336426,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
99997,99997.0,0.925781,0.587402,0.633301,0.632324,0.63916,0.614258,0.599121,0.517578,0.403809,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
99998,99998.0,1.0,0.994629,0.82959,0.458252,0.26416,0.240234,0.213745,0.189331,0.203857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
99999,99999.0,0.925781,0.916504,0.404297,0.0,0.262939,0.385498,0.361084,0.332764,0.339844,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# 去掉id和label，用于模型拟合
x_train=train_data.drop(['id','label'], axis=1)
y_train=train_data['label']
x_test=testA_data.drop(['id'], axis=1)

x_train

Unnamed: 0,t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,...,t195,t196,t197,t198,t199,t200,t201,t202,t203,t204
0,0.991211,0.943359,0.764648,0.618652,0.379639,0.190796,0.040222,0.026001,0.031708,0.065552,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.971680,0.929199,0.572754,0.178467,0.122986,0.132324,0.094421,0.089600,0.030487,0.040497,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.000000,0.958984,0.701172,0.231812,0.000000,0.080688,0.128418,0.187500,0.280762,0.328369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.975586,0.934082,0.659668,0.249878,0.237061,0.281494,0.249878,0.249878,0.241455,0.230713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.055817,0.261230,0.359863,0.433105,0.453613,0.499023,0.542969,0.616699,0.676758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.000000,0.677734,0.222412,0.257080,0.204712,0.054657,0.026154,0.118164,0.244873,0.328857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.926758,0.906250,0.637207,0.415039,0.374756,0.382568,0.358887,0.341309,0.336426,0.317139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.925781,0.587402,0.633301,0.632324,0.639160,0.614258,0.599121,0.517578,0.403809,0.253174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,1.000000,0.994629,0.829590,0.458252,0.264160,0.240234,0.213745,0.189331,0.203857,0.210815,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
