In [1]:
#导入包
import pandas as pd
import numpy as np

# Lift the pandas display limits so that every column and every row is shown
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# 数据预处理

## 已进行的处理
* 数据读取
* 分割label项Y和输入项X
* 对Y中的不合理数据做调整

## 准备进行的处理
* 将时间标签转化为易于处理的多个标签
* 数据随机排序划分训练集
* 独热编码
* 将train和test中的数据做交叉验证，用以提高数据量
* 异常值处理

In [7]:
# Load the data.
# The training file is read with the default integer index, so `id` stays a
# regular column; the test file uses its `id` column as the index.
train = pd.read_csv(filepath_or_buffer="../happiness_train_abbr.csv")
test = pd.read_csv(filepath_or_buffer="../happiness_test_abbr.csv", index_col="id")

In [8]:
# Inspect the first rows of the training set
train.head()

Unnamed: 0,id,happiness,survey_type,province,city,county,survey_time,gender,birth,nationality,religion,religion_freq,edu,income,political,floor_area,height_cm,weight_jin,health,health_problem,depression,hukou,socialize,relax,learn,equity,class,work_exper,work_status,work_yr,work_type,work_manage,family_income,family_m,family_status,house,car,marital,status_peer,status_3_before,view,inc_ability
0,1,4,1,12,32,59,2015/8/4 14:18,1,1959,1,1,1,11,20000,1,45.0,176,155,3,2,5,5,2,4,3,3,3,1,3.0,30.0,1.0,2.0,60000.0,2,2,1,2,3,3,2,4,3
1,2,4,2,18,52,85,2015/7/21 15:04,1,1992,1,1,1,12,20000,1,110.0,170,110,5,4,3,1,2,4,3,3,6,1,3.0,2.0,1.0,3.0,40000.0,3,4,1,2,1,1,1,4,2
2,3,4,2,29,83,126,2015/7/21 13:24,2,1967,1,0,3,4,2000,1,120.0,160,122,4,4,5,1,3,4,2,4,5,2,,,,,8000.0,3,3,1,2,3,2,1,4,2
3,4,5,2,10,28,51,2015/7/25 17:33,2,1943,1,1,1,3,6420,1,78.0,163,170,4,4,4,1,2,4,4,4,5,4,,,,,12000.0,3,3,1,1,7,2,1,3,2
4,5,4,1,7,18,36,2015/8/10 9:50,2,1994,1,1,1,12,-1,2,70.0,165,110,5,5,3,2,4,3,4,2,1,6,,,,,-2.0,4,3,1,1,1,3,2,3,-8


In [4]:
# Size of the training set as (rows, columns)
train.shape

(8000, 41)

In [5]:
# Check for missing data: per-column non-null counts and dtypes
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 1 to 8000
Data columns (total 41 columns):
happiness          8000 non-null int64
survey_type        8000 non-null int64
province           8000 non-null int64
city               8000 non-null int64
county             8000 non-null int64
survey_time        8000 non-null object
gender             8000 non-null int64
birth              8000 non-null int64
nationality        8000 non-null int64
religion           8000 non-null int64
religion_freq      8000 non-null int64
edu                8000 non-null int64
income             8000 non-null int64
political          8000 non-null int64
floor_area         8000 non-null float64
height_cm          8000 non-null int64
weight_jin         8000 non-null int64
health             8000 non-null int64
health_problem     8000 non-null int64
depression         8000 non-null int64
hukou              8000 non-null int64
socialize          8000 non-null int64
relax              8000 non-null

> 可以看出，该数据中work_status，work_yr，work_type，work_manage缺失严重，可以考虑将这4个维度删除。而family_income只有一个缺失值，可用均值填补。
* (使用人工神经网络模型可以有效应对数据缺失，因此可以不做处理)

In [10]:
# Extract the feature matrix X.
#
# Only the identifier and the label are dropped here. The work_status / work_yr /
# work_type / work_manage columns (noted in this notebook as having many missing
# values) are kept for now; add them to the list below to drop them as well.
X = train.drop(columns=['id', 'happiness'])

# Fill missing values: only family_income is mean-imputed.
# fillna() returns a new frame, so the result must be assigned back, and the
# per-column dict keeps the family_income mean from being written into the NaNs
# of other columns. Taking the mean of that single column also avoids calling
# DataFrame.mean() over the non-numeric survey_time column.
# NOTE(review): family_income appears to contain negative codes (e.g. -2.0),
# which would bias this mean if they are "unknown" markers - confirm against the
# data dictionary.
X = X.fillna({'family_income': X['family_income'].mean()})
X.head()

Unnamed: 0,survey_type,province,city,county,survey_time,gender,birth,nationality,religion,religion_freq,edu,income,political,floor_area,height_cm,weight_jin,health,health_problem,depression,hukou,socialize,relax,learn,equity,class,work_exper,work_status,work_yr,work_type,work_manage,family_income,family_m,family_status,house,car,marital,status_peer,status_3_before,view,inc_ability
0,1,12,32,59,2015/8/4 14:18,1,1959,1,1,1,11,20000,1,45.0,176,155,3,2,5,5,2,4,3,3,3,1,3.0,30.0,1.0,2.0,60000.0,2,2,1,2,3,3,2,4,3
1,2,18,52,85,2015/7/21 15:04,1,1992,1,1,1,12,20000,1,110.0,170,110,5,4,3,1,2,4,3,3,6,1,3.0,2.0,1.0,3.0,40000.0,3,4,1,2,1,1,1,4,2
2,2,29,83,126,2015/7/21 13:24,2,1967,1,0,3,4,2000,1,120.0,160,122,4,4,5,1,3,4,2,4,5,2,,,,,8000.0,3,3,1,2,3,2,1,4,2
3,2,10,28,51,2015/7/25 17:33,2,1943,1,1,1,3,6420,1,78.0,163,170,4,4,4,1,2,4,4,4,5,4,,,,,12000.0,3,3,1,1,7,2,1,3,2
4,1,7,18,36,2015/8/10 9:50,2,1994,1,1,1,12,-1,2,70.0,165,110,5,5,3,2,4,3,4,2,1,6,,,,,-2.0,4,3,1,1,1,3,2,3,-8


In [19]:
# Inspect the label distribution
Y = train["happiness"]
Y.value_counts()

 4    4818
 5    1410
 3    1159
 2     497
 1     104
-8      12
Name: happiness, dtype: int64

In [None]:
# In the data dictionary -8 means "don't know"; recode it to 3.
# Series.replace does the recoding in one vectorised call instead of invoking a
# Python lambda once per element.
Y = Y.replace(-8, 3)