In [9]:
import pandas as pd
import numpy as np

In [10]:
# Load the raw survey export and give its columns short, systematic names.
data_path = 'raw_data3.csv'

# Columns that are dropped before analysis.
trivials = ['序号','提交答卷时间', '所用时间', '来源', '来源详情', '来自IP', '总分']
# Names given to the first five remaining columns.
properties = ['gender', 'major', 'grade', 'origin', 'money']

# Digit legend used in the generated column names:
#   transport mode : 0 subway/bus, 1 taxi, 2 bikes/EV, 3 walking
#   distance       : 0 short, 1 medium, 2 long
#   weather        : 0 sunny, 1 rainy
#   time pressure  : 0 time-abundant, 1 time-constrained
# NOTE(review): presumably the three situation digits are distance, weather and
# time pressure (in that order) and the appended fourth digit is the transport
# mode - confirm against the questionnaire.
situations = ['101', '001', '211', '000', '200', '011', '010', '110', '201', '210', '111', '100']
new_names = [situation + choice for situation in situations for choice in '0123']

df = pd.read_csv(data_path, index_col=0).drop(trivials, axis=1)

# Pair the remaining columns positionally with the new names.
new_cols_dict = dict(zip(df.columns, properties + new_names))
df = df.rename(columns=new_cols_dict)
df.head()

Unnamed: 0,gender,major,grade,origin,money,1010,1011,1012,1013,0010,...,2102,2103,1110,1111,1112,1113,1000,1001,1002,1003
0,1,3,2,3,3000,40,(空),66,(空),0,...,(空),0,100,(空),(空),(空),100,(空),(空),(空)
1,1,1,2,27,4503,(空),(空),2,100,(空),...,(空),(空),(空),100,(空),(空),(空),(空),(空),100
2,2,2,2,4,2134,(空),(空),102,(空),(空),...,(空),(空),(空),100,(空),(空),(空),(空),(空),100
3,2,2,2,14,2604,19,37,60,(空),(空),...,(空),(空),(空),100,(空),(空),(空),31,58,(空)
4,1,2,1,33,3000,100,60,82,(空),40,...,(空),(空),(空),100,(空),(空),40,50,30,(空)


In [11]:
# '(空)' cells are replaced by 0 so that the whole table can be cast to numbers.
df = df.replace('(空)', 0).astype(float)
# The property columns are stored as integers rather than floats.
df = df.astype({column: int for column in properties})
df.head()

Unnamed: 0,gender,major,grade,origin,money,1010,1011,1012,1013,0010,...,2102,2103,1110,1111,1112,1113,1000,1001,1002,1003
0,1,3,2,3,3000,40.0,0.0,66.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0
1,1,1,2,27,4503,0.0,0.0,2.0,100.0,0.0,...,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0
2,2,2,2,4,2134,0.0,0.0,102.0,0.0,0.0,...,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0
3,2,2,2,14,2604,19.0,37.0,60.0,0.0,0.0,...,0.0,0.0,0.0,100.0,0.0,0.0,0.0,31.0,58.0,0.0
4,1,2,1,33,3000,100.0,60.0,82.0,0.0,40.0,...,0.0,0.0,0.0,100.0,0.0,0.0,40.0,50.0,30.0,0.0


In [12]:
def softmax(a):
    '''
    Softmax over the non-zero entries of ``a`` only.

    Entries equal to 0 stay 0 in the result; the remaining entries are
    divided by 100, exponentiated and normalised so that they sum to 1.
    If every entry is 0, a uniform distribution over all entries is returned.

    Parameters
    ----------
    a : numpy.ndarray or pandas.Series
        One-dimensional numeric scores. The input is never modified.

    Returns
    -------
    Same container type as ``a`` (a Series keeps its index), float dtype.
    An empty input is returned as an empty float copy.
    '''
    # Build a fresh float copy; the raw scores are divided by 100 because the
    # values are too large to exponentiate otherwise.
    a = a.astype(float) / 100
    if a.size == 0:
        return a

    non_0_mask = a != 0
    if not np.any(non_0_mask):
        # Uniform weights, written into the copy so that the return type
        # (ndarray vs. Series) and length always match the input.
        a[:] = 1.0 / a.size
        return a

    # Softmax is invariant to a constant shift. Shifting by the maximum keeps
    # every exponent <= 0, so np.exp cannot overflow to inf (which would turn
    # the normalised result into nan).
    active = np.asarray(a[non_0_mask])
    weights = np.exp(active - active.max())
    a[non_0_mask] = weights / weights.sum()
    return a

# Quick sanity check: one row with some zero entries, one row that is all zeros.
for sample in ([40, 0, 66, 0], [0, 0, 0, 0]):
    print(softmax(np.array(sample)))

[0.43536371 0.         0.56463629 0.        ]
[0.25 0.25 0.25 0.25]


In [13]:
# Replace the four columns of every situation, row by row, with the output of softmax.
for situation in situations:
    # Label slice from '<situation>0' through '<situation>3' (both ends inclusive).
    choice_cols = slice(situation + '0', situation + '3')
    df.loc[:, choice_cols] = df.loc[:, choice_cols].apply(softmax, axis=1)
df.head()

Unnamed: 0,gender,major,grade,origin,money,1010,1011,1012,1013,0010,...,2102,2103,1110,1111,1112,1113,1000,1001,1002,1003
0,1,3,2,3,3000,0.435364,0.0,0.564636,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,1,2,27,4503,0.0,0.0,0.272892,0.727108,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,2,2,4,2134,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,2,2,14,2604,0.269976,0.32322,0.406804,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.432907,0.567093,0.0
4,1,2,1,33,3000,0.399108,0.26753,0.333363,0.0,0.189496,...,0.0,0.0,0.0,1.0,0.0,0.0,0.332225,0.367165,0.30061,0.0


In [14]:
# 0 for eastern, 1 for middle, 2 for western.
# Special case: there are only two samples from the three north-eastern
# provinces, so for now they are grouped with the eastern region.
# Hong Kong / Macau / Taiwan are grouped with the eastern region as well;
# there are no such samples, so this has no effect.
origin_mask = [1, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 2, 2, 2, 0, 1, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0]

# origin_mask is indexed with (code - 1), so only codes 1..len(origin_mask) are
# meaningful. A code <= 0 would otherwise wrap around through negative list
# indexing and silently receive the wrong region; that is also what happens
# when this cell is executed a second time on the already-mapped 0/1/2 values.
valid_origin = df['origin'].between(1, len(origin_mask))
if not valid_origin.all():
    raise ValueError(
        'origin codes must lie in 1..%d, got unexpected values %s '
        '(has this cell already been run on df?)'
        % (len(origin_mask), sorted(df.loc[~valid_origin, 'origin'].unique()))
    )
df['origin'] = df['origin'].map(lambda x: origin_mask[x - 1])
df.head()

Unnamed: 0,gender,major,grade,origin,money,1010,1011,1012,1013,0010,...,2102,2103,1110,1111,1112,1113,1000,1001,1002,1003
0,1,3,2,0,3000,0.435364,0.0,0.564636,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,1,2,2,4503,0.0,0.0,0.272892,0.727108,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,2,2,2,2134,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,2,2,1,2604,0.269976,0.32322,0.406804,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.432907,0.567093,0.0
4,1,2,1,0,3000,0.399108,0.26753,0.333363,0.0,0.189496,...,0.0,0.0,0.0,1.0,0.0,0.0,0.332225,0.367165,0.30061,0.0


In [15]:
# Write the processed table (including its index) to disk.
cleaned_data_path = 'cleaned_data.csv'
df.to_csv(path_or_buf=cleaned_data_path)

```python
# TODO: 生源地归类
# TODO: 找几个描述统计量并可视化
# TODO: 实现Three-way ANOVA
# TODO: 实现多元独立性检验
# TODO: 多元逻辑斯蒂回归
```