In [2]:
# jupyter notebook 同时输出多行
import import_ipynb
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## for data
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn

## for plotting
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
from pyecharts.charts import Bar, Line
from pyecharts import options as opts

## for statistical tests
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm

## for machine learning
from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.metrics import precision_recall_fscore_support

#jupyter notebook绘图设置
%matplotlib inline

#解决中文显示问题
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 不要发出警告
import warnings
warnings.filterwarnings('ignore') 
# 在画图的代码后面加上分号 ; 也可以不发出警告

In [159]:
# 读取文件
df_file = "data/fuxie_fillna.tsv"
df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
df.columns
df.shape

y = "细菌结果"
# 把细菌结果分为两类：阴性和阳性
df[y] = df[y].map(lambda x: 0 if x == "阴性" else 1)
df.head()

Index(['区县', '性别', 'age', '户籍', '职业', '首发症状', '发热', '体温', '脱水', '腹胀', '恶心',
       '里急后重', '肠鸣音亢进', '腹痛', '腹痛性质', '腹痛部位', '呕吐', '呕吐频次', '呕吐在腹泻___发生',
       '持续天数', '腹泻', '腹泻量', '腹泻频次', '腹泻天数', '腹泻性质', '神经症状', '中毒症状', '其他症状',
       '近6个月有无肠道疾病既往史', '发病前五天内是否有不洁饮食史', '进餐地点', '发病前5天内是否有不洁饮水史',
       '发病前5天内周边有无类似腹泻病例', '疑似病例人数', '发病前5天内是否有聚餐史', '发病前一周是否外出',
       '是否家中饲养或接触过宠物', '就诊前是否服用过抗生素', '诊断', '诊断类型', '临床处理', '本次就诊是否给予抗生素',
       '抗生素名称.1', '是否采集', '采样类型', '细菌结果'],
      dtype='object')

(11600, 46)

Unnamed: 0,区县,性别,age,户籍,职业,首发症状,发热,体温,脱水,腹胀,...,是否家中饲养或接触过宠物,就诊前是否服用过抗生素,诊断,诊断类型,临床处理,本次就诊是否给予抗生素,抗生素名称.1,是否采集,采样类型,细菌结果
1,闵行区,Male,59,本市外来（含港澳台）,医务人员,"腹痛,腹泻",无,38.0,无,有,...,无,无,胃肠炎,实验室诊断,无,无,无,有,食源性采样,1
8,闵行区,Female,36,本市常住,商业服务,"腹痛,腹泻",无,37.0,无,无,...,无,无,急性胃肠炎,临床诊断,口服用药,有,头孢夫辛,有,食源性采样,0
9,闵行区,Female,36,本市常住,工人,腹泻,无,38.0,无,无,...,无,无,肠功能紊乱,临床诊断,口服用药,无,无,有,食源性采样,1
11,闵行区,Female,27,本市常住,不详,无,无,38.0,无,无,...,无,无,肠道功能紊乱,无,口服用药,有,其他(黄连素),有,腹泻病采样,0
12,闵行区,Female,77,本市常住,离退人员,腹泻,无,37.0,无,无,...,无,无,肠功能紊乱,临床诊断,口服用药,无,无,有,食源性采样,0


In [160]:
y_col = ['细菌结果']
cate_col = df.dtypes[df.dtypes == 'object'].index.tolist()
cont_col = df.dtypes[df.dtypes != 'object'].index.tolist()
cont_col.remove('细菌结果')

In [161]:
# 类别特征
cate_col

['区县',
 '性别',
 '户籍',
 '职业',
 '首发症状',
 '发热',
 '脱水',
 '腹胀',
 '恶心',
 '里急后重',
 '肠鸣音亢进',
 '腹痛',
 '腹痛性质',
 '腹痛部位',
 '呕吐',
 '呕吐在腹泻___发生',
 '腹泻',
 '腹泻性质',
 '神经症状',
 '中毒症状',
 '其他症状',
 '近6个月有无肠道疾病既往史',
 '发病前五天内是否有不洁饮食史',
 '进餐地点',
 '发病前5天内是否有不洁饮水史',
 '发病前5天内周边有无类似腹泻病例',
 '发病前5天内是否有聚餐史',
 '发病前一周是否外出',
 '是否家中饲养或接触过宠物',
 '就诊前是否服用过抗生素',
 '诊断',
 '诊断类型',
 '临床处理',
 '本次就诊是否给予抗生素',
 '抗生素名称.1',
 '是否采集',
 '采样类型']

In [323]:
len(cate_col)

37

In [162]:
# 把类别映射为数字，和 one-hot 编码区分
for col in cate_col:
    df[col] = df[col].factorize()[0]
df['腹泻性质'] = df['腹泻性质'] + 1

In [125]:
category_col = df[cate_col]
# 不知道为什么要加 1 但是不加 1 就报错，无语
# category_col.iloc[:,17] = category_col.iloc[:,17] + 1

In [127]:
data = torch.tensor(category_col.iloc[:,17].values)
a = embedding(data)
a.shape

torch.Size([11600, 5])

In [79]:
a[1]
a[2]

tensor([-0.5498, -0.3143,  0.1497,  0.6362, -0.0322], grad_fn=<SelectBackward>)

tensor([-0.5498, -0.3143,  0.1497,  0.6362, -0.0322], grad_fn=<SelectBackward>)

In [124]:
category_col.iloc[:,17].max()
category_col.iloc[:,17].min()
category_col.iloc[:,17].nunique() 

22

-1

24

In [84]:
# 查看每列的类别数
cate_szs = [df[col].nunique() for col in cate_col]

# 嵌入维度限定：折半
emb_szs = [(size, min(50, (size + 1)//2)) for size in cate_szs]

In [85]:
selfembeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])
selfembeds

ModuleList(
  (0): Embedding(17, 9)
  (1): Embedding(2, 1)
  (2): Embedding(5, 3)
  (3): Embedding(22, 11)
  (4): Embedding(16, 8)
  (5): Embedding(2, 1)
  (6): Embedding(3, 2)
  (7): Embedding(2, 1)
  (8): Embedding(2, 1)
  (9): Embedding(2, 1)
  (10): Embedding(2, 1)
  (11): Embedding(2, 1)
  (12): Embedding(3, 2)
  (13): Embedding(30, 15)
  (14): Embedding(2, 1)
  (15): Embedding(4, 2)
  (16): Embedding(2, 1)
  (17): Embedding(24, 12)
  (18): Embedding(8, 4)
  (19): Embedding(2, 1)
  (20): Embedding(5, 3)
  (21): Embedding(2, 1)
  (22): Embedding(2, 1)
  (23): Embedding(53, 27)
  (24): Embedding(2, 1)
  (25): Embedding(2, 1)
  (26): Embedding(2, 1)
  (27): Embedding(2, 1)
  (28): Embedding(2, 1)
  (29): Embedding(2, 1)
  (30): Embedding(65, 33)
  (31): Embedding(3, 2)
  (32): Embedding(11, 6)
  (33): Embedding(2, 1)
  (34): Embedding(93, 47)
  (35): Embedding(2, 1)
  (36): Embedding(10, 5)
)

In [129]:
embeddings = []
# 每一列类别特征都过对应的 embedding layer，Debug 完成
for i,e in enumerate(selfembeds):
    col = torch.tensor(category_col.iloc[:,i].values)
    embed = e(col)
    embeddings.append(embed)
# embeddings

In [222]:
class Logistic_Embedding_Model(nn.Module):
    def __init__(self, emb_szs, n_cont):
        super(Logistic_Embedding_Model, self).__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_emb + n_cont
        self.linear = nn.Linear(n_in,1)
        self.sm = nn.Sigmoid() 
#         self.bn_cont = nn.BatchNorm1d(n_cont)
        
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            col = torch.tensor(x_cat.iloc[:,i].values)
            embeddings.append(e(col))
        x = torch.cat(embeddings, 1)
        
        x_cont = torch.tensor(x_cont.values)
#         x_cont = self.bn_cont(x_cont.double())
        x = torch.cat([x,x_cont],1)
        x = self.linear(x.float())
        x = self.sm(x)
        return x

In [223]:

## 划分数据集
y = "细菌结果"
df_train, df_test = train_test_split(df,test_size=0.3, 
                     stratify=df[y], random_state=42)

## print info
print("X_train shape:", df_train.drop(y,axis=1).shape, "| X_test shape:", df_test.drop(y,axis=1).shape)
print("y_train mean:", round(np.mean(df_train[y]),2), "| y_test mean:", round(np.mean(df_test[y]),2))
print(df_train.shape[1], "features")

print('-'*50)

## 查看各个类别的比例
print("训练集：")
print(df_train[y].value_counts() / len(df_train[y]))
print("测试集：")
print(df_test[y].value_counts() / len(df_test[y]))

X_train shape: (8120, 45) | X_test shape: (3480, 45)
y_train mean: 0.2 | y_test mean: 0.2
46 features
--------------------------------------------------
训练集：
0    0.795813
1    0.204187
Name: 细菌结果, dtype: float64
测试集：
0    0.79569
1    0.20431
Name: 细菌结果, dtype: float64


In [224]:
cat_train = df_train[cate_col]
cat_test = df_test[cate_col]
cont_train = df_train[cont_col]
cont_test = df_test[cont_col]

y_train = torch.tensor(df_train[y].values).unsqueeze(1)
y_test = torch.tensor(df_test[y].values).unsqueeze(1)

torch.manual_seed(33)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Logistic_Embedding_Model(emb_szs, n_cont=cont_train.shape[1])

<torch._C.Generator at 0x2248b6c26b0>

In [227]:
# 定义损失函数和优化器
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),lr = 0.001)
epochs = 1000

In [228]:
loss_list = []

# 开始训练
for epoch in range(epochs):
    out = model(cat_train,cont_train)
    loss = loss_func(out,y_train.float())
    print_loss = loss.data.item()
    loss_list.append(loss)
    mask = out.ge(0.5).float() # 以 0.5 为阈值进行分类
    correct = (mask == y_train).sum()  # 计算正确预测的样本个数
    acc = correct.item() / len(y_train)  # 计算精度
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # 每隔20轮打印一下当前的误差和精度
    if (epoch + 1) % 20 == 0:
        print('*'*10)
        print('epoch {}'.format(epoch+1)) # 训练轮数
        print('loss is {:.4f}'.format(print_loss))  # 误差
        print('acc is {:.4f}'.format(acc))  # 精度
    

**********
epoch 20
loss is 0.4695
acc is 0.8018
**********
epoch 40
loss is 0.4547
acc is 0.8137
**********
epoch 60
loss is 0.4465
acc is 0.8179
**********
epoch 80
loss is 0.4400
acc is 0.8191
**********
epoch 100
loss is 0.4344
acc is 0.8218
**********
epoch 120
loss is 0.4295
acc is 0.8254
**********
epoch 140
loss is 0.4251
acc is 0.8272
**********
epoch 160
loss is 0.4211
acc is 0.8307
**********
epoch 180
loss is 0.4175
acc is 0.8324
**********
epoch 200
loss is 0.4141
acc is 0.8341
**********
epoch 220
loss is 0.4111
acc is 0.8347
**********
epoch 240
loss is 0.4084
acc is 0.8347
**********
epoch 260
loss is 0.4060
acc is 0.8351
**********
epoch 280
loss is 0.4038
acc is 0.8358
**********
epoch 300
loss is 0.4019
acc is 0.8352
**********
epoch 320
loss is 0.4002
acc is 0.8353
**********
epoch 340
loss is 0.3986
acc is 0.8350
**********
epoch 360
loss is 0.3972
acc is 0.8353
**********
epoch 380
loss is 0.3960
acc is 0.8366
**********
epoch 400
loss is 0.3949
acc is 0.8369
****

In [243]:
# 拿到 embedding
feature_embedding_dict = {}
for i,matrix in enumerate(model.embeds.parameters()):
    feature_embedding_dict[i] = matrix.detach().numpy()   

In [260]:
feature_embedding_dict

{0: array([[ 1.0447642 , -0.28745875, -0.9292264 ,  1.3636967 ,  1.0303468 ,
          0.16512181,  0.68433005, -0.3818424 , -0.7667461 ],
        [ 1.4939973 ,  0.30898508, -0.7009636 ,  0.55971   , -1.1512969 ,
          0.530057  ,  0.0669304 , -0.7283815 , -1.2392782 ],
        [ 1.2891383 , -0.8224187 ,  0.21893977,  1.3248755 , -0.736689  ,
          0.5438528 ,  0.58486676, -1.2495565 , -0.2159929 ],
        [ 0.5761816 , -0.4100114 , -1.0544024 , -1.0377166 , -0.82481   ,
         -0.7769867 , -0.1825959 , -1.3353372 ,  1.0633332 ],
        [-1.3759376 , -0.27782708, -1.1572467 ,  1.639822  , -0.18069305,
          0.34644213,  0.23559971,  0.8901335 ,  0.5745679 ],
        [ 0.62821823, -0.21001649, -0.85595924, -0.52902865, -0.6503815 ,
          2.041521  , -2.2366607 , -0.84018576, -1.1316888 ],
        [-0.23065597,  1.2143242 , -0.00848766, -0.5423624 ,  0.95886683,
          0.721951  , -0.14998381, -0.48132768,  0.87708706],
        [-2.1289008 , -0.6935833 , -3.0803595

In [247]:
feature_embedding_dict[0][1]

array([ 1.4939973 ,  0.30898508, -0.7009636 ,  0.55971   , -1.1512969 ,
        0.530057  ,  0.0669304 , -0.7283815 , -1.2392782 ], dtype=float32)

In [302]:
df = df.reset_index().drop('index',axis=1)

In [303]:
df_cate = df[cate_col]
df_cont = df[cont_col]
df_cate

Unnamed: 0,区县,性别,户籍,职业,首发症状,发热,脱水,腹胀,恶心,里急后重,...,发病前一周是否外出,是否家中饲养或接触过宠物,就诊前是否服用过抗生素,诊断,诊断类型,临床处理,本次就诊是否给予抗生素,抗生素名称.1,是否采集,采样类型
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,0,0,1,1,1,...,0,0,0,1,1,1,1,1,0,0
2,0,1,1,2,1,0,0,1,1,1,...,0,0,0,2,1,1,0,0,0,0
3,0,1,1,3,2,0,0,1,0,1,...,0,0,0,3,2,1,1,2,0,1
4,0,1,1,4,1,0,0,1,1,0,...,0,0,0,2,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11595,14,1,1,4,1,0,0,1,1,1,...,0,0,0,6,1,1,1,3,0,2
11596,7,0,2,5,0,0,0,1,1,1,...,0,0,0,1,2,1,1,3,0,2
11597,7,0,2,5,5,1,0,1,0,1,...,0,0,0,1,2,0,1,12,0,0
11598,14,0,1,4,1,0,0,1,1,1,...,0,0,0,6,2,2,1,12,0,1


In [284]:
# test
temp_list = []
x = df.iloc[0, 0]
temp_list.append(feature_embedding_dict[0][x])
temp_list.append(feature_embedding_dict[0][x])
# a = pd.DataFrame(temp_list,columns=["区县_dim"+str(i) for i in range(feature_embedding_dict[0].shape[1])])
a = pd.DataFrame(temp_list,
                 columns=[
                     f"{df.iloc[:,0].name}_dim" + str(i)
                     for i in range(feature_embedding_dict[0].shape[1])
                 ])
a

Unnamed: 0,区县_dim0,区县_dim1,区县_dim2,区县_dim3,区县_dim4,区县_dim5,区县_dim6,区县_dim7,区县_dim8
0,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.68433,-0.381842,-0.766746
1,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.68433,-0.381842,-0.766746


In [293]:
## converting the values into dataframe and attaching them to the dataframe
# col 其实也是数字，第col列
def to_embed(df, col):
    temp_list = []
    for i in range(len(df.iloc[:, col])):
        # x 为第 i 行第 col 列的值
        x = df.iloc[i, col]
        temp_list.append(feature_embedding_dict[col][x])
    temp = pd.DataFrame(
        temp_list,
        columns=[
            f"{df.iloc[:,col].name}_dim" + str(i)
            for i in range(feature_embedding_dict[col].shape[1])
        ])
    return temp

In [298]:
embed_file = pd.DataFrame()
for col, col_name in enumerate(cate_col):
    temp = to_embed(df_cate, col)
    embed_file = pd.concat([embed_file, temp], axis=1)

In [299]:
embed_file

Unnamed: 0,区县_dim0,区县_dim1,区县_dim2,区县_dim3,区县_dim4,区县_dim5,区县_dim6,区县_dim7,区县_dim8,性别_dim0,...,抗生素名称.1_dim43,抗生素名称.1_dim44,抗生素名称.1_dim45,抗生素名称.1_dim46,是否采集_dim0,采样类型_dim0,采样类型_dim1,采样类型_dim2,采样类型_dim3,采样类型_dim4
0,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,0.489578,...,-0.921288,0.123755,0.497878,-1.011999,-2.358704,-1.039336,1.147461,-1.079278,-0.179188,1.506746
1,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,-0.558363,-1.228932,1.132666,-0.630467,-2.358704,-1.039336,1.147461,-1.079278,-0.179188,1.506746
2,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,-0.921288,0.123755,0.497878,-1.011999,-2.358704,-1.039336,1.147461,-1.079278,-0.179188,1.506746
3,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,0.281324,-0.944850,-1.700922,0.925396,-2.358704,0.680637,0.547636,-0.605645,0.413995,-1.852076
4,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,-0.921288,0.123755,0.497878,-1.011999,-2.358704,-1.039336,1.147461,-1.079278,-0.179188,1.506746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11595,-1.496031,1.331003,0.952992,-1.116918,-2.249046,-1.051940,1.261768,1.416876,-1.580085,-1.105435,...,1.824170,1.167687,1.993183,0.336311,-2.358704,-0.183470,-0.286464,-0.186309,-0.492161,1.245407
11596,-2.128901,-0.693583,-3.080359,-1.089935,0.578550,-0.567628,-0.037047,-0.369046,1.607046,0.489578,...,1.824170,1.167687,1.993183,0.336311,-2.358704,-0.183470,-0.286464,-0.186309,-0.492161,1.245407
11597,-2.128901,-0.693583,-3.080359,-1.089935,0.578550,-0.567628,-0.037047,-0.369046,1.607046,0.489578,...,-0.898020,0.491162,1.311534,1.074226,-2.358704,-1.039336,1.147461,-1.079278,-0.179188,1.506746
11598,-1.496031,1.331003,0.952992,-1.116918,-2.249046,-1.051940,1.261768,1.416876,-1.580085,0.489578,...,-0.898020,0.491162,1.311534,1.074226,-2.358704,0.680637,0.547636,-0.605645,0.413995,-1.852076


In [307]:
# 把数值变量和label都拼接起来
embed_file = pd.concat([embed_file, df_cont,df['细菌结果']], axis=1)
embed_file

Unnamed: 0,区县_dim0,区县_dim1,区县_dim2,区县_dim3,区县_dim4,区县_dim5,区县_dim6,区县_dim7,区县_dim8,性别_dim0,...,疑似病例人数,age,体温,呕吐频次,持续天数,腹泻量,腹泻频次,腹泻天数,疑似病例人数.1,细菌结果
0,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,0.489578,...,0.0,59,38.0,0.0,0.0,2.0,5.0,2.0,0.0,1
1,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,0.0,36,37.0,0.0,0.0,3.0,3.0,1.0,0.0,0
2,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,0.0,36,38.0,0.0,0.0,2.0,3.0,2.0,0.0,1
3,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,0.0,27,38.0,0.0,0.0,2.0,2.0,3.0,0.0,0
4,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,0.0,77,37.0,0.0,0.0,2.0,3.0,2.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11595,-1.496031,1.331003,0.952992,-1.116918,-2.249046,-1.051940,1.261768,1.416876,-1.580085,-1.105435,...,0.0,60,38.0,0.0,0.0,2.0,8.0,1.0,0.0,1
11596,-2.128901,-0.693583,-3.080359,-1.089935,0.578550,-0.567628,-0.037047,-0.369046,1.607046,0.489578,...,0.0,33,38.0,0.0,0.0,2.0,10.0,1.0,0.0,0
11597,-2.128901,-0.693583,-3.080359,-1.089935,0.578550,-0.567628,-0.037047,-0.369046,1.607046,0.489578,...,0.0,35,38.8,0.0,0.0,2.0,9.0,1.0,0.0,1
11598,-1.496031,1.331003,0.952992,-1.116918,-2.249046,-1.051940,1.261768,1.416876,-1.580085,0.489578,...,0.0,69,38.0,0.0,0.0,2.0,4.0,1.0,0.0,1


In [315]:
df_file = "data/fuxie_fillna_embed.tsv"
embed_file.to_csv(df_file, sep="\t", encoding="utf-8")

# References
1. https://github.com/oegedijk/keras-embeddings/blob/master/build_embeddings.py

In [10]:
df_file = "data/fuxie_fillna_embed.tsv"
df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")

In [13]:
df = df.drop("Unnamed: 0.1",axis=1)

In [14]:
df

Unnamed: 0,区县_dim0,区县_dim1,区县_dim2,区县_dim3,区县_dim4,区县_dim5,区县_dim6,区县_dim7,区县_dim8,性别_dim0,...,采样类型_dim4,age,体温,呕吐频次,持续天数,腹泻量,腹泻频次,腹泻天数,疑似病例人数,细菌结果
0,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,0.489578,...,1.506746,59,38.0,0.0,0.0,2.0,5.0,2.0,0.0,1
1,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,1.506746,36,37.0,0.0,0.0,3.0,3.0,1.0,0.0,0
2,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,1.506746,36,38.0,0.0,0.0,2.0,3.0,2.0,0.0,1
3,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,-1.852076,27,38.0,0.0,0.0,2.0,2.0,3.0,0.0,0
4,1.044764,-0.287459,-0.929226,1.363697,1.030347,0.165122,0.684330,-0.381842,-0.766746,-1.105435,...,1.506746,77,37.0,0.0,0.0,2.0,3.0,2.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11595,-1.496031,1.331003,0.952992,-1.116918,-2.249046,-1.051940,1.261768,1.416876,-1.580085,-1.105435,...,1.245407,60,38.0,0.0,0.0,2.0,8.0,1.0,0.0,1
11596,-2.128901,-0.693583,-3.080359,-1.089935,0.578550,-0.567628,-0.037047,-0.369046,1.607046,0.489578,...,1.245407,33,38.0,0.0,0.0,2.0,10.0,1.0,0.0,0
11597,-2.128901,-0.693583,-3.080359,-1.089935,0.578550,-0.567628,-0.037047,-0.369046,1.607046,0.489578,...,1.506746,35,38.8,0.0,0.0,2.0,9.0,1.0,0.0,1
11598,-1.496031,1.331003,0.952992,-1.116918,-2.249046,-1.051940,1.261768,1.416876,-1.580085,0.489578,...,-1.852076,69,38.0,0.0,0.0,2.0,4.0,1.0,0.0,1


In [11]:
df = df.drop(['age.1','体温.1','呕吐频次.1','持续天数.1','腹泻量.1','腹泻频次.1','腹泻天数.1','疑似病例人数.1'],axis=1)
df

KeyError: "['age.1' '体温.1' '呕吐频次.1' '持续天数.1' '腹泻量.1' '腹泻频次.1' '腹泻天数.1' '疑似病例人数.1'] not found in axis"

In [15]:
df_file = "data/fuxie_fillna_embed.tsv"
df.to_csv(df_file, sep="\t", encoding="utf-8")