In [1]:
# Jupyter notebook: display the result of every top-level expression in a cell, not only the last one
import import_ipynb
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## for data
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for machine learning
from sklearn.model_selection import train_test_split

%matplotlib inline

## Chinese display
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

## No warnings
import warnings
warnings.filterwarnings('ignore') 

In [44]:
# Read the tab-separated dataset; the first column of the file becomes the row index.
df_file = "data/Diarrhea_fillna.tsv"
df = pd.read_csv(
    df_file,
    sep="\t",
    index_col=0,
    encoding="utf-8",
)
df.shape

# Binarise the target column: "阴性" becomes 0, every other value becomes 1.
y = "细菌结果"
df[y] = df[y].ne("阴性").astype("int64")

(11600, 46)

In [45]:
# Label column name, kept as a one-element list.
y_col = ['细菌结果']

# Partition the columns by storage dtype: object columns are treated as
# categorical features, all remaining columns as continuous features.
is_object = df.dtypes == 'object'
cate_col = is_object.index[is_object.to_numpy()].tolist()
cont_col = is_object.index[~is_object.to_numpy()].tolist()

# The label is not a feature. list.remove raises ValueError if the label is not
# among the non-object columns.
cont_col.remove('细菌结果')

In [46]:
# Report how many columns were classified as categorical.
n_cate_features = len(cate_col)
print("Number of category features: ", n_cate_features)

Number of category features:  37


In [47]:
# Replace every categorical column by the integer codes produced by factorize.
# NOTE(review): this cell overwrites `df` in place, so running it twice
# re-encodes the already-encoded columns and applies the shift below again.
for col in cate_col:
    codes, _uniques = pd.factorize(df[col])
    df[col] = codes

# Shift this one column up by 1.
# NOTE(review): presumably this moves a -1 "missing" code to 0 so the column can
# be used as an embedding index -- confirm against the data.
df['腹泻性质'] = df['腹泻性质'].add(1)

In [48]:
class Logistic_Embedding_Model(nn.Module):
    """Logistic regression on top of learned categorical embeddings.

    Each categorical feature is looked up in its own ``nn.Embedding`` table; the
    resulting vectors are concatenated with the continuous features and passed
    through a single linear layer followed by a sigmoid.

    Args:
        emb_szs: Re-iterable sequence of ``(num_embeddings, embedding_dim)``
            pairs, one per categorical feature, in the same order as the
            columns of ``x_cat``.
        n_cont: Number of continuous features (columns of ``x_cont``).
    """

    def __init__(self, emb_szs, n_cont):
        super(Logistic_Embedding_Model, self).__init__()
        # One embedding table per categorical feature.
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        n_emb = sum(nf for _, nf in emb_szs)
        # Linear layer over [all embedding vectors | continuous features].
        self.linear = nn.Linear(n_emb + n_cont, 1)
        self.sm = nn.Sigmoid()

    @staticmethod
    def _to_tensor(data, dtype, device):
        """Convert a tensor, ndarray or pandas object to a tensor of ``dtype`` on ``device``."""
        if isinstance(data, torch.Tensor):
            return data.to(device=device, dtype=dtype)
        if hasattr(data, "to_numpy"):
            # pandas DataFrame / Series -> ndarray
            data = data.to_numpy()
        return torch.as_tensor(data, dtype=dtype, device=device)

    def forward(self, x_cat, x_cont):
        """Return the predicted probability for each row.

        Args:
            x_cat: 2-D integer codes, one column per embedding table. May be a
                pandas DataFrame, a NumPy array or a tensor. Ignored when the
                model has no embedding tables.
            x_cont: 2-D continuous features (DataFrame, ndarray or tensor).

        Returns:
            Float tensor of shape ``(n_rows, 1)`` with values in (0, 1).
        """
        # Put the inputs wherever the parameters live so the model also works
        # after ``model.to(device)``.
        device = self.linear.weight.device

        parts = []
        if len(self.embeds) > 0:
            # Convert the whole code matrix once instead of once per column.
            cat_idx = self._to_tensor(x_cat, torch.long, device)
            parts.extend(emb(cat_idx[:, i]) for i, emb in enumerate(self.embeds))

        # Cast continuous features to float32 up front so the concatenation
        # does not go through float64 and back.
        parts.append(self._to_tensor(x_cont, torch.float32, device))

        x = torch.cat(parts, 1)
        return self.sm(self.linear(x))

In [49]:
# Stratified sampling
df_train, df_test = train_test_split(df, test_size=0.3,
                                     stratify=df[y], random_state=42)

# print info
print("X_train shape:", df_train.drop(y, axis=1).shape,
      "| X_test shape:", df_test.drop(y, axis=1).shape)
print("y_train mean:", round(
    np.mean(df_train[y]), 2), "| y_test mean:", round(np.mean(df_test[y]), 2))

print('-'*50)

print("Train set：")
print(df_train[y].value_counts() / len(df_train[y]))
print("Test set：")
print(df_test[y].value_counts() / len(df_test[y]))

X_train shape: (8120, 45) | X_test shape: (3480, 45)
y_train mean: 0.2 | y_test mean: 0.2
--------------------------------------------------
Train set：
0    0.795813
1    0.204187
Name: 细菌结果, dtype: float64
Test set：
0    0.79569
1    0.20431
Name: 细菌结果, dtype: float64


In [50]:
# Split each partition into its categorical and continuous feature frames.
cat_train, cont_train = df_train[cate_col], df_train[cont_col]
cat_test, cont_test = df_test[cate_col], df_test[cont_col]

# Targets as column tensors of shape (n_rows, 1), matching the model output.
y_train = torch.tensor(df_train[y].values).unsqueeze(1)
y_test = torch.tensor(df_test[y].values).unsqueeze(1)

# Seed before the model is built so parameter initialisation is reproducible.
torch.manual_seed(33)
# NOTE(review): `device` is computed here, but nothing in the visible cells
# moves the model or the tensors to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# An embedding table with N rows only accepts indices in [0, N). Negative codes
# are rejected here with a clear message instead of failing inside the forward pass.
negative_code_cols = [col for col in cate_col if df[col].min() < 0]
if negative_code_cols:
    raise ValueError(
        f"Categorical columns contain negative codes: {negative_code_cols}"
    )

# Size each table from the largest code (max + 1) rather than the number of
# distinct codes: the two agree when codes are exactly 0..N-1, but max + 1 stays
# valid when codes have gaps or do not start at 0.
cate_szs = [int(df[col].max()) + 1 for col in cate_col]
# Embedding width: about half the number of categories, capped at 50.
emb_szs = [(size, min(50, (size + 1)//2)) for size in cate_szs]
model = Logistic_Embedding_Model(emb_szs, n_cont=cont_train.shape[1])

<torch._C.Generator at 0x1e5b94b5230>

In [51]:
# Define loss functions and optimizers
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),lr = 0.001)
epochs = 1000

In [52]:
# Training loss per epoch, stored as plain Python floats.
loss_list = []

# Full-batch training: every epoch runs the whole training set through the model.
for epoch in range(epochs):
    out = model(cat_train, cont_train)
    loss = loss_func(out, y_train.float())

    # Store the scalar value, not the tensor: appending `loss` itself would keep
    # a graph-attached tensor alive for every epoch and leave `loss_list`
    # unusable for plotting without detaching.
    print_loss = loss.item()
    loss_list.append(print_loss)

    # Training accuracy at a 0.5 threshold. This is bookkeeping only, so no
    # gradients are tracked for it.
    with torch.no_grad():
        mask = out.ge(0.5).float()
        correct = (mask == y_train).sum()
        acc = correct.item() / len(y_train)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# After the loop, `print_loss` and `acc` hold the values from the last epoch.

In [53]:
# Get embedding
feature_embedding_dict = {}
for i,matrix in enumerate(model.embeds.parameters()):
    feature_embedding_dict[i] = matrix.detach().numpy()   

In [54]:
# Inspect one learned vector: row 1 of the matrix stored under key 0.
feature_embedding_dict[0][1, :]

array([ 1.2983361 ,  0.39348063, -0.4909039 ,  0.08802915, -1.4797117 ,
        0.43262202,  0.5034583 , -0.45339465, -1.4613835 ], dtype=float32)

In [67]:
# Replace the index with a fresh 0..n-1 range and discard the old one.
# `drop=True` does this directly; resetting and then dropping a column called
# 'index' fails when the index has a name and would remove a genuine data
# column if one were named 'index'.
df = df.reset_index(drop=True)

In [68]:
# Categorical and continuous feature frames taken from the full (re-indexed) dataset.
df_cate, df_cont = df.loc[:, cate_col], df.loc[:, cont_col]

In [69]:
## converting the values into dataframe and attaching them to the dataframe
def to_embed(df, col):
    temp_list = []
    for i in range(len(df.iloc[:, col])):
        # x 为第 i 行第 col 列的值
        x = df.iloc[i, col]
        temp_list.append(feature_embedding_dict[col][x])
    temp = pd.DataFrame(
        temp_list,
        columns=[
            f"{df.iloc[:,col].name}_dim" + str(i)
            for i in range(feature_embedding_dict[col].shape[1])
        ])
    return temp

In [70]:
# Build the embedded block for every categorical column, then join all blocks
# side by side in one concat. Concatenating inside the loop re-copies the
# growing frame on every iteration.
embedded_blocks = [to_embed(df_cate, col) for col in range(len(cate_col))]
# pd.concat raises on an empty list, so fall back to an empty frame.
embed_file = pd.concat(embedded_blocks, axis=1) if embedded_blocks else pd.DataFrame()

In [71]:
# Append the continuous features and the label column to the embedded
# categorical features, aligned on the row index.
# NOTE(review): `embed_file` is overwritten with the result, so running this
# cell a second time appends the same columns again.
final_parts = [embed_file, df_cont, df['细菌结果']]
embed_file = pd.concat(final_parts, axis="columns")
embed_file.shape

(11600, 220)

In [72]:
# Write the assembled table as a UTF-8, tab-separated file.
# NOTE: `df_file` held the input path earlier in the notebook and is rebound
# to the output path here.
df_file = "data/Diarrhea_embed_219.tsv"
embed_file.to_csv(path_or_buf=df_file, sep="\t", encoding="utf-8")