In [1]:
# Notebook setup: imports, display options and the global warning policy.
# NOTE(review): import_ipynb is not referenced in the visible cells; presumably
# it is kept for its import-time side effect — confirm before removing.
import import_ipynb
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

## for data
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## for machine learning
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Chinese display
# SimHei is selected as the sans-serif font so Chinese labels can be rendered;
# with it, the unicode minus glyph is disabled so negative tick labels use '-'.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

## No warnings
# Silences ALL warnings for the rest of the session (including deprecations).
import warnings
warnings.filterwarnings('ignore') 

In [2]:
# Read the tab-separated input table; the first column of the file becomes
# the row index. The trailing expression displays (rows, columns).
df_file = "data/Diarrhea_onehot_382.tsv"
df = pd.read_csv(
    df_file,
    encoding="utf-8",
    index_col=0,
    sep="\t",
)
df.shape

(11600, 383)

In [3]:
class Logistic_Embedding_Model(nn.Module):
    """Logistic regression with a linear bottleneck ("embedding") layer.

    The input is projected to ``emb_szs`` dimensions by ``embedding``, mapped
    to a single value by ``linear`` and passed through a sigmoid, so the
    output lies in [0, 1] and can be fed to ``nn.BCELoss``.

    Args:
        num: Number of input features (382 in this notebook).
        emb_szs: Output width of the embedding layer.
    """

    def __init__(self, num, emb_szs):
        super().__init__()
        self.embedding = nn.Linear(num, emb_szs)
        self.linear = nn.Linear(emb_szs, 1)
        self.sm = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of shape ``(..., 1)`` for input ``x``.

        Args:
            x: Tensor whose last dimension equals ``num``. Any dtype is
                accepted; it is cast to the dtype of the layer weights.
        """
        # Cast BEFORE the first layer. The original cast ran after
        # ``embedding``, so a float64 input (e.g. built from a NumPy array)
        # raised a dtype mismatch before the cast was ever reached.
        x = x.to(self.embedding.weight.dtype)
        x = self.embedding(x)
        x = self.linear(x)
        return self.sm(x)

In [4]:
# Name of the label column (Chinese for "bacteria result").
y = '细菌结果'

# Stratified sampling: hold out 30% of the rows while keeping the label
# distribution of df[y] in both partitions; the seed fixes the split.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df[y],
)

# Feature-matrix shapes (label column excluded) and label means.
train_shape = df_train.drop(columns=y).shape
test_shape = df_test.drop(columns=y).shape
print("X_train shape:", train_shape, "| X_test shape:", test_shape)

train_mean = round(np.mean(df_train[y]), 2)
test_mean = round(np.mean(df_test[y]), 2)
print("y_train mean:", train_mean, "| y_test mean:", test_mean)

print('-' * 50)

# Relative frequency of each label value in each partition.
for heading, part in (("Train set：", df_train), ("Test set：", df_test)):
    print(heading)
    print(part[y].value_counts() / len(part[y]))

X_train shape: (8120, 382) | X_test shape: (3480, 382)
y_train mean: 0.2 | y_test mean: 0.2
--------------------------------------------------
Train set：
0.0    0.795813
1.0    0.204187
Name: 细菌结果, dtype: float64
Test set：
0.0    0.79569
1.0    0.20431
Name: 细菌结果, dtype: float64


In [5]:

# Label vectors as column tensors: the trailing axis added here gives them
# shape (n_rows, 1).
y_train = torch.tensor(df_train[y].to_numpy())[:, None]
y_test = torch.tensor(df_test[y].to_numpy())[:, None]

# Seed the torch RNG before the model is constructed.
torch.manual_seed(33)
# NOTE(review): `device` is not used by any cell visible here — confirm
# whether the model and tensors were meant to be moved to it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Use a fixed embedding width
emb_szs = 100
n_features = df_train.drop(columns=y).shape[1]
model = Logistic_Embedding_Model(n_features, emb_szs)

<torch._C.Generator at 0x1f4cc613f10>

In [6]:
# Training configuration: number of full-batch epochs, binary cross-entropy
# on the model's sigmoid output, and Adam over all model parameters.
epochs = 1000
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [7]:
from tqdm import tqdm

# Full-batch training. The feature matrix and float labels never change
# between epochs, so build the tensors once instead of on every iteration.
X_train = torch.FloatTensor(df_train.drop(y, axis=1).values)
y_true = y_train.float()

# Per-epoch training loss stored as plain Python floats. Appending the loss
# tensor itself would keep every epoch's autograd graph alive in memory.
loss_list = []

# Start training
with tqdm(range(epochs)) as tq:
    for epoch in tq:
        out = model(X_train)
        loss = loss_func(out, y_true)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metrics for this epoch, computed from the pre-step forward pass.
        print_loss = loss.item()
        loss_list.append(print_loss)
        mask = out.ge(0.5).float()  # threshold probabilities at 0.5
        correct = (mask == y_true).sum()
        acc = correct.item() / len(y_true)

        tq.set_postfix(
            {'loss': '%.03f' % print_loss, 'acc': '%.03f' % acc},
            refresh=False,
        )

100%|██████████| 1000/1000 [00:27<00:00, 36.41it/s, loss=0.385]


In [8]:
# Get embedding
# Inference only: no_grad avoids building an autograd graph for the whole
# dataset, so the result can be converted to NumPy directly.
with torch.no_grad():
    emb = model.embedding(torch.FloatTensor(df.drop(y, axis=1).values))

# Keep df's row index. df was loaded with index_col=0, and a default
# RangeIndex here would make later index-aligned operations (e.g. a
# column-wise pd.concat with df[y]) pair the wrong rows or introduce NaNs.
df_emb = pd.DataFrame(emb.numpy(), index=df.index)

In [9]:
# Append the label column to the embedding frame. Column-wise concat aligns
# rows by index label (not position), so df_emb and df must share an index.
# Re-running this cell appends the label column again.
df_emb = pd.concat(objs=[df_emb, df[y]], axis="columns")

In [10]:
# Write df_emb as tab-separated UTF-8 text. Note that this rebinds df_file,
# which previously held the input path.
df_file = "data/Diarrhea_embed_100.tsv"
df_emb.to_csv(path_or_buf=df_file, encoding="utf-8", sep="\t")