# EDA

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load the binned feature table and the label table from the curated data folder.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column — presumably a
# saved index that is kept as a feature downstream; confirm this is intended.
data_raw = pd.read_csv('../data/curated/combined_data_div_binned.csv')
label = pd.read_csv('../data/curated/combined_label.csv')

# Report the table dimensions, then preview the first rows as the cell output.
n_rows, n_cols = data_raw.shape
print(f"Number of rows in data = {n_rows}")
print(f"Number of columns in data = {n_cols}")
print("\n")
print("**Sample data:**")
data_raw.head()

Number of rows in data = 7971
Number of columns in data = 528


**Sample data:**


Unnamed: 0,PtAge,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,PLT,RDW-SD,...,CRP_div_HFLC1,CRP_div_HFLC2,CRP_div_NRBC#,CRP_div_NRBC%,HFLC1_div_HFLC2,HFLC1_div_NRBC#,HFLC1_div_NRBC%,HFLC2_div_NRBC#,HFLC2_div_NRBC%,NRBC#_div_NRBC%
0,9.0,5.0,1.0,1.0,1.0,1.0,1.0,4.0,5.0,0.0,...,8.0,7.0,9.0,9.0,2.0,6.0,6.0,6.0,6.0,3.0
1,7.0,1.0,3.0,3.0,4.0,8.0,10.0,5.0,1.0,8.0,...,9.0,3.0,4.0,4.0,1.0,3.0,3.0,5.0,5.0,3.0
2,4.0,0.0,3.0,3.0,3.0,8.0,10.0,6.0,7.0,6.0,...,9.0,10.0,10.0,10.0,0.0,6.0,6.0,2.0,2.0,3.0
3,2.0,7.0,2.0,3.0,2.0,8.0,10.0,7.0,5.0,7.0,...,10.0,10.0,10.0,10.0,9.0,3.0,3.0,2.0,2.0,3.0
4,1.0,2.0,6.0,5.0,5.0,5.0,6.0,5.0,3.0,6.0,...,7.0,6.0,10.0,10.0,3.0,7.0,7.0,7.0,7.0,3.0


In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data_raw.describe()

Unnamed: 0,PtAge,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,PLT,RDW-SD,...,CRP_div_HFLC1,CRP_div_HFLC2,CRP_div_NRBC#,CRP_div_NRBC%,HFLC1_div_HFLC2,HFLC1_div_NRBC#,HFLC1_div_NRBC%,HFLC2_div_NRBC#,HFLC2_div_NRBC%,NRBC#_div_NRBC%
count,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,...,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0
mean,4.461297,3.410613,5.76954,5.060595,5.276001,6.471584,7.8206,5.149542,4.167482,6.029106,...,6.918078,5.698658,5.553381,5.55313,5.031866,5.170368,5.168988,4.585497,4.577343,2.305357
std,2.405994,2.208716,2.9261,2.639335,2.655332,2.201126,2.863509,2.095396,2.108578,2.224627,...,2.553211,2.530719,3.264232,3.264544,2.81116,2.604615,2.606366,2.391182,2.399683,0.951836
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,3.0,3.0,3.0,5.0,6.0,4.0,3.0,5.0,...,5.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,1.0
50%,4.0,2.0,6.0,5.0,5.0,7.0,9.0,5.0,3.0,6.0,...,7.0,6.0,5.0,5.0,6.0,6.0,6.0,5.0,5.0,3.0
75%,7.0,5.0,9.0,8.0,8.0,8.0,10.0,7.0,5.0,8.0,...,9.0,7.0,8.0,8.0,8.0,7.0,7.0,6.0,6.0,3.0
max,9.0,9.0,10.0,9.0,10.0,10.0,11.0,9.0,9.0,10.0,...,10.0,10.0,10.0,10.0,9.0,10.0,10.0,9.0,9.0,3.0


### Neural network for multi-label classification

In [4]:
class CustomDataset(Dataset):
    """Map-style dataset pairing each feature row with its label row.

    Args:
        features: Indexable, sized container of samples (e.g. a 2-D tensor).
        labels: Indexable, sized container of targets aligned with ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # Fail fast: __len__ is driven by `features` alone, so a shorter
        # `labels` would otherwise only surface as an IndexError mid-iteration.
        if len(features) != len(labels):
            raise ValueError(
                "features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, labels)`` pair stored at ``idx``."""
        return self.features[idx], self.labels[idx]


In [5]:
class MultiLabelNN(torch.nn.Module):
    """Feed-forward network with two ReLU hidden layers and sigmoid outputs.

    Args:
        in_features: Width of each input row. Defaults to 528, the number of
            columns in ``data_raw``, which is fed to the model unchanged
            (the previous hard-coded 628 did not match the data).
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        out_features: Number of labels predicted per sample.
    """

    def __init__(self, in_features=528, hidden1=128, hidden2=64, out_features=2):
        super().__init__()
        self.layer1 = torch.nn.Linear(in_features, hidden1)
        self.layer2 = torch.nn.Linear(hidden1, hidden2)
        self.output = torch.nn.Linear(hidden2, out_features)

    def forward(self, x):
        """Return one sigmoid score in [0, 1] per label for each input row."""
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        # Sigmoid is applied element-wise, so each label gets its own score.
        return torch.sigmoid(self.output(x))


model = MultiLabelNN()

In [6]:
# Hold out 20% of the rows for testing; targets are the first two label columns.
X_train, X_test, y_train, y_test = train_test_split(
    data_raw,
    label.iloc[:, :2],
    test_size=0.2,
    random_state=42,
)


In [7]:
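# NOTE: the training loop below is disabled and `train_dataset` is not defined
# in the cells shown here, so `model` is later evaluated with its initial weights.
# criterion = torch.nn.BCELoss()  # binary cross-entropy loss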
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# num_epochs = 10
# dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # Define the dataloader

# for epoch in range(num_epochs):
#     for inputs, labels in dataloader:
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

In [8]:
# data_raw = data_raw.drop(['FYZ-IgM',
#        '甲流', 'Diagnosis', 'diagnosis_tokenized'], axis=1)

In [9]:
# Convert each split from a pandas object to a float32 tensor, in one pass.
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.FloatTensor(split.values)
    for split in (X_train, y_train, X_test, y_test)
)

In [10]:
# Wrap the held-out tensors and batch them in their original order (no shuffling).
test_dataset = CustomDataset(features=X_test_tensor, labels=y_test_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader, threshold=0.5):
    """Score a multi-label classifier on every batch of a data loader.

    Args:
        model: Module called as ``model(inputs)``; its outputs are compared
            against ``threshold`` to obtain binary predictions.
        test_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        threshold: Outputs strictly greater than this value count as a
            positive label. Defaults to 0.5.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are macro-averaged.

    Note:
        Calls ``model.eval()`` and does not restore the previous mode.
    """
    model.eval()  # switch the model to evaluation mode
    true_labels = []
    predictions = []

    with torch.no_grad():  # gradients are not needed for evaluation
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predicted = outputs > threshold  # binarize the scores

            # .cpu() returns the tensor itself when it already lives on the CPU
            # and makes .numpy() valid for tensors held on another device.
            true_labels.append(labels.cpu().numpy())
            predictions.append(predicted.cpu().numpy())

    # Stack the per-batch arrays into single 2-D arrays (rows = samples).
    true_labels = np.vstack(true_labels)
    predictions = np.vstack(predictions)

    # Compute the performance metrics.
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='macro')
    recall = recall_score(true_labels, predictions, average='macro')
    f1 = f1_score(true_labels, predictions, average='macro')

    return accuracy, precision, recall, f1

In [12]:
# Evaluate on the held-out split and print one "Name: value" line per metric.
accuracy, precision, recall, f1 = evaluate_model(model, test_loader)
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

: 

: 