# EDA and multi-label neural-network baseline

In [8]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [9]:
# Read the binned feature table and the label table from the curated data folder.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved row index being read back as a feature -- confirm whether it should be kept.
data_raw = pd.read_csv('../data/curated/combined_data_div_binned.csv')
label = pd.read_csv('../data/curated/combined_label.csv')

# Report the size of the feature table, then preview its first rows.
n_rows, n_cols = data_raw.shape
print("Number of rows in data =", n_rows)
print("Number of columns in data =", n_cols)
print("\n")
print("**Sample data:**")
data_raw.head()

Number of rows in data = 7971
Number of columns in data = 528


**Sample data:**


Unnamed: 0,PtAge,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,PLT,RDW-SD,...,CRP_div_HFLC1,CRP_div_HFLC2,CRP_div_NRBC#,CRP_div_NRBC%,HFLC1_div_HFLC2,HFLC1_div_NRBC#,HFLC1_div_NRBC%,HFLC2_div_NRBC#,HFLC2_div_NRBC%,NRBC#_div_NRBC%
0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,...,3.0,3.0,3.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0
1,2.0,1.0,2.0,1.0,1.0,3.0,3.0,3.0,1.0,2.0,...,3.0,1.0,3.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0
2,2.0,3.0,2.0,1.0,1.0,3.0,3.0,3.0,3.0,1.0,...,3.0,3.0,3.0,3.0,0.0,1.0,1.0,1.0,1.0,3.0
3,2.0,3.0,2.0,1.0,1.0,3.0,3.0,3.0,1.0,1.0,...,3.0,3.0,3.0,3.0,2.0,1.0,1.0,1.0,1.0,3.0
4,1.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,...,3.0,3.0,3.0,3.0,1.0,2.0,2.0,3.0,3.0,3.0


In [10]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the feature table.
data_raw.describe()

Unnamed: 0,PtAge,WBC,RBC,HGB,HCT,MCV,MCH,MCHC,PLT,RDW-SD,...,CRP_div_HFLC1,CRP_div_HFLC2,CRP_div_NRBC#,CRP_div_NRBC%,HFLC1_div_HFLC2,HFLC1_div_NRBC#,HFLC1_div_NRBC%,HFLC2_div_NRBC#,HFLC2_div_NRBC%,NRBC#_div_NRBC%
count,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,...,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0,7971.0
mean,1.858738,1.430059,1.942542,1.686739,1.717852,2.091707,2.29783,2.531928,1.366955,1.391795,...,2.500314,2.481872,2.277004,2.277004,1.19182,1.433321,1.433321,1.81922,1.81922,2.305357
std,0.348674,0.699144,0.237005,0.877345,0.905057,0.8482,0.894915,0.664065,0.767899,0.587556,...,0.837241,0.887253,0.991708,0.991708,0.413345,0.764267,0.764267,0.792887,0.792887,0.951836
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,2.0,1.0,2.0,1.0,1.0,2.0,3.0,3.0,1.0,1.0,...,3.0,3.0,3.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0
75%,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,1.0,2.0,...,3.0,3.0,3.0,3.0,1.0,2.0,2.0,2.0,2.0,3.0
max,2.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0


### Neural network for multi-label classification

In [11]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label row.

    Args:
        features: Sized, indexable container of samples (e.g. a tensor whose
            first dimension indexes samples).
        labels: Sized, indexable container holding one entry per sample.

    Raises:
        ValueError: If ``features`` and ``labels`` have different lengths.
    """

    def __init__(self, features, labels):
        # Fail early: with mismatched lengths the dataset would either raise an
        # IndexError in the middle of an epoch or silently ignore trailing labels.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, labels)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]


In [12]:
class MultiLabelNN(torch.nn.Module):
    """Feed-forward network with two hidden ReLU layers and a sigmoid output.

    Args:
        in_features: Width of the input feature vector.
        hidden1: Number of units in the first hidden layer.
        hidden2: Number of units in the second hidden layer.
        out_features: Number of output labels.

    The defaults reproduce the sizes that were previously hard-coded
    (628 -> 128 -> 64 -> 2), so ``MultiLabelNN()`` builds the same architecture.
    """

    def __init__(self, in_features=628, hidden1=128, hidden2=64, out_features=2):
        super().__init__()
        self.layer1 = torch.nn.Linear(in_features, hidden1)
        self.layer2 = torch.nn.Linear(hidden1, hidden2)
        self.output = torch.nn.Linear(hidden2, out_features)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, out_features)`` values in [0, 1]."""
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        # Sigmoid (not softmax): every label gets its own independent score in [0, 1].
        return torch.sigmoid(self.output(x))


# NOTE(review): the default input width is 628, but data_raw is reported as having
# 528 columns and is split into X_train/X_test without dropping any, so this model
# would not accept those tensors. Consider
# `MultiLabelNN(in_features=data_raw.shape[1])` -- confirm the intended width.
model = MultiLabelNN()

In [13]:
# Hold out 20% of the rows for testing; the targets are the first two label columns.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_raw,
    label.iloc[:, :2],
    test_size=0.2,
    random_state=42,
)


In [14]:
# NOTE(review): this training loop is disabled, and `train_dataset` is not defined
# in any cell visible here, so it cannot simply be uncommented as-is. With this
# cell disabled, the visible cells never fit `model` before it is evaluated --
# confirm where the trained weights behind the reported metrics come from.
# criterion = torch.nn.BCELoss()  # binary cross-entropy loss
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# num_epochs = 10
# dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # Define the dataloader

# for epoch in range(num_epochs):
#     for inputs, labels in dataloader:
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

In [15]:
# Disabled: would drop the four listed columns from data_raw.
# NOTE(review): this cell comes after the train/test split, so re-enabling it
# here would not change X_train / X_test -- confirm whether it is still needed.
# data_raw = data_raw.drop(['FYZ-IgM',
#        '甲流', 'Diagnosis', 'diagnosis_tokenized'], axis=1)

In [16]:
# Convert each split from a pandas object to a float32 tensor, in the same
# order as before: train features, train labels, test features, test labels.
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.FloatTensor(split.values)
    for split in (X_train, y_train, X_test, y_test)
)

: 

: 

In [None]:
# Wrap the held-out tensors in a dataset and iterate over it in fixed order,
# 32 samples per batch (no shuffling is needed for evaluation).
test_dataset = CustomDataset(features=X_test_tensor, labels=y_test_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader, threshold=0.5):
    """Score a multi-label classifier on every batch of a data loader.

    The model is switched to evaluation mode (and left in that mode), run over
    ``test_loader`` without gradient tracking, and its outputs are binarised
    with ``outputs > threshold`` before being compared with the true labels.

    Args:
        model: Module called as ``model(inputs)``; its outputs are compared
            directly against ``threshold``, so they are expected to be
            probabilities (e.g. sigmoid outputs).
        test_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        threshold: Cut-off above which a label is predicted as present.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are macro-averaged over labels. Accuracy comes from scikit-learn's
        ``accuracy_score``, which for 2-D multi-label input counts a sample as
        correct only when all of its labels match.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()  # evaluation mode: disables dropout / batch-norm updates
    true_batches = []
    predicted_batches = []

    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predicted = outputs > threshold  # binarise each label independently

            # .cpu() is a no-op for CPU tensors and lets .numpy() work when the
            # batch lives on another device.
            true_batches.append(labels.cpu().numpy())
            predicted_batches.append(predicted.cpu().numpy())

    if not true_batches:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Stack the per-batch arrays into 2-D (n_samples, n_labels) arrays.
    true_labels = np.vstack(true_batches)
    predictions = np.vstack(predicted_batches)

    # Compute the performance metrics.
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='macro')
    recall = recall_score(true_labels, predictions, average='macro')
    f1 = f1_score(true_labels, predictions, average='macro')

    return accuracy, precision, recall, f1

In [None]:
# Evaluate the model on the held-out loader and print one metric per line.
accuracy, precision, recall, f1 = evaluate_model(model, test_loader)
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    sep='\n',
)

Accuracy: 0.6978056426332289
Precision: 0.833423229054297
Recall: 0.5610021786492374
F1 Score: 0.6671454545988609
