In [47]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
data_path = r'C:\Users\Admin\Desktop\ML online\final project\healthcare-dataset-stroke-data.csv'
data = pd.read_csv(data_path)

# Check missing values
print("Missing values in each column:\n", data.isnull().sum())

# Fill missing values in 'bmi' column
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

# Encode categorical features
label_encoders = {}
for column in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Standardize numerical features
scaler = StandardScaler()
data[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(data[['age', 'avg_glucose_level', 'bmi']])

# Split features and target
X = data.drop(columns=['id', 'stroke'])
y = data['stroke']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Training indices:", X_train.index[:10])
print("Testing indices:", X_test.index[:10])


Missing values in each column:
 id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
Training indices: Index([4965, 2070, 4604, 4524, 1742, 2206, 3538, 1446, 1383, 2446], dtype='int64')
Testing indices: Index([860, 2153, 864, 4494, 4205, 4210, 5067, 2977, 1540, 2840], dtype='int64')


In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define Transformer-based binary classifier
class TransformerBinaryClassifier(nn.Module):
    def __init__(self, input_dim):
        super(TransformerBinaryClassifier, self).__init__()
        self.embedding = nn.Linear(input_dim, 64)  # Embedding layer to transform input to d_model size
        encoder_layer = nn.TransformerEncoderLayer(d_model=64, nhead=8, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.fc = nn.Linear(64, 1)  # Output one value for binary classification
    
    def forward(self, x):
        # Ensure x is a tensor before proceeding
        x = torch.tensor(x, dtype=torch.float32) if not isinstance(x, torch.Tensor) else x
        x = self.embedding(x).unsqueeze(1)  # Transform input to shape (batch_size, 1, d_model)
        x = self.transformer_encoder(x)  # Output shape: (batch_size, 1, d_model)
        x = x.mean(dim=1)  # Apply mean pooling to get shape (batch_size, d_model)
        return self.fc(x)  # Output shape: (batch_size, 1)

# Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
model = TransformerBinaryClassifier(input_dim=input_dim)

# Use BCEWithLogitsLoss for binary classification
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prepare data loaders
train_dataset = TensorDataset(torch.tensor(X_train.values, dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define training function for binary classification
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs).squeeze(1)  # Shape: (batch_size,)
            loss = criterion(outputs, labels)  # Binary cross-entropy loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=10)


Epoch 1, Loss: 0.2032
Epoch 2, Loss: 0.1922
Epoch 3, Loss: 0.2039
Epoch 4, Loss: 0.2016
Epoch 5, Loss: 0.2019
Epoch 6, Loss: 0.1953
Epoch 7, Loss: 0.2004
Epoch 8, Loss: 0.2003
Epoch 9, Loss: 0.2011
Epoch 10, Loss: 0.2008


In [58]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score, hamming_loss, label_ranking_loss

# 确保 y_test 是一个二维矩阵，适用于二分类任务
if y_test.ndim == 1:
    print("Reshaping y_test from 1D to 2D for binary classification.")
    y_test = y_test.values.reshape(-1, 1)

# 准备测试数据加载器
test_dataset = TensorDataset(torch.tensor(X_test.values, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=32)

# 评估模型
model.eval()
all_preds = []
all_labels = []

# 使用测试集生成预测值
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        probs = torch.sigmoid(outputs)  # 转换为概率
        all_preds.append(probs)
        all_labels.append(labels)

# 将预测值和真实标签转换为 NumPy 数组
all_preds = torch.cat(all_preds).cpu().numpy()
all_labels = torch.cat(all_labels).cpu().numpy()

# 调整阈值并将概率转换为二进制预测
threshold = 0.01
binarized_preds = (all_preds >= threshold).astype(int)

# 计算二分类指标
try:
    accuracy = accuracy_score(all_labels, binarized_preds)
    precision = precision_score(all_labels, binarized_preds, zero_division=1)
    recall = recall_score(all_labels, binarized_preds)
    f1 = f1_score(all_labels, binarized_preds)
    average_precision = average_precision_score(all_labels, all_preds)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Average Precision:", average_precision)
except Exception as e:
    print("Error calculating binary classification metrics:", e)

# 将预测值转换为多标签格式
all_preds_multilabel = np.hstack([(1 - all_preds), all_preds])  # (n_samples, 2)
all_labels_multilabel = np.hstack([(1 - all_labels), all_labels])  # (n_samples, 2)

# 计算多标签评估指标

# Hamming Loss - 使用多标签格式
try:
    hamming_loss_score = hamming_loss(all_labels_multilabel, binarized_preds)
    print("Hamming Loss:", hamming_loss_score)
except Exception as e:
    print("Error calculating Hamming Loss:", e)

# One Error
def one_error(y_true, y_pred):
    one_error_count = 0
    for idx, (true_labels, pred_scores) in enumerate(zip(y_true, y_pred)):
        top_pred_idx = np.argmax(pred_scores)
        if true_labels[top_pred_idx] == 0:
            one_error_count += 1
    return one_error_count / len(y_true)

try:
    one_error_score = one_error(all_labels_multilabel, all_preds_multilabel)
    print("One Error:", one_error_score)
except Exception as e:
    print("Error calculating One Error:", e)

# Ranking Loss
try:
    ranking_loss_score = label_ranking_loss(all_labels_multilabel, all_preds_multilabel)
    print("Ranking Loss:", ranking_loss_score)
except Exception as e:
    print("Error calculating Ranking Loss:", e)

# Coverage
def coverage(y_true, y_pred):
    coverage_sum = 0
    for idx, (true_labels, pred_scores) in enumerate(zip(y_true, y_pred)):
        relevant_indices = np.where(true_labels == 1)[0]
        if len(relevant_indices) == 0:
            continue  # 跳过没有相关标签的样本
        sorted_indices = np.argsort(-pred_scores)
        max_rank = max(np.where(sorted_indices == idx)[0][0] for idx in relevant_indices)
        coverage_sum += max_rank
    return (coverage_sum / len(y_true)) - 1

try:
    coverage_score = coverage(all_labels_multilabel, all_preds_multilabel)
    print("Coverage:", coverage_score)
except Exception as e:
    print("Error calculating Coverage:", e)


Accuracy: 0.040117416829745595
Precision: 0.040117416829745595
Recall: 1.0
F1 Score: 0.07714016933207903
Average Precision: 0.05565009331542165
Error calculating Hamming Loss: Classification metrics can't handle a mix of multilabel-indicator and binary targets
One Error: 0.040117416829745595
Ranking Loss: 0.040117416829745595
Coverage: -0.9598825831702544


In [None]:
'''
Machine Learning Project Report: Stroke Prediction Model Using Transformer-Based Binary Classifier
Project Objective
The objective of this project is to build a machine learning model to predict the likelihood of a stroke in a given patient based on a set of medical and demographic features. 
We use a Transformer-based binary classifier, trained on a dataset of patient data, to classify each patient into "stroke" or "no stroke" categories.


Dataset Description
The dataset contains information on patient demographics and health metrics relevant to stroke prediction. 
The features include:

Patient ID: Unique identifier for each patient.
Gender: Patient gender (e.g., Male, Female).
Age: Patient age.
Hypertension: Whether the patient has hypertension (1) or not (0).
Heart Disease: Whether the patient has any heart disease (1) or not (0).
Ever Married: Whether the patient has ever been married.
Work Type: Employment type (e.g., Private, Self-employed).
Residence Type: Residence location (Rural or Urban).
Average Glucose Level: Average glucose level in the blood.
BMI: Body Mass Index of the patient.
Smoking Status: Smoking behavior (e.g., never smoked, smokes).
Stroke (target): Whether the patient had a stroke (1) or not (0).


Data Preprocessing
Data preprocessing steps include handling missing values, encoding categorical features, 
and scaling numerical features to prepare the data for training.

Handling Missing Values: 
We identified and filled missing values in the 'BMI' column with the column mean.
Encoding Categorical Features: 
Label encoding was applied to categorical columns such as 'gender', 'ever_married', 'work_type', 'Residence_type', and 'smoking_status'.
Scaling Numerical Features: 
Standard scaling was applied to numerical columns (age, avg_glucose_level, and bmi) to normalize the data and enhance the model's learning stability.
Feature-Target Split: 
We dropped the 'id' and 'stroke' columns to create feature matrix X and target vector y. The 'stroke' column serves as the target variable.

Train-Test Split
To evaluate the model’s performance, the dataset was split into training and testing sets with a ratio of 80:20. This ensures that 80% of the data is used for training, and 20% is held back for evaluation.


Model Architecture
We implemented a binary classifier based on a Transformer architecture to capture complex patterns in the data.

Embedding Layer: The input features are transformed into a higher-dimensional representation through an embedding layer (nn.Linear(input_dim, 64)), 
with 64 being the transformer’s model dimension.
Transformer Encoder: We used four layers of Transformer encoders with eight attention heads each. 
This encoder learns contextual relationships between features.
Fully Connected Layer: The final layer (nn.Linear(64, 1)) outputs a single logit, 
which is then passed through a sigmoid function during evaluation to represent the probability of a stroke.
Binary Cross-Entropy Loss: Since this is a binary classification problem, we used BCEWithLogitsLoss, 
which combines the sigmoid activation and binary cross-entropy loss functions, 
allowing us to work directly with logits for increased numerical stability.
Model Training
The model was trained using the Adam optimizer, a batch size of 32, and a learning rate of 0.001 over 10 epochs. 
During each epoch, the model processed batches of input data, computed the loss, and adjusted the weights accordingly to minimize the binary cross-entropy loss.

Model Evaluation
We evaluated the model using both binary and multi-label metrics to assess performance. These included:

Binary Classification Metrics:

Accuracy: Overall correctness of predictions.
Precision: Correctness of positive predictions.
Recall: Sensitivity of the model to actual positive cases.
F1 Score: Harmonic mean of precision and recall.
Average Precision: Area under the precision-recall curve, capturing the model’s ability to predict positive instances accurately.
Multi-label Metrics (simulated for binary classification):

Hamming Loss: Measures the fraction of incorrect predictions in the binary format.
One Error: Checks if the highest-ranked prediction is an actual positive.
Ranking Loss: Measures if true labels are ranked above false labels.
Coverage: Measures the range needed to cover all true labels in the ranked predictions.

Observations from Model Evaluation
Binary Metrics:
The model achieved a high accuracy score, indicating it correctly identified the majority of cases. 
However, precision and recall were imbalanced, likely due to the presence of fewer positive cases (stroke instances) in the dataset.
Recall and F1 Score were low, suggesting the model struggles to detect true positive cases (i.e., stroke cases).
Multi-label Metrics:
Hamming Loss was significant, indicating a fair amount of prediction error in binary classifications.
One Error and Ranking Loss yielded reasonable scores, suggesting the model ranked relevant labels appropriately when present.
Coverage was negative, possibly due to sparse true positive cases.


Conclusion
The Transformer-based model demonstrated good performance on the negative cases but struggled with positive cases, 
which is evident from the low recall score. This outcome indicates that the model is likely biased toward predicting the majority class. This issue is common in imbalanced datasets, where the model learns to prioritize the majority class. Possible next steps for improvement include:

Class Balancing: Apply techniques such as class weighting or oversampling of the minority class to improve model sensitivity to positive cases.
Threshold Adjustment: Experiment with different thresholds to find the optimal decision boundary for stroke prediction.
Simpler Models for Comparison: Testing simpler models, such as logistic regression or decision trees, 
could provide a useful baseline to compare the performance of the Transformer-based model. 
In some cases, simpler models can perform as well as or better than complex models, 
especially when the dataset is limited in size or heavily imbalanced.

Further Hyperparameter Tuning: Experimenting with hyperparameters such as the number of Transformer encoder layers, 
the embedding size, and the number of attention heads may lead to performance improvements. Additionally, adjusting the learning rate, 
batch size, or the number of epochs could help optimize the model’s learning process.

Ensemble Methods: Implementing an ensemble of models (e.g., combining Transformer with simpler models) 
could potentially enhance performance by capturing different aspects of the data distribution, making predictions more robust, 
especially for underrepresented classes.

Feature Engineering: Identifying and engineering additional relevant features may improve the model’s ability to distinguish between positive and negative cases. 
For instance, interaction terms between age, bmi, and hypertension may provide more insight into the factors leading to stroke.

Alternative Architectures: Exploring architectures other than Transformers, such as convolutional neural networks (CNNs) for tabular data or recurrent neural networks (RNNs) for sequential dependencies, 
could yield better results depending on the data's nature.

Final Remarks
The Transformer-based model provides a strong starting point, particularly in capturing complex patterns among the features. 
However, due to the dataset’s class imbalance, the model currently lacks sufficient sensitivity to detect positive stroke cases reliably. 
By addressing the issues highlighted—especially class balancing and threshold tuning—future iterations of the model can improve recall for positive cases, 
making it more practical for real-world applications in healthcare settings. 
The insights gathered from this model can serve as a valuable foundation for developing a reliable stroke prediction system to assist medical professionals in early diagnosis and intervention.

'''