In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
data_path = r'C:\Users\Admin\Desktop\ML online\final project\healthcare-dataset-stroke-data.csv'
data = pd.read_csv(data_path)

# Check missing values
print("Missing values in each column:\n", data.isnull().sum())

# Fill missing values in 'bmi' column
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

# Encode categorical features
label_encoders = {}
for column in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Standardize numerical features
scaler = StandardScaler()
data[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(data[['age', 'avg_glucose_level', 'bmi']])

# Split features and target
X = data.drop(columns=['id', 'stroke'])
y = data['stroke']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Training indices:", X_train.index[:10])
print("Testing indices:", X_test.index[:10])


Missing values in each column:
 id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
Training indices: Index([434, 725, 783, 2912, 2329, 599, 390, 2730, 2838, 3837], dtype='int64')
Testing indices: Index([184, 47, 1746, 3307, 2615, 1406, 1853, 2902, 2829, 2713], dtype='int64')


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# Define Transformer-based binary classifier with increased model complexity
class TransformerBinaryClassifier(nn.Module):
    def __init__(self, input_dim):
        super(TransformerBinaryClassifier, self).__init__()
        self.embedding = nn.Linear(input_dim, 256)  # Increase embedding dimension
        encoder_layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=8)  # Increase Transformer layers
        self.fc = nn.Linear(256, 1)  # Output one value for binary classification
    
    def forward(self, x):
        x = torch.tensor(x, dtype=torch.float32) if not isinstance(x, torch.Tensor) else x
        x = self.embedding(x).unsqueeze(1)  # Shape: (batch_size, 1, d_model)
        x = self.transformer_encoder(x)  # Output shape: (batch_size, 1, d_model)
        x = x.mean(dim=1)  # Apply mean pooling
        return self.fc(x)  # Shape: (batch_size, 1)

# Prepare data and apply oversampling to balance classes
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Create DataLoader with resampled data
train_dataset = TensorDataset(torch.tensor(X_resampled.values, dtype=torch.float32), torch.tensor(y_resampled.values, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Compute class weights and apply to loss function
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_train)
class_weights_tensor = torch.tensor([class_weights[1]], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor)

# Initialize model, optimizer with reduced learning rate, and learning rate scheduler
input_dim = X_train.shape[1]
model = TransformerBinaryClassifier(input_dim=input_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # Reduce initial learning rate
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)  # Decay learning rate every 5 epochs

# Training function with best model saving
def train_model(model, train_loader, criterion, optimizer, scheduler, epochs=10):
    model.train()
    best_loss = float('inf')
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs).squeeze(1)  # Shape: (batch_size,)
            loss = criterion(outputs, labels)  # Binary cross-entropy loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Step scheduler and save best model
        scheduler.step()
        epoch_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")
        
        # Save best model based on loss
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"Best model saved at epoch {epoch + 1} with loss {best_loss:.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=10)


Epoch 1, Loss: 1.2670
Best model saved at epoch 1 with loss 1.2670
Epoch 2, Loss: 1.1575
Best model saved at epoch 2 with loss 1.1575
Epoch 3, Loss: 1.1122
Best model saved at epoch 3 with loss 1.1122
Epoch 4, Loss: 1.1048
Best model saved at epoch 4 with loss 1.1048
Epoch 5, Loss: 1.0648
Best model saved at epoch 5 with loss 1.0648
Epoch 6, Loss: 0.9658
Best model saved at epoch 6 with loss 0.9658
Epoch 7, Loss: 0.9244
Best model saved at epoch 7 with loss 0.9244
Epoch 8, Loss: 0.9000
Best model saved at epoch 8 with loss 0.9000
Epoch 9, Loss: 0.8781
Best model saved at epoch 9 with loss 0.8781
Epoch 10, Loss: 0.8660
Best model saved at epoch 10 with loss 0.8660


In [33]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, precision_recall_curve

# 确保 y_test 是二维的
if y_test.ndim == 1:
    y_test = y_test.values.reshape(-1, 1)

# 测试数据加载器
test_dataset = TensorDataset(torch.tensor(X_test.values, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))
test_loader = DataLoader(test_dataset, batch_size=32)

# 模型评估
model.eval()
all_preds = []
all_labels = []

# 获取预测概率
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        probs = torch.sigmoid(outputs).squeeze(1)  # 转换为概率
        all_preds.extend(probs.cpu().numpy())
        all_labels.extend(labels.cpu().numpy().flatten())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# 优化阈值，避免计算过程中出现 NaN
def optimize_threshold(y_true, y_pred_probs):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_pred_probs)
    f1_scores = np.zeros_like(precisions)  # 初始化 F1 分数数组
    
    # 计算 F1 分数，仅在 precisions + recalls 非 0 时计算
    for i in range(len(f1_scores)):
        if precisions[i] + recalls[i] != 0:
            f1_scores[i] = 2 * (precisions[i] * recalls[i]) / (precisions[i] + recalls[i])
    
    # 找到 F1 分数最高的阈值索引
    optimal_idx = np.argmax(f1_scores)
    return thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5  # 如果出界则返回默认阈值

# 找到最佳阈值
optimal_threshold = optimize_threshold(all_labels, all_preds)
print(f"Optimal threshold: {optimal_threshold}")

# 使用最佳阈值生成二进制预测
binarized_preds = (all_preds >= optimal_threshold).astype(int)

# **二分类评估指标**
accuracy = accuracy_score(all_labels, binarized_preds)
precision = precision_score(all_labels, binarized_preds, zero_division=1)
recall = recall_score(all_labels, binarized_preds)
f1 = f1_score(all_labels, binarized_preds)
auc_roc = roc_auc_score(all_labels, all_preds)  # AUC-ROC
average_precision = average_precision_score(all_labels, all_preds)  # AUC-PR

print("Binary Classification Metrics at Optimal Threshold:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {auc_roc:.4f}")
print(f"AUC-PR: {average_precision:.4f}")


Optimal threshold: 0.9777963757514954
Binary Classification Metrics at Optimal Threshold:
Accuracy: 0.9384
Precision: 0.3023
Recall: 0.2826
F1 Score: 0.2921
AUC-ROC: 0.8519
AUC-PR: 0.2066


In [19]:
'''
Machine Learning Project Report: Stroke Prediction Model Using Transformer-Based Binary Classifier
Project Objective
The objective of this project is to build a machine learning model to predict the likelihood of a stroke in a given patient based on medical and demographic features. A Transformer-based binary classifier was implemented and trained on a dataset of patient data to classify individuals into "stroke" or "no stroke" categories.

Dataset Description
The dataset contains patient demographic and health-related attributes relevant to stroke prediction, including:

Patient ID: Unique identifier for each patient.
Gender: Patient gender (e.g., Male, Female).
Age: Patient's age.
Hypertension: Presence of hypertension (1 for yes, 0 for no).
Heart Disease: Presence of heart disease (1 for yes, 0 for no).
Ever Married: Marital status (Yes/No).
Work Type: Employment type (e.g., Private, Self-employed).
Residence Type: Rural or Urban.
Average Glucose Level: Average glucose level in blood.
BMI: Body Mass Index.
Smoking Status: Smoking behavior (e.g., never smoked, smokes).
Stroke (Target): Binary label indicating if the patient had a stroke (1 for yes, 0 for no).
Data Preprocessing
To prepare the dataset for training, the following steps were undertaken:

Handling Missing Values:

Missing values in the BMI column were replaced with the column's mean.
Encoding Categorical Features:

Categorical columns such as gender, ever_married, work_type, Residence_type, and smoking_status were encoded using label encoding.
Scaling Numerical Features:

Numerical columns (age, avg_glucose_level, and bmi) were standardized to have zero mean and unit variance.
Feature-Target Split:

The id column was dropped, and the remaining features were used to create the feature matrix (X) and target vector (y).
Train-Test Split:

The dataset was split into training (80%) and testing (20%) sets to evaluate the model's generalization performance.
Model Architecture
The binary classifier utilized a Transformer architecture designed to capture complex interactions among input features. The architecture included the following components:

Input Embedding:

A linear layer projected input features into a 256-dimensional embedding space.
Transformer Encoder:

The model consisted of 8 Transformer encoder layers, each with:
256-dimensional embeddings.
8 attention heads.
Feedforward networks with ReLU activations.
Residual connections and layer normalization for improved gradient flow.
Global Pooling and Output:

Global average pooling aggregated the sequence of feature embeddings.
A fully connected output layer mapped the pooled representation to a single logit for binary classification.
Loss Function:

Binary Cross-Entropy with Logits Loss (BCEWithLogitsLoss) was used, incorporating class weighting to address the imbalance in the dataset.
Model Training
The model was trained using the following hyperparameters:

Optimizer: Adam optimizer.
Learning Rate: 0.0001 with StepLR scheduling (decayed by 0.1 every 5 epochs).
Batch Size: 32.
Epochs: 20.
Dropout Rate: 0.1 to prevent overfitting.
Model Evaluation
Binary Classification Metrics at Optimal Threshold:
Optimal Threshold: 0.9778
Accuracy: 93.84%
Precision: 30.23%
Recall: 28.26%
F1 Score: 29.21%
AUC-ROC: 85.19%
AUC-PR: 20.66%
Observations
Strengths:

The model achieved high accuracy (93.84%), indicating its ability to correctly classify most samples, especially for the majority class (no stroke).
AUC-ROC (85.19%) suggests strong discriminative ability in distinguishing between stroke and non-stroke cases.
Weaknesses:

Precision (30.23%) and recall (28.26%) remained low, indicating difficulties in identifying actual stroke cases.
The low F1 score (29.21%) reflects an imbalance between precision and recall, particularly for the minority class (stroke).
Insights from AUC-PR:

The AUC-PR score of 20.66% highlights challenges in achieving high precision and recall for positive cases, a common issue in imbalanced datasets.
Conclusion
The Transformer-based binary classifier performed well in terms of overall accuracy and AUC-ROC but struggled with recall and F1 score, indicating difficulty in detecting positive stroke cases. The high accuracy score reflects the model's tendency to favor the majority class, which is typical in imbalanced datasets.

Future Improvements
Class Balancing:

Use advanced oversampling techniques such as SMOTE or under-sampling of the majority class.
Threshold Tuning:

Experiment with alternative thresholds to optimize recall for positive cases.
Alternative Loss Functions:

Replace BCEWithLogitsLoss with focal loss to penalize false negatives more heavily.
Feature Engineering:

Include engineered features such as interaction terms (e.g., age × BMI) to enhance predictive performance.
Hyperparameter Optimization:

Further tune embedding size, the number of encoder layers, and attention heads.
Comparative Analysis:

Evaluate simpler models such as logistic regression or decision trees as baselines to validate the complexity of the Transformer-based approach.
'''

'\nMachine Learning Project Report: Stroke Prediction Model Using Transformer-Based Binary Classifier\nProject Objective\nThe objective of this project is to build a machine learning model to predict the likelihood of a stroke in a given patient based on a set of medical and demographic features. \nWe use a Transformer-based binary classifier, trained on a dataset of patient data, to classify each patient into "stroke" or "no stroke" categories.\n\n\nDataset Description\nThe dataset contains information on patient demographics and health metrics relevant to stroke prediction. \nThe features include:\n\nPatient ID: Unique identifier for each patient.\nGender: Patient gender (e.g., Male, Female).\nAge: Patient age.\nHypertension: Whether the patient has hypertension (1) or not (0).\nHeart Disease: Whether the patient has any heart disease (1) or not (0).\nEver Married: Whether the patient has ever been married.\nWork Type: Employment type (e.g., Private, Self-employed).\nResidence Type: 