In [None]:
!pip install kaggle

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d shayanfazeli/heartbeat

!unzip -q heartbeat.zip


Dataset URL: https://www.kaggle.com/datasets/shayanfazeli/heartbeat
License(s): unknown
Downloading heartbeat.zip to /content
  0% 0.00/98.8M [00:00<?, ?B/s]
100% 98.8M/98.8M [00:00<00:00, 1.76GB/s]


In [None]:
import pandas as pd
import numpy as np
# Both CSV files are read without a header row, so columns are numbered 0..N.
train, test = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('mitbih_train.csv', 'mitbih_test.csv')
)

In [None]:
import os
# Show what is in the current working directory after extraction.
print(os.listdir('.'))

['.config', 'mitbih_train.csv', 'kaggle.json', 'ptbdb_normal.csv', 'mitbih_test.csv', 'ptbdb_abnormal.csv', 'heartbeat.zip', 'sample_data']


In [None]:
# The last column is treated as the class label, so it is left out of the
# feature count.
n_train_rows, n_train_cols = train.shape
label_counts = train.iloc[:, -1].value_counts()

print(f"Training samples: {n_train_rows}, Features: {n_train_cols - 1}")
print(f"Test samples: {test.shape[0]}")
print(f"Classes distribution:\n{label_counts}")

Training samples: 87554, Features: 187
Test samples: 21892
Classes distribution:
187
0.0    72471
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: count, dtype: int64


In [None]:
# Human-readable label for each numeric class id found in the last column.
class_names = {
    0: 'N (Normal)',
    1: 'S (AFib-like)',
    2: 'V (PVC-like)',
    3: 'F (Fusion)',
    4: 'Q (Unknown)',
}

# Count training rows per class and report them in ascending class-id order.
counts_by_class = train.iloc[:, -1].value_counts().sort_index()

print("Training samples distribution:")
for class_id, n_rows in counts_by_class.items():
    print(f"{class_names[class_id]}: {n_rows}")

Training samples distribution:
N (Normal): 72471
S (AFib-like): 2223
V (PVC-like): 5788
F (Fusion): 641
Q (Unknown): 6431


In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum(axis=0)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
183,0
184,0
185,0
186,0


In [None]:
# Number of training rows that exactly repeat an earlier row.
train.duplicated(keep='first').sum()

np.int64(0)

In [None]:
import numpy as np
from scipy import stats
import pandas as pd

def extract_features(signal):
    """Summarise one 1-D signal as a list of 15 hand-crafted features.

    The features are returned in this fixed order:

    0. mean
    1. standard deviation
    2. maximum
    3. minimum
    4. peak-to-peak range (max - min)
    5. median
    6. 25th percentile
    7. 75th percentile
    8. skewness (``scipy.stats.skew`` defaults)
    9. kurtosis (``scipy.stats.kurtosis`` defaults)
    10. variance
    11. root-mean-square
    12. sum of absolute first differences
    13. sum of FFT magnitudes over the first half of the spectrum
    14. index of the largest FFT magnitude in that half

    Args:
        signal: 1-D array-like of numeric samples (ndarray, list, ...).

    Returns:
        list: the 15 feature values, in the order listed above.
    """
    # Accept any numeric sequence; `signal**2` below needs an ndarray.
    signal = np.asarray(signal, dtype=float)

    features = []
    features.append(np.mean(signal))
    features.append(np.std(signal))
    features.append(np.max(signal))
    features.append(np.min(signal))
    features.append(np.ptp(signal))  # max - min
    features.append(np.median(signal))
    features.append(np.percentile(signal, 25))
    features.append(np.percentile(signal, 75))
    features.append(stats.skew(signal))
    features.append(stats.kurtosis(signal))

    features.append(np.var(signal))
    features.append(np.sqrt(np.mean(signal**2)))
    features.append(np.sum(np.abs(np.diff(signal))))

    # Only the first half of the magnitude spectrum is kept.
    fft_vals = np.abs(np.fft.fft(signal))
    fft_vals = fft_vals[:len(fft_vals)//2]
    features.append(np.sum(fft_vals))
    features.append(np.argmax(fft_vals))

    return features

# Pull the signal columns (everything except the last, label column) out of
# each frame once, instead of indexing the DataFrame row by row inside the
# comprehension; the rows handed to extract_features are the same values.
train_signals = train.iloc[:, :-1].values
test_signals = test.iloc[:, :-1].values

X_train = np.array([extract_features(beat) for beat in train_signals])
y_train = train.iloc[:, -1].values

X_test = np.array([extract_features(beat) for beat in test_signals])
y_test = test.iloc[:, -1].values

print(f"Train Featuers: {X_train.shape}")
print(f"Test Featuers:  {X_test.shape}")

Train Featuers: (87554, 15)
Test Featuers:  (21892, 15)


In [None]:
# Disabled experiment, kept for reference: oversample the training features
# with SMOTE, fit a RandomForestClassifier on the resampled data, then print
# one classification report for the test split and one for the resampled
# training split. Every line below is commented out, so running this cell
# does nothing; the output saved under the cell comes from an earlier run.
# The Arabic label in the print call below means "After SMOTE:".

# !pip install imblearn

# from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report

# smote = SMOTE(random_state=42, k_neighbors=3)
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# print("بعد SMOTE:")
# print(pd.Series(y_train_smote).value_counts())

# rf_smote = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42, n_jobs=-1)
# rf_smote.fit(X_train_smote, y_train_smote)

# y_pred_smote = rf_smote.predict(X_test)
# y_pred_train_smote = rf_smote.predict(X_train_smote)
# print(classification_report(y_test, y_pred_smote, target_names=['N', 'S', 'V', 'F', 'Q']))
# print(classification_report(y_train_smote, y_pred_train_smote , target_names=['N', 'S', 'V', 'F', 'Q']))

بعد SMOTE:
0.0    72471
1.0    72471
2.0    72471
3.0    72471
4.0    72471
Name: count, dtype: int64
              precision    recall  f1-score   support

           N       0.98      0.92      0.95     18118
           S       0.39      0.65      0.49       556
           V       0.69      0.86      0.76      1448
           F       0.34      0.75      0.47       162
           Q       0.87      0.90      0.88      1608

    accuracy                           0.91     21892
   macro avg       0.65      0.82      0.71     21892
weighted avg       0.93      0.91      0.92     21892

              precision    recall  f1-score   support

           N       1.00      0.97      0.98     72471
           S       0.99      1.00      0.99     72471
           V       0.99      0.99      0.99     72471
           F       0.99      1.00      0.99     72471
           Q       1.00      1.00      1.00     72471

    accuracy                           0.99    362355
   macro avg       0.99      

In [None]:
# Disabled experiment, kept for reference: XGBoost trained on the hand-crafted
# features with per-sample weights derived from 'balanced' class weights.
# Every line below is commented out, so running this cell does nothing; the
# output saved under the cell comes from an earlier run.
# NOTE(review): that saved output reports `use_label_encoder` as an unused
# parameter, and this cell does not import `classification_report` itself --
# it has to be in scope from another cell before this code is re-enabled.

# import xgboost as xgb
# from sklearn.utils.class_weight import compute_class_weight

# classes = np.unique(y_train)
# weights = compute_class_weight('balanced', classes=classes, y=y_train)
# class_weight_dict = dict(zip(classes, weights))


# xgb_model = xgb.XGBClassifier(
#     n_estimators=300,
#     max_depth=10,
#     learning_rate=0.1,
#     random_state=42,
#     use_label_encoder=False,
#     eval_metric='mlogloss'
# )
# xgb_model.fit(X_train, y_train, sample_weight=[class_weight_dict[y] for y in y_train])
# y_pred_xgb = xgb_model.predict(X_test)
# y_pred_train_xgb = xgb_model.predict(X_train)
# print(classification_report(y_test, y_pred_xgb, target_names=['N', 'S', 'V', 'F', 'Q']))
# print(classification_report(y_train, y_pred_train_xgb, target_names=['N', 'S', 'V', 'F', 'Q']))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           N       0.97      0.96      0.96     18118
           S       0.57      0.60      0.59       556
           V       0.79      0.83      0.81      1448
           F       0.50      0.68      0.58       162
           Q       0.87      0.90      0.89      1608

    accuracy                           0.94     21892
   macro avg       0.74      0.79      0.76     21892
weighted avg       0.94      0.94      0.94     21892

              precision    recall  f1-score   support

           N       1.00      0.98      0.99     72471
           S       0.86      1.00      0.93      2223
           V       0.93      1.00      0.96      5788
           F       0.81      1.00      0.90       641
           Q       0.95      1.00      0.97      6431

    accuracy                           0.99     87554
   macro avg       0.91      1.00      0.95     87554
weighted avg       0.99      0.99      0.99     87554



In [None]:
!pip install imbalanced-learn
from imblearn.ensemble import BalancedRandomForestClassifier
import numpy as np
from sklearn.metrics import classification_report

# Fit a BalancedRandomForestClassifier on the hand-crafted features.
brf = BalancedRandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)
brf.fit(X_train, y_train)

# Predictions for the test split and for the training split itself.
y_pred_brf = brf.predict(X_test)
y_pred_train_brf = brf.predict(X_train)

# Test-split report first, then the training-split report.
report_labels = ['N', 'S', 'V', 'F', 'Q']
for y_true, y_hat in ((y_test, y_pred_brf), (y_train, y_pred_train_brf)):
    print(classification_report(y_true, y_hat, target_names=report_labels))

              precision    recall  f1-score   support

           N       0.98      0.78      0.87     18118
           S       0.20      0.71      0.32       556
           V       0.48      0.86      0.61      1448
           F       0.14      0.89      0.25       162
           Q       0.69      0.89      0.78      1608

    accuracy                           0.79     21892
   macro avg       0.50      0.82      0.56     21892
weighted avg       0.90      0.79      0.83     21892

              precision    recall  f1-score   support

           N       0.99      0.79      0.88     72471
           S       0.27      0.91      0.41      2223
           V       0.50      0.90      0.64      5788
           F       0.17      1.00      0.29       641
           Q       0.70      0.91      0.80      6431

    accuracy                           0.81     87554
   macro avg       0.53      0.90      0.60     87554
weighted avg       0.91      0.81      0.84     87554



In [None]:
import joblib

# Serialise the fitted forest to disk under a single, reusable file name.
brf_model_path = 'balanced_rf_ecg.pkl'
joblib.dump(brf, brf_model_path)
print("Balanced RF saved")

# Hand the saved file to the browser via the Colab download helper.
from google.colab import files
files.download(brf_model_path)

Balanced RF saved


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Column names for the 15 values produced per signal, in extraction order.
feature_names = [
    'mean', 'std', 'max', 'min', 'ptp',
    'median', 'p25', 'p75', 'skew', 'kurtosis',
    'var', 'rms', 'total_var', 'fft_sum', 'fft_peak',
]

# Numeric class id -> display label used for grouping.
label_lookup = {
    0: 'N (Normal)',
    1: 'S (AFib-like)',
    2: 'V (PVC-like)',
    3: 'F (Fusion)',
    4: 'Q (Unknown)',
}

df_test = pd.DataFrame(X_test, columns=feature_names)
df_test['true_class'] = y_test
df_test['class_name'] = df_test['true_class'].map(label_lookup)

# Group the feature columns once and reuse the grouping for both statistics.
features_by_class = df_test.groupby('class_name')[feature_names]

print("\nMean")
class_means = features_by_class.mean()
print(class_means.round(4))

print("\nstd")
class_stds = features_by_class.std()
print(class_stds.round(4))

In [None]:
# `plt` is not imported by any visible earlier cell, so import it here to keep
# this cell runnable on a fresh kernel.
import matplotlib.pyplot as plt

# Features to visualise: one subplot per feature, one box per class.
important = ['mean', 'std', 'skew', 'kurtosis', 'rms', 'fft_peak']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, feat in enumerate(important):
    for cls in range(5):
        # Values of this feature for the test rows whose true class is `cls`.
        data = df_test[df_test['true_class'] == cls][feat]
        # Each class is drawn separately so its box gets its own colour (C0..C4).
        axes[i].boxplot(data, positions=[cls], widths=0.6, patch_artist=True,
                        boxprops=dict(facecolor=f'C{cls}'))
    axes[i].set_title(f'Feature: {feat}')
    axes[i].set_xlabel('Class')
    axes[i].set_xticks(range(5))
    axes[i].set_xticklabels(['N', 'S', 'V', 'F', 'Q'])

plt.tight_layout()
plt.show()

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report


# Build the padded arrays only when they are not already defined, so re-running
# the cell does not repeat the CSV load and padding.
if 'X_train_pad' not in locals():
    def pad_to_length(signal, target_len=200):
        """Return `signal` zero-padded on the right, or cut, to `target_len` samples."""
        shortfall = target_len - len(signal)
        if shortfall > 0:
            return np.pad(signal, (0, shortfall), 'constant')
        return signal[:target_len]

    train = pd.read_csv('mitbih_train.csv', header=None)
    test = pd.read_csv('mitbih_test.csv', header=None)

    # Last column is the label; everything before it is the raw signal.
    X_train_raw, y_train = train.iloc[:, :-1].values, train.iloc[:, -1].values
    X_test_raw, y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values

    X_train_pad = np.array([pad_to_length(row, 200) for row in X_train_raw])
    X_test_pad = np.array([pad_to_length(row, 200) for row in X_test_raw])
    print("Data padded.")

#  DataLoaders
def _make_dataset(signals, labels):
    """Pair float32 signal tensors with int64 label tensors in a TensorDataset."""
    signal_tensor = torch.tensor(signals, dtype=torch.float32)
    label_tensor = torch.tensor(labels, dtype=torch.long)
    return TensorDataset(signal_tensor, label_tensor)


train_dataset = _make_dataset(X_train_pad, y_train)
test_dataset = _make_dataset(X_test_pad, y_test)

# Only the training batches are shuffled; test order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

class ECGClassifier(nn.Module):
    """Backbone model followed by a single linear classification layer.

    The backbone is called with the input and a `sampling_rate` keyword; its
    `last_hidden_state` is averaged over dimension 1 and the pooled vector is
    passed through an `nn.Linear` that produces `num_classes` scores.
    """

    def __init__(self, base_model, num_classes=5):
        """Store the backbone and size the linear head from `base_model.config.hidden_size`."""
        super().__init__()
        hidden_dim = base_model.config.hidden_size
        # Attribute names `base` / `classifier` determine the state_dict keys.
        self.base = base_model
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, sampling_rate=125):
        """Return class scores for `x`.

        NOTE(review): assumes the backbone accepts a `sampling_rate` keyword and
        that dimension 1 of `last_hidden_state` is the sequence axis -- confirm
        against the backbone's implementation.
        """
        encoded = self.base(x, sampling_rate=sampling_rate)
        pooled = torch.mean(encoded.last_hidden_state, dim=1)
        return self.classifier(pooled)

# Prefer the GPU when one is visible to PyTorch.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")

# NOTE(review): trust_remote_code=True lets the model repository supply its own
# Python code for the model class -- only keep it for sources you trust.
model_name = "Edoardo-BS/hubert-ecg-small"
base_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model = ECGClassifier(base_model, num_classes=5).to(device)

# 'balanced' class weights computed from the training labels are passed to the
# cross-entropy loss as per-class weights.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

epochs = 15
train_losses = []      # mean training loss per epoch
val_accuracies = []    # accuracy (%) on test_loader after each epoch

for epoch in range(epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    total_loss = 0
    for signals, labels in train_loader:
        signals = signals.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(signals, sampling_rate=125)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # ---- accuracy on test_loader, gradients disabled ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for signals, labels in test_loader:
            signals = signals.to(device)
            labels = labels.to(device)
            logits = model(signals, sampling_rate=125)
            predicted = torch.max(logits, 1).indices
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    val_accuracies.append(accuracy)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Test Acc: {accuracy:.2f}%")

# Collect predictions and true labels for every test batch, gradients disabled.
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for signals, labels in test_loader:
        logits = model(signals.to(device), sampling_rate=125)
        batch_preds = torch.max(logits, 1).indices
        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(labels.numpy())

target_names = [
    'N (Normal)',
    'S (AFib-like)',
    'V (PVC-like)',
    'F (Fusion)',
    'Q (Unknown)',
]
print("\n=== Classification Report (Pretrained Model) ===")
print(classification_report(all_labels, all_preds, target_names=target_names))

# Persist only the weights (state_dict), not the full module object.
torch.save(model.state_dict(), 'hubert_ecg_finetuned.pth')
print("Model saved as hubert_ecg_finetuned.pth")