<a href="https://colab.research.google.com/github/Irenempax/Irenempax/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Set Up Google Colab Environment
from google.colab import drive
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

# Mount Google Drive
drive.mount('/content/drive')

# Step 2: Load the S2.pkl File
pkl_file_path = '/content/drive/My Drive/wesad/WESAD/WESAD/S2/S2.pkl'  # Update the path if needed
with open(pkl_file_path, 'rb') as f:
    data = pickle.load(f, encoding='latin1')  # Update to specify encoding

# Step 3: Check Available Signals
print("Available signal keys:", data['signal'].keys())  # Should show 'chest' and 'wrist'

# Check what is inside the 'chest' key
chest_signals = data['signal']['chest']
print("Chest signal structure:", chest_signals.keys())  # Print keys to see available data

# Step 4: Extract ECG Data from Chest Signal
try:
    ecg_data = chest_signals['ECG'][:, 0]  # Access ECG data from the structure
except KeyError as e:
    print(f"Error extracting ECG data: {e}")
    ecg_data = None  # Set ecg_data to None in case of error

# Check if ecg_data was successfully extracted
if ecg_data is None:
    print("Unable to extract ECG data. Check the dataset structure.")
else:
    # Step 5: Define Functions to Calculate HR and RMSSD
    def calculate_hr(ecg_data, sampling_rate=1000):
        # Detect peaks in the ECG data
        peaks, _ = find_peaks(ecg_data, distance=sampling_rate/2.0)  # Minimum distance between peaks
        hr_values = np.zeros(len(peaks) - 1)  # HR values for each interval

        for i in range(1, len(peaks)):
            # Calculate the interval between peaks in seconds
            interval = (peaks[i] - peaks[i-1]) / sampling_rate
            hr_values[i-1] = 60 / interval  # Convert to beats per minute (BPM)

        return hr_values  # Return array of HR values

    def calculate_rmssd(hr_values):
        # Calculate RMSSD from HR values
        if len(hr_values) < 2:
            return np.nan  # Not enough values to calculate RMSSD
        diff = np.diff(hr_values)
        rmssd = np.sqrt(np.mean(diff**2))
        return rmssd

    # Step 6: Calculate HR and RMSSD Features
    hr_values = calculate_hr(ecg_data)  # Calculate HR values from ECG
    rmssd_value = calculate_rmssd(hr_values)  # Calculate RMSSD from HR values

    # Debugging: Check the output of hr_values and rmssd
    print("HR Values:", hr_values)
    print("RMSSD Value:", rmssd_value)

    # Step 7: Prepare Features and Labels
    # Create a DataFrame with features (HR and RMSSD)
    features = pd.DataFrame({
        'HR': hr_values,
        'RMSSD': [rmssd_value] * len(hr_values)  # Repeat RMSSD value for all HR values
    })

    # Assuming you have labels in the data; replace this with your actual label extraction
    labels = data['label']  # Adjust this if necessary
    labels_df = pd.DataFrame(labels, columns=['Label'])

    # Ensure the lengths of features and labels match
    if len(features) != len(labels_df):
        print(f"Length mismatch: Features ({len(features)}) vs Labels ({len(labels_df)})")
    else:
        # Combine features and labels
        combined_df = pd.concat([features, labels_df], axis=1)

        # Step 8: Split the Data
        X = combined_df[['HR', 'RMSSD']]
        y = combined_df['Label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Check the class distribution in training and test sets
        print("Class distribution in training set:")
        print(pd.Series(y_train).value_counts())
        print("Class distribution in test set:")
        print(pd.Series(y_test).value_counts())

        # Step 9: Train Machine Learning Model
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        # Step 10: Make Predictions and Evaluate the Model
        y_pred = model.predict(X_test)
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

        # Step 11: Visualize Results
        confusion_mtx = confusion_matrix(y_test, y_pred)

        plt.figure(figsize=(8, 6))
        sns.heatmap(confusion_mtx, annot=True, fmt='d', cmap='Blues')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.title('Confusion Matrix')
        plt.show()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Available signal keys: dict_keys(['chest', 'wrist'])
Chest signal structure: dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])
HR Values: [111.11111111 117.87819253 111.73184358 ...  82.30452675  82.75862069
  85.47008547]
RMSSD Value: 13.213930003007187
Length mismatch: Features (6920) vs Labels (4255300)


In [None]:
# Step 2: Load the S2.pkl File
pkl_file_path = '/content/drive/My Drive/wesad/WESAD/WESAD/S2/S2.pkl'  # Update the path if needed
with open(pkl_file_path, 'rb') as f:
    data = pickle.load(f, encoding='latin1')

# Check the structure and shape of labels
labels = data['label']  # Adjust this if necessary
print("Labels shape:", labels.shape)

# Check unique labels to understand the class distribution
unique_labels, counts = np.unique(labels, return_counts=True)
label_distribution = dict(zip(unique_labels, counts))
print("Label distribution:", label_distribution)


Labels shape: (4255300,)
Label distribution: {0: 2142701, 1: 800800, 2: 430500, 3: 253400, 4: 537599, 6: 45500, 7: 44800}


In [None]:
import pandas as pd
import numpy as np
import pickle
from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler

# Step 1: Load the S2.pkl File
pkl_file_path = '/content/drive/My Drive/wesad/WESAD/WESAD/S2/S2.pkl'  # Update the path if needed
with open(pkl_file_path, 'rb') as f:
    data = pickle.load(f, encoding='latin1')

# Step 2: Extract the ECG signal and labels
ecg_signal = data['signal']['chest']['ECG']
labels = data['label']

# Step 3: Filter labels to focus on 1, 2, and 3
filtered_indices = np.isin(labels, [1, 2, 3])
filtered_ecg = ecg_signal[filtered_indices]
filtered_labels = labels[filtered_indices]

# Ensure the ECG signal is a 1-D array
filtered_ecg = filtered_ecg.flatten()

# Check label distribution
unique, counts = np.unique(filtered_labels, return_counts=True)
label_distribution = dict(zip(unique, counts))
print("Filtered label distribution:", label_distribution)

# Check if we have enough samples
if len(label_distribution) < 2:
    raise ValueError("Insufficient classes available in the filtered data.")

# Step 4: Calculate Heart Rate (HR)
def calculate_hr(ecg_signal, sampling_rate=128):
    # Find R-peaks in the ECG signal
    peaks, _ = find_peaks(ecg_signal, distance=sampling_rate * 0.6)  # Minimum 0.6 seconds between peaks
    rr_intervals = np.diff(peaks) / sampling_rate  # Convert sample indices to seconds
    hr_values = 60 / rr_intervals  # Convert RR intervals to HR (bpm)
    return hr_values

# Step 5: Calculate RMSSD
def calculate_rmssd(hr_values):
    return np.sqrt(np.mean(np.square(np.diff(hr_values))))

# Step 6: Extract HR and RMSSD features
hr_values = calculate_hr(filtered_ecg)
rmssd_value = calculate_rmssd(hr_values)

# Create a DataFrame for features
features_df = pd.DataFrame({'HR': hr_values, 'RMSSD': [rmssd_value] * len(hr_values)})

# Ensure the labels match the length of features_df
labels_df = filtered_labels[:len(features_df)]

# Check label distribution before oversampling
unique, counts = np.unique(labels_df, return_counts=True)
label_distribution_before = dict(zip(unique, counts))
print("Label distribution before oversampling:", label_distribution_before)

# Step 7: Apply Random Oversampling if we have at least 2 classes
if len(unique) > 1:
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(features_df, labels_df)

    # Check the distribution after oversampling
    unique, counts = np.unique(y_resampled, return_counts=True)
    print("Resampled label distribution:", dict(zip(unique, counts)))

    # Step 8: Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

    # Step 9: Train Machine Learning Model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Step 10: Make Predictions
    y_pred = model.predict(X_test)

    # Step 11: Calculate Accuracy and Confusion Matrix
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    # Step 12: Display Results
    print(f'Accuracy: {accuracy:.2f}')
    print('Confusion Matrix:')
    print(confusion)

    # Step 13: Optional - Visualize the confusion matrix
    plt.figure(figsize=(8, 6))
    plt.imshow(confusion, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(3)  # Adjust according to the number of classes you have
    plt.xticks(tick_marks, ['1', '2', '3'])
    plt.yticks(tick_marks, ['1', '2', '3'])

    # Loop over data dimensions and create text annotations
    thresh = confusion.max() / 2.
    for i, j in np.ndindex(confusion.shape):
        plt.text(j, i, format(confusion[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if confusion[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()
else:
    print("Insufficient diversity in labels for training.")


Filtered label distribution: {1: 800800, 2: 430500, 3: 253400}
Label distribution before oversampling: {1: 14863}
Insufficient diversity in labels for training.
