In [1]:
import pandas as pd
import numpy as np
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('output/standardized_features.csv')
validation_data = pd.read_csv('output/standardized_features_validation.csv')  # Load the validation data

# Map class labels to numeric values (0, 1, 2, 3)
class_mapping = {
    'Normal': 0,
    'Complex Partial Seizures': 1,
    'Electrographic Seizures': 2,
    'Video detected Seizures with no visual change over EEG': 3
}

# Apply the mapping to the 'class' column and create a new 'label' column for training data
data['label'] = data['class'].map(class_mapping)

# Apply the mapping to the validation data
validation_data['label'] = validation_data['class'].map(class_mapping)

# Check for missing values in 'label' column
print(f"Missing values in 'label' in train data: {data['label'].isnull().sum()}")
print(f"Missing values in 'label' in validation data: {validation_data['label'].isnull().sum()}")

# Drop rows with missing target values in both datasets
data = data.dropna(subset=['label'])
validation_data = validation_data.dropna(subset=['label'])

# Now the target variable is 'label' and the rest are features
X_train = data.drop(columns=['class', 'label'])  # Features (drop the 'class' and 'label' columns)
y_train = data['label']  # Target (the 'label' column)

X_validation = validation_data.drop(columns=['class', 'label'])  # Features for validation
y_validation = validation_data['label']  # Target for validation

# Split the training dataset into training and test sets (optional, if you still want a test set)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_scaled)
X_test_scaled = scaler.transform(X_test_scaled)
X_validation_scaled = scaler.transform(X_validation)  # Standardize validation data

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Step 1: SHAP values to interpret the model
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train_scaled)

# Step 2: Identify top 3 features contributing to each class using SHAP values
class_names = [0, 1, 2, 3]  # Use numeric labels for classes
top_channels_per_class = {}

for i, class_name in enumerate(class_names):
    class_shap_values = shap_values[i]
    class_shap_values_sorted = np.argsort(np.abs(class_shap_values).mean(axis=0))[::-1]  # Sort by average magnitude
    top_channels_per_class[class_name] = class_shap_values_sorted[:3]

    # Print top channels for each class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    print(f"Top 3 Channels for class {class_name}: {top_channels}")

# Step 3: Masking Features and Re-evaluating Model Performance
def mask_features(X, top_channels):
    """ Remove top channels from the features """
    X_masked = X.drop(columns=top_channels)
    return X_masked

# For each class, remove the top 3 features and re-train the model
for class_name in class_names:
    # Mask the top 3 features identified for this class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    X_train_masked = mask_features(X_train_scaled, top_channels)
    X_test_masked = mask_features(X_test_scaled, top_channels)
    X_validation_masked = mask_features(X_validation_scaled, top_channels)

    # Train the model again without the top channels
    xgb_model.fit(X_train_masked, y_train)
    
    # Evaluate the model performance on the test set
    y_pred_test = xgb_model.predict(X_test_masked)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    print(f"Accuracy after masking top 3 features for class '{class_name}' on Test Set: {accuracy_test}")
    
    # Evaluate the model performance on the validation set
    y_pred_validation = xgb_model.predict(X_validation_masked)
    accuracy_validation = accuracy_score(y_validation, y_pred_validation)
    print(f"Accuracy after masking top 3 features for class '{class_name}' on Validation Set: {accuracy_validation}")


  from .autonotebook import tqdm as notebook_tqdm


Missing values in 'label' in train data: 13433
Missing values in 'label' in validation data: 13433


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- channel


In [2]:
# Ensure that the validation data has the same columns as the training data (align columns)
X_validation = X_validation[X_train.columns]  # Align the validation data columns with the training data columns

# Standardize the validation data
X_validation_scaled = scaler.transform(X_validation)  # Now this should match the training data

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Proceed with SHAP, masking, and evaluation...


In [3]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'n_estimators': [100, 200, 300]
}

# Initialize the XGBClassifier
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")


Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best score: 1.0


In [6]:
import pandas as pd
import numpy as np
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the training dataset
train_data = pd.read_csv('output/standardized_features.csv')

# Map class labels to numeric values (0, 1, 2, 3)
class_mapping = {
    'Normal': 0,
    'Complex Partial Seizures': 1,
    'Electrographic Seizures': 2,
    'Video detected Seizures with no visual change over EEG': 3
}

# Apply the mapping to the 'class' column and create a new 'label' column
train_data['label'] = train_data['class'].map(class_mapping)

# Drop rows with missing target values
train_data = train_data.dropna(subset=['label'])

# Now the target variable is 'label' and the rest are features
X_train = train_data.drop(columns=['class', 'label'])  # Features (drop the 'class' and 'label' columns)
y_train = train_data['label']  # Target (the 'label' column)

# Load the validation dataset
validation_data = pd.read_csv('output/extracted_features_validation.csv')

# Apply the same class mapping to the validation set
validation_data['label'] = validation_data['class'].map(class_mapping)

# Drop rows with missing target values in the validation set
validation_data = validation_data.dropna(subset=['label'])

# Features and target in the validation dataset
X_validation = validation_data.drop(columns=['class', 'label'])
y_validation = validation_data['label']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Step 1: SHAP values to interpret the model
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train_scaled)

# Step 2: Identify top 3 features contributing to each class using SHAP values
class_names = [0, 1, 2, 3]  # Use numeric labels for classes
top_channels_per_class = {}

for i, class_name in enumerate(class_names):
    class_shap_values = shap_values[i]
    class_shap_values_sorted = np.argsort(np.abs(class_shap_values).mean(axis=0))[::-1]  # Sort by average magnitude
    top_channels_per_class[class_name] = class_shap_values_sorted[:3]

    # Print top channels for each class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    print(f"Top 3 Channels for class {class_name}: {top_channels}")

# Step 3: Masking Features and Re-evaluating Model Performance
def mask_features(X, top_channels):
    """ Remove top channels from the features """
    X_masked = X.drop(columns=top_channels)
    return X_masked

# For each class, remove the top 3 features and re-train the model
for class_name in class_names:
    # Mask the top 3 features identified for this class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    X_train_masked = mask_features(X_train, top_channels)
    X_validation_masked = mask_features(X_validation, top_channels)

    # Train the model again without the top channels
    xgb_model.fit(X_train_masked, y_train)
    
    # Evaluate the model performance on the validation set
    y_pred = xgb_model.predict(X_validation_masked)
    accuracy = accuracy_score(y_validation, y_pred)
    print(f"Accuracy after masking top 3 features for class '{class_name}' on validation set: {accuracy}")


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- channel
- file


In [7]:
import pandas as pd
import numpy as np
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('output/extracted_features.csv')

# Load the validation dataset
validation_data = pd.read_csv('output/extracted_features_validation.csv')  # Your separate validation data CSV

# Map class labels to numeric values (0, 1, 2, 3)
class_mapping = {
    'Normal': 0,
    'Complex Partial Seizures': 1,
    'Electrographic Seizures': 2,
    'Video detected Seizures with no visual change over EEG': 3
}

# Apply the mapping to the 'class' column and create a new 'label' column for the training data
data['label'] = data['class'].map(class_mapping)

# Apply the mapping to the 'class' column and create a new 'label' column for the validation data
validation_data['label'] = validation_data['class'].map(class_mapping)

# Check for missing values in the 'label' column
print(f"Missing values in 'label' in training data: {data['label'].isnull().sum()}")
print(f"Missing values in 'label' in validation data: {validation_data['label'].isnull().sum()}")

# Drop rows with missing target values in both datasets
data = data.dropna(subset=['label'])
validation_data = validation_data.dropna(subset=['label'])

# Now the target variable is 'label' and the rest are features
X_train = data.drop(columns=['class', 'label'])  # Features for training (drop the 'class' and 'label' columns)
y_train = data['label']  # Target for training (the 'label' column)

X_validation = validation_data.drop(columns=['class', 'label'])  # Features for validation (drop the 'class' and 'label' columns)
y_validation = validation_data['label']  # Target for validation (the 'label' column)

# Drop the 'channel' and 'file' columns from both the training and validation sets
X_train = X_train.drop(columns=['channel', 'file'], errors='ignore')  # Use errors='ignore' to avoid errors if the column is missing
X_validation = X_validation.drop(columns=['channel', 'file'], errors='ignore')

# Ensure both datasets have the same columns (align the features)
X_train, X_validation = X_train.align(X_validation, join='inner', axis=1)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)  # Transform validation set

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Step 1: SHAP values to interpret the model
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train_scaled)

# Step 2: Identify top 3 features contributing to each class using SHAP values
class_names = [0, 1, 2, 3]  # Use numeric labels for classes
top_channels_per_class = {}

for i, class_name in enumerate(class_names):
    class_shap_values = shap_values[i]
    class_shap_values_sorted = np.argsort(np.abs(class_shap_values).mean(axis=0))[::-1]  # Sort by average magnitude
    top_channels_per_class[class_name] = class_shap_values_sorted[:3]

    # Print top channels for each class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    print(f"Top 3 Channels for class {class_name}: {top_channels}")

# Step 3: Masking Features and Re-evaluating Model Performance
def mask_features(X, top_channels):
    """ Remove top channels from the features """
    X_masked = X.drop(columns=top_channels)
    return X_masked

# For each class, remove the top 3 features and re-train the model
for class_name in class_names:
    # Mask the top 3 features identified for this class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    X_train_masked = mask_features(X_train, top_channels)
    X_validation_masked = mask_features(X_validation, top_channels)

    # Train the model again without the top channels
    xgb_model.fit(X_train_masked, y_train)
    
    # Evaluate the model performance on the validation set
    y_validation_pred = xgb_model.predict(X_validation_masked)
    validation_accuracy = accuracy_score(y_validation, y_validation_pred)
    print(f"Validation Accuracy after masking top 3 features for class '{class_name}': {validation_accuracy}")


Missing values in 'label' in training data: 53675
Missing values in 'label' in validation data: 13433
Top 3 Channels for class 0: ['mean_freq' 'fft_std' 'fft_mean']
Top 3 Channels for class 1: ['mean_freq' 'fft_std' 'fft_mean']
Top 3 Channels for class 2: ['mean_freq' 'fft_std' 'fft_mean']
Top 3 Channels for class 3: ['mean_freq' 'fft_std' 'fft_mean']
Validation Accuracy after masking top 3 features for class '0': 1.0
Validation Accuracy after masking top 3 features for class '1': 1.0
Validation Accuracy after masking top 3 features for class '2': 1.0
Validation Accuracy after masking top 3 features for class '3': 1.0


In [8]:
import pandas as pd
import numpy as np
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('output/standardized_features.csv')

# Load the validation dataset
validation_data = pd.read_csv('output/standardized_features_validation.csv')  # Your separate validation data CSV

# Map class labels to numeric values (0, 1, 2, 3)
class_mapping = {
    'Normal': 0,
    'Complex Partial Seizures': 1,
    'Electrographic Seizures': 2,
    'Video detected Seizures with no visual change over EEG': 3
}

# Apply the mapping to the 'class' column and create a new 'label' column for the training data
data['label'] = data['class'].map(class_mapping)

# Apply the mapping to the 'class' column and create a new 'label' column for the validation data
validation_data['label'] = validation_data['class'].map(class_mapping)

# Check for missing values in the 'label' column
print(f"Missing values in 'label' in training data: {data['label'].isnull().sum()}")
print(f"Missing values in 'label' in validation data: {validation_data['label'].isnull().sum()}")

# Drop rows with missing target values in both datasets
data = data.dropna(subset=['label'])
validation_data = validation_data.dropna(subset=['label'])

# Now the target variable is 'label' and the rest are features
X_train = data.drop(columns=['class', 'label'])  # Features for training (drop the 'class' and 'label' columns)
y_train = data['label']  # Target for training (the 'label' column)

X_validation = validation_data.drop(columns=['class', 'label'])  # Features for validation (drop the 'class' and 'label' columns)
y_validation = validation_data['label']  # Target for validation (the 'label' column)

# Drop the 'file' column from both the training and validation sets (keep 'channel')
X_train = X_train.drop(columns=['file'], errors='ignore')  # Use errors='ignore' to avoid errors if the column is missing
X_validation = X_validation.drop(columns=['file'], errors='ignore')  # Use errors='ignore' to avoid errors if the column is missing

# Ensure both datasets have the same columns (align the features)
X_train, X_validation = X_train.align(X_validation, join='inner', axis=1)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)  # Transform validation set

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Step 1: SHAP values to interpret the model
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train_scaled)

# Step 2: Identify top 3 features contributing to each class using SHAP values
class_names = [0, 1, 2, 3]  # Use numeric labels for classes
top_channels_per_class = {}

for i, class_name in enumerate(class_names):
    class_shap_values = shap_values[i]
    class_shap_values_sorted = np.argsort(np.abs(class_shap_values).mean(axis=0))[::-1]  # Sort by average magnitude
    top_channels_per_class[class_name] = class_shap_values_sorted[:3]

    # Print top channels for each class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    print(f"Top 3 Channels for class {class_name}: {top_channels}")

# Step 3: Masking Features and Re-evaluating Model Performance
def mask_features(X, top_channels):
    """ Remove top channels from the features """
    X_masked = X.drop(columns=top_channels)
    return X_masked

# For each class, remove the top 3 features and re-train the model
for class_name in class_names:
    # Mask the top 3 features identified for this class
    top_channels = X_train.columns[top_channels_per_class[class_name]].values
    X_train_masked = mask_features(X_train, top_channels)
    X_validation_masked = mask_features(X_validation, top_channels)

    # Train the model again without the top channels
    xgb_model.fit(X_train_masked, y_train)
    
    # Evaluate the model performance on the validation set
    y_validation_pred = xgb_model.predict(X_validation_masked)
    validation_accuracy = accuracy_score(y_validation, y_validation_pred)
    print(f"Validation Accuracy after masking top 3 features for class '{class_name}': {validation_accuracy}")


Missing values in 'label' in training data: 13433
Missing values in 'label' in validation data: 13433
Top 3 Channels for class 0: ['mean_freq' 'fft_std' 'fft_mean']
Top 3 Channels for class 1: ['mean_freq' 'fft_std' 'fft_mean']
Top 3 Channels for class 2: ['mean_freq' 'fft_std' 'fft_mean']
Top 3 Channels for class 3: ['mean_freq' 'fft_std' 'fft_mean']
Validation Accuracy after masking top 3 features for class '0': 1.0
Validation Accuracy after masking top 3 features for class '1': 1.0
Validation Accuracy after masking top 3 features for class '2': 1.0
Validation Accuracy after masking top 3 features for class '3': 1.0
