In [2]:
pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=7b33867cc20b40af6201c47267c1cc901b59b4ff74ce9297e5186b59e0724649
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468fed5633211fb3b76d1db43fe806a17fb7486a
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from lime.lime_tabular import LimeTabularExplainer
from lime.explanation import Explanation # Used for type hinting

#  A1. Stacking Classifier Implementation
# base_models: a list of base learners (e.g. decision tree, SVM, k-NN)
# final_estimator_choice: the meta-model that learns from the predictions of the base models
def create_stacking_classifier(base_models: list, final_estimator_choice) -> StackingClassifier:
    """Assemble an unfitted stacking ensemble from the supplied learners.

    Args:
        base_models: Passed through as ``estimators``; the caller in this
            notebook supplies a list of ``(name, estimator)`` tuples.
        final_estimator_choice: Meta-model passed through as
            ``final_estimator``; it is trained on the base models' predictions.

    Returns:
        A ``StackingClassifier`` configured with 5-fold cross-validation for
        the base models and ``n_jobs=-1`` (use all available cores).
    """
    ensemble_settings = {
        "estimators": base_models,
        "final_estimator": final_estimator_choice,
        "cv": 5,        # folds used to produce the base-model predictions
        "n_jobs": -1,   # parallelise across every available core
    }
    return StackingClassifier(**ensemble_settings)

# A2. Pipeline Implementation

def create_and_train_pipeline(classifier_model: StackingClassifier, X_train: pd.DataFrame, y_train: np.ndarray) -> Pipeline:
    """Build a scale-then-classify pipeline and fit it on the training data.

    Args:
        classifier_model: Classifier placed as the final pipeline step
            (named ``'stacking_clf'``).
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``Pipeline`` with steps ``'scaler'`` (``StandardScaler``)
        followed by ``'stacking_clf'``.
    """
    standardizer = StandardScaler()
    # Scaling lives inside the pipeline so the same transformation is applied
    # before the classifier at both fit and predict time.
    fitted_pipeline = Pipeline(
        steps=[
            ('scaler', standardizer),
            ('stacking_clf', classifier_model),
        ]
    )
    return fitted_pipeline.fit(X_train, y_train)

#  A3. LIME Explainer Implementation (RECTIFIED: Added top_labels)

def explain_pipeline_prediction_lime(
    trained_pipeline: Pipeline,
    X_train: pd.DataFrame,  # training feature matrix
    X_test_sample: np.ndarray,  # a single sample (row) from the test set
    feature_names: list,  # feature names
    class_names: list,  # names of the classes
    num_features: int = 10,
    random_state: int = 42,
) -> Explanation:
    """Explain one prediction of a fitted pipeline with LIME.

    Args:
        trained_pipeline: Fitted pipeline exposing ``predict_proba``.
        X_train: Training features used by LIME to learn feature statistics.
            A DataFrame or any 2-D array-like is accepted.
        X_test_sample: The single row to explain (flattened to 1-D).
        feature_names: Column names, in the same order as the columns of
            ``X_train``.
        class_names: Class names, indexed by encoded label.
        num_features: Maximum number of features reported per class
            (default 10, the previously hard-coded value).
        random_state: Seed for LIME's sampling (default 42, as before).

    Returns:
        The LIME ``Explanation`` with an entry for every class, because
        ``top_labels`` is set to ``len(class_names)``.
    """
    # np.asarray accepts DataFrames and plain arrays alike, whereas `.values`
    # only exists on pandas objects.
    training_matrix = np.asarray(X_train)
    sample_row = np.asarray(X_test_sample).ravel()

    # Create the LIME tabular explainer (fits a local surrogate model around the sample).
    explainer = LimeTabularExplainer(
        training_data=training_matrix,
        feature_names=feature_names,
        class_names=class_names,
        mode='classification',  # classification problem, not regression
        random_state=random_state,
    )

    # When the pipeline was fitted on a DataFrame, scikit-learn records the
    # column names and warns on every call that receives a bare ndarray.
    # LIME always passes ndarrays, so re-attach the names if they are known.
    fitted_columns = getattr(trained_pipeline, "feature_names_in_", None)

    def predict_fn(batch):
        """Return class probabilities for a batch of perturbed samples."""
        if fitted_columns is not None:
            batch = pd.DataFrame(batch, columns=fitted_columns)
        return trained_pipeline.predict_proba(batch)

    # Generate the explanation for the chosen test sample.
    explanation = explainer.explain_instance(
        data_row=sample_row,
        predict_fn=predict_fn,
        num_features=num_features,
        # Forces LIME to generate explanations for all classes
        top_labels=len(class_names),
    )

    return explanation


if __name__ == '__main__':
    # End-to-end run: load the features, train the stacking pipeline (A1/A2),
    # report test accuracy, then explain one test prediction with LIME (A3).

    # Load the dataset
    FILE_NAME = "/content/features_filtered.csv"

    try:
        df = pd.read_csv(FILE_NAME)
    except FileNotFoundError:
        # FILE_NAME is already an absolute path, so prefixing it with
        # "/content/" again (as before) could never succeed. Fall back to the
        # bare file name in the current working directory instead.
        df = pd.read_csv(FILE_NAME.rsplit("/", 1)[-1])

    # Drop the columns that are not used as classification features
    X = df.drop(columns=['file', 'run', 'onset_s', 'label'])
    y_raw = df['label']  # the target label column

    # Coerce every feature to numeric (invalid values become NaN) and keep
    # only the rows that are fully numeric, for features and labels alike.
    X_cleaned = X.apply(pd.to_numeric, errors='coerce')
    valid_indices = X_cleaned.dropna().index
    X = X_cleaned.loc[valid_indices]
    y_raw = y_raw.loc[valid_indices]

    # Encode the categorical target variable to integers
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_raw)
    class_names = label_encoder.classes_.tolist()

    # Extract the names of all features
    feature_names = X.columns.tolist()

    # Split data; stratify preserves the class distribution in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    print("Data Preparation Complete ")
    print(f"Total Samples (after cleaning): {len(X)}")
    print(f"Number of Features: {X.shape[1]}")
    print(f"Classes: {class_names}\n")

    # Define base models and final estimator (A1)
    base_classifiers = [
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('nb', GaussianNB())
    ]
    # Logistic regression is the meta-model: it learns weights for combining
    # the base predictions. With solver='saga' the multinomial loss is already
    # used for multi-class targets (scikit-learn >= 0.22), so the deprecated
    # `multi_class` argument is omitted. `n_jobs` is omitted as well because it
    # only parallelises the one-vs-rest scheme.
    meta_model = LogisticRegression(solver='saga', max_iter=2000, random_state=42)

    # Build the stacking classifier (A1)
    stacking_model = create_stacking_classifier(base_classifiers, meta_model)

    # Create and train the pipeline (A2)
    full_pipeline = create_and_train_pipeline(stacking_model, X_train, y_train)

    # Evaluate the pipeline on the test data
    y_pred = full_pipeline.predict(X_test)
    pipeline_accuracy = accuracy_score(y_test, y_pred)
    print("Pipeline Training and Evaluation (A2)")
    print(f"Test Set Accuracy: {pipeline_accuracy:.4f}\n")

    # Explain a pipeline prediction using LIME (A3)
    sample_to_explain_idx = 10  # positional index of the test sample to explain
    X_explain_sample = X_test.iloc[sample_to_explain_idx].values  # 1-D feature vector for LIME
    y_true_label = class_names[y_test[sample_to_explain_idx]]

    lime_explanation = explain_pipeline_prediction_lime(
        full_pipeline, X_train, X_explain_sample, feature_names, class_names
    )

    # Predicted class for the chosen sample. A one-row DataFrame slice keeps
    # the column names, so scikit-learn does not warn about missing feature
    # names the way it does for a bare ndarray.
    prediction_proba = full_pipeline.predict_proba(
        X_test.iloc[[sample_to_explain_idx]]
    )[0]
    predicted_class_idx = int(np.argmax(prediction_proba))  # index of the most probable class
    predicted_label = class_names[predicted_class_idx]  # map the index back to the class name
    predicted_probability = prediction_proba[predicted_class_idx]

    print("LIME Explanation for Pipeline Outcome (A3)")
    print(f"Explaining Sample Index: {sample_to_explain_idx}")
    print(f"True Label: {y_true_label}, Predicted Label: {predicted_label}")
    print(f"Predicted Probability for '{predicted_label}': {predicted_probability:.4f}\n")

    idx_to_explain = predicted_class_idx
    label_to_explain = predicted_label

    print(f"Top 10 Features Contributing to Prediction: '{label_to_explain}' (index {idx_to_explain})")

    # LIME returns a list of (feature description, weight) pairs
    try:
        for feature, weight in lime_explanation.as_list(label=idx_to_explain):
            print(f"  {feature}: {weight:+.4f}")
    except KeyError as e:
        print(f"ERROR: Failed to retrieve explanation for index {idx_to_explain} due to LIME internal error: {e}")
        available_labels = lime_explanation.available_labels()
        if available_labels:
            # Fall back to the top available explanation if the predicted one is missing
            fallback_idx = available_labels[0]
            fallback_label = class_names[fallback_idx]
            print(f"Displaying explanation for top available class: '{fallback_label}' (index {fallback_idx})")
            for feature, weight in lime_explanation.as_list(label=fallback_idx):
                print(f"  {feature}: {weight:+.4f}")
        else:
            print("No explanation available for any class.")


Data Preparation Complete 
Total Samples (after cleaning): 39347
Number of Features: 256
Classes: ['both_feet', 'both_fists', 'left', 'rest', 'right']





--- Pipeline Training and Evaluation (A2) ---
Test Set Accuracy: 0.5413





--- LIME Explanation for Pipeline Outcome (A3) ---
Explaining Sample Index: 10
True Label: both_fists, Predicted Label: both_feet
Predicted Probability for 'both_feet': 0.4446

Top 10 Features Contributing to Prediction: 'both_feet' (index 0)
  O2..__theta > 0.00: +0.0090
  0.00 < Fpz.__beta <= 0.00: +0.0074
  Cp1.__delta > 0.00: +0.0072
  Tp7.__theta > 0.00: +0.0067
  Pz..__delta > 0.00: +0.0067
  Fc4.__theta > 0.00: +0.0065
  0.00 < Po4.__beta <= 0.00: +0.0061
  T10.__theta > 0.00: +0.0057
  0.00 < Afz.__alpha <= 0.00: -0.0057
  P3..__delta > 0.00: +0.0048


