In [4]:
import pandas as pd

data = pd.read_excel("mpnet_embeddings.xlsx")
low = data[data["label"] == 1]
med = data[data["label"] == 2]
high = data[data["label"] == 3]

print(len(low))
print(len(med))
print(len(high))

452
1131
1728


In [5]:
!pip install -U imbalanced-learn





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Drop non-numeric columns (like 'Student' and 'Teacher')
data=data.dropna(subset=['label'] + data.columns[:768].tolist())
X = data

# Convert column names to strings
X.columns = X.columns.astype(str)

# Target column
y = data['label']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create DataFrame from resampled data
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=['Label'])

# Combine into one DataFrame
balanced_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)

# Save to Excel
balanced_df.to_excel("Upsampled_Mpnet.xlsx", index=False)
print("Upsampled Mpnet.")

Upsampled Mpnet.


In [7]:
import pandas as pd

data = pd.read_excel("Upsampled_Mpnet.xlsx")
low = data[data["label"] == 1]
med = data[data["label"] == 2]
high = data[data["label"] == 3]

print(len(low))
print(len(med))
print(len(high))

1728
1728
1728


In [8]:
pip install lime

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np

# Optional: XGBoost
try:
    from xgboost import XGBClassifier
    xgb_enabled = True
except ImportError:
    xgb_enabled = False

# Load data
data_df = pd.read_excel("Upsampled_Mpnet.xlsx")
target_col = "label"

# Detect if column names are strings or integers
first_col = data_df.columns[0]
if isinstance(first_col, int):
    embedding_columns = list(range(768))  # integer names
else:
    embedding_columns = [str(i) for i in range(768)]  # string names

X_features = data_df[embedding_columns].values
y_labels = data_df[target_col]
le = LabelEncoder()
y_encoded_labels = le.fit_transform(y_labels)

# NCA transformation
nca_model = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
X_nca_transformed = nca_model.fit_transform(X_features, y_encoded_labels)

# Print top contributors
top_features_to_show = 10
nca_components = nca_model.components_

for comp_idx in range(2):
    weights = nca_components[comp_idx]
    top_indices = np.argsort(np.abs(weights))[::-1][:top_features_to_show]
    print(f"\nTop {top_features_to_show} features for NCA Component {comp_idx + 1}:")
    for i in top_indices:
        print(f"Feature {i}: Weight = {weights[i]:.4f}")

# Standardize
scaler_nca = StandardScaler()
X_scaled_nca = scaler_nca.fit_transform(X_nca_transformed)

# Train/Test split
X_tr, X_te, y_tr, y_te = train_test_split(X_scaled_nca, y_encoded_labels, test_size=0.2, random_state=42)

# Param grids
param_grid_dict = {
    "LogisticRegression": {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear']},
    "SVM": {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    "RandomForest": {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 5]},
    "NaiveBayes": {},
    "DecisionTree": {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 5]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "MLP": {'hidden_layer_sizes': [(50,), (100,), (100, 50)], 'activation': ['relu', 'tanh'], 'learning_rate': ['constant', 'adaptive']},
    "KNN": {'n_neighbors': list(range(1, 21)), 'weights': ['uniform', 'distance']}
}

if xgb_enabled:
    param_grid_dict["XGBoost"] = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3]}

# Define models
model_dict = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "MLP": MLPClassifier(max_iter=500, random_state=42),
    "KNN": KNeighborsClassifier()
}

if xgb_enabled:
    model_dict["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Tune models
tuned_models = {}
for model_name, model in model_dict.items():
    print(f"🔧 Tuning {model_name}...")
    if param_grid_dict[model_name]:
        search = GridSearchCV(model, param_grid_dict[model_name], cv=5, scoring='accuracy', n_jobs=-1)
        search.fit(X_tr, y_tr)
        tuned_models[model_name] = search.best_estimator_
        print(f"Best parameters: {search.best_params_}")
    else:
        model.fit(X_tr, y_tr)
        tuned_models[model_name] = model
        print("No hyperparameters to tune.")

# Add stacking
stacking_estimators = [(name.lower(), est) for name, est in tuned_models.items()]
stack_model = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5
)

tuned_models["StackingEnsemble"] = stack_model

# Evaluate
print("\nModel Evaluation:")
for name, model in tuned_models.items():
    model.fit(X_tr, y_tr)
    y_pred_test = model.predict(X_te)
    acc_train = accuracy_score(y_tr, model.predict(X_tr))
    acc_test = accuracy_score(y_te, y_pred_test)

    print(f"\n{name}")
    print(f"Train Accuracy: {acc_train:.4f}")
    print(f"Test Accuracy:  {acc_test:.4f}")
    print("Classification Report:")
    print(classification_report(y_te, y_pred_test, target_names=[str(cls) for cls in le.classes_]))

    print("-" * 60)

# LIME Setup
from lime.lime_tabular import LimeTabularExplainer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Prepare instance
_, X_test_orig, _, y_test_orig = train_test_split(X_features, y_encoded_labels, test_size=0.2, random_state=42)
sample_instance = X_test_orig[0]

lime_explainer = LimeTabularExplainer(
    training_data=X_features,
    feature_names=[f"f_{i}" for i in range(X_features.shape[1])],
    class_names=list(le.classes_),
    mode='classification',
    random_state=42,
    discretize_continuous=False
)

print("\n LIME Explanation (1 sample):")
for name, model in tuned_models.items():
    if name == "StackingEnsemble":
        print(f"{name}: Skipping LIME (not supported directly)")
        continue

    pipeline = make_pipeline(
        StandardScaler(),
        NeighborhoodComponentsAnalysis(n_components=2, random_state=42),
        model
    )
    pipeline.fit(X_features, y_encoded_labels)

    prediction = pipeline.predict([sample_instance])[0]
    pred_class = le.inverse_transform([prediction])[0]
    print(f"{name} ➜ Predicted class: {pred_class}")

    try:
        exp = lime_explainer.explain_instance(
            data_row=sample_instance,
            predict_fn=pipeline.predict_proba,
            num_features=10,
            num_samples=5000
        )
        exp.save_to_file(f"lime_{name}_explanation.html")
        fig = exp.as_pyplot_figure()
        plt.tight_layout()
        fig.savefig(f"lime_{name}_explanation.png")
        plt.close(fig)
    except Exception as ex:
        print(f"Failed LIME for {name}: {ex}")
    print("-" * 60)



Top 10 features for NCA Component 1:
Feature 34: Weight = -18.5722
Feature 749: Weight = 16.2427
Feature 266: Weight = 14.4001
Feature 633: Weight = -14.1081
Feature 425: Weight = -12.8418
Feature 349: Weight = 12.7638
Feature 13: Weight = 12.5606
Feature 122: Weight = 12.2354
Feature 110: Weight = -12.1251
Feature 471: Weight = -12.0630

Top 10 features for NCA Component 2:
Feature 555: Weight = -18.8237
Feature 301: Weight = -16.0748
Feature 597: Weight = 13.4197
Feature 363: Weight = 12.7880
Feature 264: Weight = -12.7670
Feature 753: Weight = 12.5523
Feature 11: Weight = 12.4286
Feature 746: Weight = 11.1177
Feature 86: Weight = 11.0924
Feature 182: Weight = 11.0040
🔧 Tuning LogisticRegression...
Best parameters: {'C': 0.01, 'solver': 'lbfgs'}
🔧 Tuning SVM...
Best parameters: {'C': 10, 'kernel': 'rbf'}
🔧 Tuning RandomForest...
Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
🔧 Tuning NaiveBayes...
No hyperparameters to tune.
🔧 



Best parameters: {'learning_rate': 0.1, 'n_estimators': 200}
🔧 Tuning MLP...
Best parameters: {'activation': 'tanh', 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant'}
🔧 Tuning KNN...
Best parameters: {'n_neighbors': 10, 'weights': 'uniform'}
🔧 Tuning XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters: {'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 50}

Model Evaluation:

LogisticRegression
Train Accuracy: 0.7890
Test Accuracy:  0.7724
Classification Report:
              precision    recall  f1-score   support

           1       0.80      0.97      0.88       331
           2       0.76      0.66      0.71       344
           3       0.75      0.69      0.72       362

    accuracy                           0.77      1037
   macro avg       0.77      0.78      0.77      1037
weighted avg       0.77      0.77      0.77      1037

------------------------------------------------------------

SVM
Train Accuracy: 0.8611
Test Accuracy:  0.8631
Classification Report:
              precision    recall  f1-score   support

           1       0.93      0.97      0.95       331
           2       0.81      0.82      0.82       344
           3       0.85      0.80      0.82       362

    accuracy                           0.86      1037
   macro avg       0.86     




AdaBoost
Train Accuracy: 0.8283
Test Accuracy:  0.8226
Classification Report:
              precision    recall  f1-score   support

           1       0.80      0.97      0.88       331
           2       0.84      0.71      0.77       344
           3       0.84      0.80      0.82       362

    accuracy                           0.82      1037
   macro avg       0.82      0.82      0.82      1037
weighted avg       0.83      0.82      0.82      1037

------------------------------------------------------------

MLP
Train Accuracy: 0.8724
Test Accuracy:  0.8660
Classification Report:
              precision    recall  f1-score   support

           1       0.96      0.97      0.96       331
           2       0.82      0.81      0.81       344
           3       0.82      0.83      0.83       362

    accuracy                           0.87      1037
   macro avg       0.87      0.87      0.87      1037
weighted avg       0.87      0.87      0.87      1037

------------------------

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost
Train Accuracy: 0.8994
Test Accuracy:  0.8727
Classification Report:
              precision    recall  f1-score   support

           1       0.94      0.96      0.95       331
           2       0.83      0.84      0.83       344
           3       0.85      0.82      0.83       362

    accuracy                           0.87      1037
   macro avg       0.87      0.87      0.87      1037
weighted avg       0.87      0.87      0.87      1037

------------------------------------------------------------


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



StackingEnsemble
Train Accuracy: 0.8953
Test Accuracy:  0.8804
Classification Report:
              precision    recall  f1-score   support

           1       0.96      0.97      0.96       331
           2       0.84      0.85      0.84       344
           3       0.85      0.83      0.84       362

    accuracy                           0.88      1037
   macro avg       0.88      0.88      0.88      1037
weighted avg       0.88      0.88      0.88      1037

------------------------------------------------------------

 LIME Explanation (1 sample):
LogisticRegression ➜ Predicted class: 1
------------------------------------------------------------
SVM ➜ Predicted class: 2
------------------------------------------------------------
RandomForest ➜ Predicted class: 2
------------------------------------------------------------
NaiveBayes ➜ Predicted class: 2
------------------------------------------------------------
DecisionTree ➜ Predicted class: 2
-------------------------------



AdaBoost ➜ Predicted class: 1
------------------------------------------------------------
MLP ➜ Predicted class: 2
------------------------------------------------------------
KNN ➜ Predicted class: 2
------------------------------------------------------------


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost ➜ Predicted class: 2
------------------------------------------------------------
StackingEnsemble: Skipping LIME (not supported directly)
