In [1]:
import pandas as pd

data = pd.read_csv("embedded_dataset_roberta.csv")
low = data[data["label"] == 1]
med = data[data["label"] == 2]
high = data[data["label"] == 3]

print(len(low))
print(len(med))
print(len(high))

452
1131
1728


In [3]:
!pip install -U imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Drop non-numeric columns (like 'Student' and 'Teacher')
data=data.dropna(subset=['label'] + data.columns[:768].tolist())
X = data

# Convert column names to strings
X.columns = X.columns.astype(str)

# Target column
y = data['label']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create DataFrame from resampled data
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=['Label'])

# Combine into one DataFrame
balanced_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)

# Save to Excel
balanced_df.to_excel("Upsampled_roberta.xlsx", index=False)
print("Upsampled Roberta.")

Upsampled Roberta.


In [6]:
import pandas as pd

data = pd.read_excel("Upsampled_roberta.xlsx")
low = data[data["label"] == 1]
med = data[data["label"] == 2]
high = data[data["label"] == 3]

print(len(low))
print(len(med))
print(len(high))

1728
1728
1728


In [7]:
pip install lime

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np

# Optional: XGBoost
try:
    from xgboost import XGBClassifier
    xgb_enabled = True
except ImportError:
    xgb_enabled = False

# Load data
data_df = pd.read_excel("Upsampled_roberta.xlsx")
target_col = "label"

X_features = data_df.iloc[:, :768].values
y_labels = data_df[target_col]
le = LabelEncoder()
y_encoded_labels = le.fit_transform(y_labels)

# NCA transformation
nca_model = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
X_nca_transformed = nca_model.fit_transform(X_features, y_encoded_labels)

# Print top contributors
top_features_to_show = 10
nca_components = nca_model.components_

for comp_idx in range(2):
    weights = nca_components[comp_idx]
    top_indices = np.argsort(np.abs(weights))[::-1][:top_features_to_show]
    print(f"\nTop {top_features_to_show} features for NCA Component {comp_idx + 1}:")
    for i in top_indices:
        print(f"Feature {i}: Weight = {weights[i]:.4f}")

# Standardize
scaler_nca = StandardScaler()
X_scaled_nca = scaler_nca.fit_transform(X_nca_transformed)

# Train/Test split
X_tr, X_te, y_tr, y_te = train_test_split(X_scaled_nca, y_encoded_labels, test_size=0.2, random_state=42)

# Param grids
param_grid_dict = {
    "LogisticRegression": {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['lbfgs', 'liblinear']},
    "SVM": {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    "RandomForest": {'n_estimators': [50, 100, 200], 'max_depth': [None, 5, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 5]},
    "NaiveBayes": {},
    "DecisionTree": {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 5]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "MLP": {'hidden_layer_sizes': [(50,), (100,), (100, 50)], 'activation': ['relu', 'tanh'], 'learning_rate': ['constant', 'adaptive']},
    "KNN": {'n_neighbors': list(range(1, 21)), 'weights': ['uniform', 'distance']}
}

if xgb_enabled:
    param_grid_dict["XGBoost"] = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3]}

# Define models
model_dict = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "NaiveBayes": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "MLP": MLPClassifier(max_iter=500, random_state=42),
    "KNN": KNeighborsClassifier()
}

if xgb_enabled:
    model_dict["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Tune models
tuned_models = {}
for model_name, model in model_dict.items():
    print(f"🔧 Tuning {model_name}...")
    if param_grid_dict[model_name]:
        search = GridSearchCV(model, param_grid_dict[model_name], cv=5, scoring='accuracy', n_jobs=-1)
        search.fit(X_tr, y_tr)
        tuned_models[model_name] = search.best_estimator_
        print(f"Best parameters: {search.best_params_}")
    else:
        model.fit(X_tr, y_tr)
        tuned_models[model_name] = model
        print("No hyperparameters to tune.")

# Add stacking
stacking_estimators = [(name.lower(), est) for name, est in tuned_models.items()]
stack_model = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5
)

tuned_models["StackingEnsemble"] = stack_model

# Evaluate
print("\nModel Evaluation:")
for name, model in tuned_models.items():
    model.fit(X_tr, y_tr)
    y_pred_test = model.predict(X_te)
    acc_train = accuracy_score(y_tr, model.predict(X_tr))
    acc_test = accuracy_score(y_te, y_pred_test)

    print(f"\n{name}")
    print(f"Train Accuracy: {acc_train:.4f}")
    print(f"Test Accuracy:  {acc_test:.4f}")
    print("Classification Report:")
    print(classification_report(y_te, y_pred_test, target_names=[str(cls) for cls in le.classes_]))

    print("-" * 60)

# LIME Setup
from lime.lime_tabular import LimeTabularExplainer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Prepare instance
_, X_test_orig, _, y_test_orig = train_test_split(X_features, y_encoded_labels, test_size=0.2, random_state=42)
sample_instance = X_test_orig[0]

lime_explainer = LimeTabularExplainer(
    training_data=X_features,
    feature_names=[f"f_{i}" for i in range(X_features.shape[1])],
    class_names=list(le.classes_),
    mode='classification',
    random_state=42,
    discretize_continuous=False
)

print("\n LIME Explanation (1 sample):")
for name, model in tuned_models.items():
    if name == "StackingEnsemble":
        print(f"{name}: Skipping LIME (not supported directly)")
        continue

    pipeline = make_pipeline(
        StandardScaler(),
        NeighborhoodComponentsAnalysis(n_components=2, random_state=42),
        model
    )
    pipeline.fit(X_features, y_encoded_labels)

    prediction = pipeline.predict([sample_instance])[0]
    pred_class = le.inverse_transform([prediction])[0]
    print(f"{name} ➜ Predicted class: {pred_class}")

    try:
        exp = lime_explainer.explain_instance(
            data_row=sample_instance,
            predict_fn=pipeline.predict_proba,
            num_features=10,
            num_samples=5000
        )
        exp.save_to_file(f"lime_{name}_explanation.html")
        fig = exp.as_pyplot_figure()
        plt.tight_layout()
        fig.savefig(f"lime_{name}_explanation.png")
        plt.close(fig)
    except Exception as ex:
        print(f"Failed LIME for {name}: {ex}")
    print("-" * 60)


Exception ignored in: <function ZipFile.__del__ at 0x0000021D531A4F40>
Traceback (most recent call last):
  File "c:\Users\vinee\AppData\Local\Programs\Python\Python312\Lib\zipfile\__init__.py", line 1940, in __del__
    self.close()
  File "c:\Users\vinee\AppData\Local\Programs\Python\Python312\Lib\zipfile\__init__.py", line 1957, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file



Top 10 features for NCA Component 1:
Feature 494: Weight = -967.9686
Feature 219: Weight = 56.3814
Feature 172: Weight = -56.3231
Feature 393: Weight = 55.8800
Feature 6: Weight = 45.1251
Feature 761: Weight = 43.9169
Feature 349: Weight = 42.3161
Feature 217: Weight = -41.8228
Feature 719: Weight = -41.7443
Feature 187: Weight = 41.3312

Top 10 features for NCA Component 2:
Feature 494: Weight = 327.1384
Feature 219: Weight = -81.2126
Feature 390: Weight = -49.3394
Feature 484: Weight = 45.0037
Feature 407: Weight = 38.0800
Feature 749: Weight = 37.2259
Feature 168: Weight = -36.4121
Feature 335: Weight = 35.6499
Feature 519: Weight = -34.7903
Feature 346: Weight = 34.7473
🔧 Tuning LogisticRegression...
Best parameters: {'C': 0.1, 'solver': 'lbfgs'}
🔧 Tuning SVM...
Best parameters: {'C': 10, 'kernel': 'rbf'}
🔧 Tuning RandomForest...
Best parameters: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}
🔧 Tuning NaiveBayes...
No hyperparameters to tune.



Best parameters: {'learning_rate': 0.1, 'n_estimators': 200}
🔧 Tuning MLP...
Best parameters: {'activation': 'tanh', 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant'}
🔧 Tuning KNN...
Best parameters: {'n_neighbors': 17, 'weights': 'uniform'}
🔧 Tuning XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}

Model Evaluation:

LogisticRegression
Train Accuracy: 0.8143
Test Accuracy:  0.8014
Classification Report:
              precision    recall  f1-score   support

           1       0.82      0.95      0.88       331
           2       0.79      0.75      0.77       344
           3       0.79      0.72      0.75       362

    accuracy                           0.80      1037
   macro avg       0.80      0.80      0.80      1037
weighted avg       0.80      0.80      0.80      1037

------------------------------------------------------------

SVM
Train Accuracy: 0.8377
Test Accuracy:  0.8168
Classification Report:
              precision    recall  f1-score   support

           1       0.87      0.94      0.91       331
           2       0.77      0.79      0.78       344
           3       0.81      0.73      0.77       362

    accuracy                           0.82      1037
   macro avg       0.82   




AdaBoost
Train Accuracy: 0.8150
Test Accuracy:  0.7956
Classification Report:
              precision    recall  f1-score   support

           1       0.83      0.95      0.88       331
           2       0.77      0.76      0.76       344
           3       0.78      0.69      0.73       362

    accuracy                           0.80      1037
   macro avg       0.79      0.80      0.79      1037
weighted avg       0.79      0.80      0.79      1037

------------------------------------------------------------

MLP
Train Accuracy: 0.8399
Test Accuracy:  0.8274
Classification Report:
              precision    recall  f1-score   support

           1       0.89      0.95      0.92       331
           2       0.79      0.79      0.79       344
           3       0.81      0.75      0.78       362

    accuracy                           0.83      1037
   macro avg       0.83      0.83      0.83      1037
weighted avg       0.83      0.83      0.83      1037

------------------------

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost
Train Accuracy: 0.8469
Test Accuracy:  0.8158
Classification Report:
              precision    recall  f1-score   support

           1       0.88      0.94      0.91       331
           2       0.77      0.78      0.77       344
           3       0.80      0.74      0.77       362

    accuracy                           0.82      1037
   macro avg       0.82      0.82      0.82      1037
weighted avg       0.81      0.82      0.81      1037

------------------------------------------------------------


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



StackingEnsemble
Train Accuracy: 0.8449
Test Accuracy:  0.8255
Classification Report:
              precision    recall  f1-score   support

           1       0.89      0.94      0.91       331
           2       0.78      0.80      0.79       344
           3       0.81      0.75      0.78       362

    accuracy                           0.83      1037
   macro avg       0.83      0.83      0.83      1037
weighted avg       0.82      0.83      0.82      1037

------------------------------------------------------------

 LIME Explanation (1 sample):
LogisticRegression ➜ Predicted class: 2
------------------------------------------------------------
SVM ➜ Predicted class: 2
------------------------------------------------------------
RandomForest ➜ Predicted class: 2
------------------------------------------------------------
NaiveBayes ➜ Predicted class: 2
------------------------------------------------------------
DecisionTree ➜ Predicted class: 2
-------------------------------



AdaBoost ➜ Predicted class: 2
------------------------------------------------------------
MLP ➜ Predicted class: 2
------------------------------------------------------------
KNN ➜ Predicted class: 2
------------------------------------------------------------


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost ➜ Predicted class: 2
------------------------------------------------------------
StackingEnsemble: Skipping LIME (not supported directly)
