In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# ==============================================================
# 1Ô∏è‚É£ Load the dataset
# ==============================================================
df = pd.read_csv("students_adaptability_level_online_education.csv")

print("=== Dataset Overview ===")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
print()



=== Dataset Overview ===
Rows: 1205, Columns: 14



#### Basic Preprocessing

In [4]:
# ==============================================================
# 2Ô∏è‚É£ Check for missing values
# ==============================================================
print("=== Missing Values Check ===")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0] if missing_values.sum() > 0 else "‚úÖ No missing values found.")
print()

# ==============================================================
# 3Ô∏è‚É£ Check for duplicates (do NOT remove them)
# ==============================================================
duplicates_count = df.duplicated().sum()
print(f"=== Duplicate Rows Check ===\nTotal Duplicates Found: {duplicates_count}")
if duplicates_count > 0:
    print("‚ö†Ô∏è Duplicates exist but will NOT be removed (kept for representation).")
print()

# ==============================================================
# 4Ô∏è‚É£ Outlier Detection (for numeric columns only)
# ==============================================================
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if numeric_cols:
    print("=== Outlier Detection (IQR method) ===")
    for col in numeric_cols:
        Q1, Q3 = df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower) | (df[col] > upper)]
        print(f"{col}: {len(outliers)} outliers")
else:
    print("‚úÖ No numeric columns available for outlier detection.")
print()

# ==============================================================
# 5Ô∏è‚É£ Hybrid Encoding: Ordinal + Label Encoding
# ==============================================================

df_encoded = df.copy()

# --- Define ordinal features with meaningful order ---
ordinal_mappings = {
    'Age': ['1-5', '6-10', '11-15', '16-20', '21-25', '26-30'],
    'Education Level': ['School', 'College', 'University'],
    'Load-shedding': ['Low', 'High'],
    'Financial Condition': ['Poor', 'Mid', 'Rich'],
    'Network Type': ['2G', '3G', '4G'],
    'Class Duration': ['0', '1-3', '3-6'],
    'Adaptivity Level': ['Low', 'Moderate', 'High']  # Target variable
}

ordinal_features = list(ordinal_mappings.keys())
nominal_features = [col for col in df.columns if col not in ordinal_features]

# --- Apply Ordinal Encoding ---
ordinal_encoder = OrdinalEncoder(categories=[ordinal_mappings[col] for col in ordinal_features])
df_encoded[ordinal_features] = ordinal_encoder.fit_transform(df_encoded[ordinal_features])

# --- Apply Label Encoding for Nominal Features ---
label_encoders = {}
encoding_mappings = {}

for col in nominal_features:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le
    encoding_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# --- Record Ordinal mappings ---
for col in ordinal_mappings:
    encoding_mappings[col] = {cat: i for i, cat in enumerate(ordinal_mappings[col])}

print("‚úÖ Hybrid Encoding Complete (Ordinal + Label Encoding).\n")

# Show encoding interpretation
print("=== Encoding Mappings for Each Feature ===")
for col, mapping in encoding_mappings.items():
    print(f"\n{col}:")
    for k, v in mapping.items():
        print(f"  {k} ‚Üí {v}")

# ==============================================================
# 6Ô∏è‚É£ Split into Training and Testing Sets
# ==============================================================
X = df_encoded.drop(columns=['Adaptivity Level'])
y = df_encoded['Adaptivity Level']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n‚úÖ Data split into training and test sets.")
print(f"Training set: {X_train.shape[0]} rows")
print(f"Testing set: {X_test.shape[0]} rows\n")

# ==============================================================
# 7Ô∏è‚É£ Check class imbalance (training data only)
# ==============================================================
print("=== Class Distribution (Before SMOTE) ===")
print(y_train.value_counts())


# ==============================================================
# 8Ô∏è‚É£ Apply SMOTE only on training data
# ==============================================================
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("=== Class Distribution (After SMOTE) ===")
print(pd.Series(y_train_resampled).value_counts())


# ==============================================================
# 9Ô∏è‚É£ (No Standardization applied)
# ==============================================================
print("\n‚ÑπÔ∏è Skipping standardization: categorical-encoded values should retain meaning for EDA & model interpretation.")

# ==============================================================
# üîü Final Checks
# ==============================================================
print("\nFinal Shapes:")
print(f"Training features: {X_train_resampled.shape}")
print(f"Testing features:  {X_test.shape}")

print("\n‚úÖ Preprocessing completed successfully with Hybrid Encoding.")


=== Missing Values Check ===
‚úÖ No missing values found.

=== Duplicate Rows Check ===
Total Duplicates Found: 949
‚ö†Ô∏è Duplicates exist but will NOT be removed (kept for representation).

‚úÖ No numeric columns available for outlier detection.

‚úÖ Hybrid Encoding Complete (Ordinal + Label Encoding).

=== Encoding Mappings for Each Feature ===

Gender:
  Boy ‚Üí 0
  Girl ‚Üí 1

Institution Type:
  Government ‚Üí 0
  Non Government ‚Üí 1

IT Student:
  No ‚Üí 0
  Yes ‚Üí 1

Location:
  No ‚Üí 0
  Yes ‚Üí 1

Internet Type:
  Mobile Data ‚Üí 0
  Wifi ‚Üí 1

Self Lms:
  No ‚Üí 0
  Yes ‚Üí 1

Device:
  Computer ‚Üí 0
  Mobile ‚Üí 1
  Tab ‚Üí 2

Age:
  1-5 ‚Üí 0
  6-10 ‚Üí 1
  11-15 ‚Üí 2
  16-20 ‚Üí 3
  21-25 ‚Üí 4
  26-30 ‚Üí 5

Education Level:
  School ‚Üí 0
  College ‚Üí 1
  University ‚Üí 2

Load-shedding:
  Low ‚Üí 0
  High ‚Üí 1

Financial Condition:
  Poor ‚Üí 0
  Mid ‚Üí 1
  Rich ‚Üí 2

Network Type:
  2G ‚Üí 0
  3G ‚Üí 1
  4G ‚Üí 2

Class Duration:
  0 ‚Üí 0
  1-3 ‚Üí 1
  3-6 

#### Advanced Analysis

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix

# Function to build and evaluate a model
def build_and_evaluate_nn(model_name, model):
    print(f"\n===== {model_name} =====")
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(X_train_resampled, y_train_resampled, epochs=50, batch_size=16, verbose=0, validation_split=0.2)

    # Evaluation
    y_pred = model.predict(X_test)
    y_pred_classes = y_pred.argmax(axis=1)
    print(classification_report(y_test, y_pred_classes))
    return history

# 1Ô∏è‚É£ Simple MLP
model1 = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_resampled.shape[1],)),
    Dense(3, activation='softmax')
])
build_and_evaluate_nn("Simple MLP", model1)

# 2Ô∏è‚É£ Deep MLP
model2 = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_resampled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])
build_and_evaluate_nn("Deep MLP", model2)

# 3Ô∏è‚É£ Regularized MLP (Dropout + L2)
from tensorflow.keras import regularizers
model3 = Sequential([
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001), input_shape=(X_train_resampled.shape[1],)),
    Dropout(0.4),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    Dropout(0.3),
    Dense(3, activation='softmax')
])
build_and_evaluate_nn("Regularized MLP", model3)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



===== Simple MLP =====
[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 9ms/step 
              precision    recall  f1-score   support

         0.0       0.73      0.77      0.75        96
         1.0       0.76      0.72      0.74       125
         2.0       0.52      0.55      0.54        20

    accuracy                           0.73       241
   macro avg       0.67      0.68      0.67       241
weighted avg       0.73      0.73      0.73       241


===== Deep MLP =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 8ms/step 
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.93        96
         1.0       0.96      0.87      0.91       125
         2.0       0.67      1.00      0.80        20

    accuracy                           0.91       241
   macro avg       0.85      0.94      0.88       241
weighted avg       0.92      0.91      0.91       241


===== Regularized MLP =====


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 8ms/step 
              precision    recall  f1-score   support

         0.0       0.75      0.92      0.82        96
         1.0       0.89      0.70      0.78       125
         2.0       0.64      0.80      0.71        20

    accuracy                           0.79       241
   macro avg       0.76      0.80      0.77       241
weighted avg       0.81      0.79      0.79       241



<keras.src.callbacks.history.History at 0x7920381c55b0>

##### Support Vector Classifier

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear SVM
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train_resampled, y_train_resampled)
y_pred_linear = svm_linear.predict(X_test)
print("\n===== Linear SVM =====")
print(classification_report(y_test, y_pred_linear))

# RBF SVM
svm_rbf = SVC(kernel='rbf', gamma='scale')
svm_rbf.fit(X_train_resampled, y_train_resampled)
y_pred_rbf = svm_rbf.predict(X_test)
print("\n===== RBF SVM =====")
print(classification_report(y_test, y_pred_rbf))



===== Linear SVM =====
              precision    recall  f1-score   support

         0.0       0.64      0.78      0.70        96
         1.0       0.75      0.44      0.56       125
         2.0       0.20      0.50      0.29        20

    accuracy                           0.58       241
   macro avg       0.53      0.57      0.51       241
weighted avg       0.66      0.58      0.59       241


===== RBF SVM =====
              precision    recall  f1-score   support

         0.0       0.69      0.80      0.74        96
         1.0       0.82      0.58      0.68       125
         2.0       0.36      0.75      0.48        20

    accuracy                           0.68       241
   macro avg       0.62      0.71      0.63       241
weighted avg       0.73      0.68      0.69       241



Improve the Existing SVMs

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 'scale', 'auto'],
    'kernel': ['rbf']
}

grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5, verbose=1)
grid.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)

# Evaluate on test set
best_svm = grid.best_estimator_
y_pred_best = best_svm.predict(X_test)

from sklearn.metrics import classification_report
print("\n===== Tuned RBF SVM =====")
print(classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Best CV Accuracy: 0.9033333333333333

===== Tuned RBF SVM =====
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.93        96
         1.0       0.94      0.87      0.90       125
         2.0       0.64      0.90      0.75        20

    accuracy                           0.90       241
   macro avg       0.84      0.90      0.86       241
weighted avg       0.91      0.90      0.90       241



Try Different SVM Variants

In [None]:
#Polynomial Kernel SVM

svm_poly = SVC(kernel='poly', degree=3, C=1)
svm_poly.fit(X_train_resampled, y_train_resampled)
y_pred_poly = svm_poly.predict(X_test)

print("\n===== Polynomial SVM =====")
print(classification_report(y_test, y_pred_poly))



===== Polynomial SVM =====
              precision    recall  f1-score   support

         0.0       0.71      0.85      0.77        96
         1.0       0.83      0.64      0.72       125
         2.0       0.45      0.65      0.53        20

    accuracy                           0.73       241
   macro avg       0.66      0.71      0.68       241
weighted avg       0.75      0.73      0.73       241



In [None]:
#Sigmoid Kernel SVM

svm_sigmoid = SVC(kernel='sigmoid', C=1)
svm_sigmoid.fit(X_train_resampled, y_train_resampled)
y_pred_sigmoid = svm_sigmoid.predict(X_test)

print("\n===== Sigmoid SVM =====")
print(classification_report(y_test, y_pred_sigmoid))



===== Sigmoid SVM =====
              precision    recall  f1-score   support

         0.0       0.36      0.47      0.41        96
         1.0       0.71      0.04      0.08       125
         2.0       0.06      0.35      0.11        20

    accuracy                           0.24       241
   macro avg       0.38      0.29      0.20       241
weighted avg       0.52      0.24      0.21       241



In [None]:
#LinearSVC (Faster Linear Version)

from sklearn.svm import LinearSVC

linear_svm_fast = LinearSVC(C=1, max_iter=10000)
linear_svm_fast.fit(X_train_resampled, y_train_resampled)
y_pred_fast = linear_svm_fast.predict(X_test)

print("\n===== LinearSVC (fast) =====")
print(classification_report(y_test, y_pred_fast))



===== LinearSVC (fast) =====
              precision    recall  f1-score   support

         0.0       0.65      0.76      0.70        96
         1.0       0.70      0.43      0.53       125
         2.0       0.19      0.50      0.28        20

    accuracy                           0.57       241
   macro avg       0.52      0.56      0.50       241
weighted avg       0.64      0.57      0.58       241

