Synthetic Minority Over-sampling Technique "SMOTE" for Original Data


1. SMOTE

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
# requires: pip install imbalanced-learn
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline

# Load the routing dataset from the notebook's working directory.
df = pd.read_csv('Final_NetRouteData.csv')

# Pre-processing
# Remove the route identifier and the "Unnamed" columns. Only names that are
# actually present in this CSV are passed to drop(), so a missing one is skipped.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
present_cols = [name for name in cols_to_drop if name in df.columns]
df_clean = df.drop(columns=present_cols)

# One-hot encode the categorical Algorithm_Used column (first level dropped).
df_encoded = pd.get_dummies(df_clean, columns=['Algorithm_Used'], drop_first=True)

# Target is the 'Optimal' column; every remaining column is a feature.
y = df_encoded['Optimal']
X = df_encoded.drop(columns='Optimal')

# Hold out 20% for testing, stratified on the target.
# The split happens BEFORE resampling so the test set contains only real rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# imblearn Pipeline: the SMOTE step resamples during fit() only, then the
# random forest is trained on the resampled training data.
model_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Train the SMOTE + Random Forest pipeline.
model_pipeline.fit(X_train, y_train)

# Predict hard labels and the probability from the second predict_proba column.
y_pred = model_pipeline.predict(X_test)
y_prob = model_pipeline.predict_proba(X_test)[:, 1]

# Evaluation metrics on the untouched test split.
print("--- SMOTE + Random Forest Performance ---")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
auc_roc = roc_auc_score(y_test, y_prob)
print(f"AUC-ROC: {auc_roc:.4f}")
print("\nDetailed Classification Report:")
report = classification_report(y_test, y_pred)
print(report)

--- SMOTE + Random Forest Performance ---
Accuracy: 1.0000
AUC-ROC: 1.0000

Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00       100

    accuracy                           1.00       198
   macro avg       1.00      1.00      1.00       198
weighted avg       1.00      1.00      1.00       198



2. Adaptive Synthetic Sampling "ADASYN"

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.over_sampling import ADASYN 
from imblearn.pipeline import Pipeline

# Load the same source file used by the other resampling experiments.
df = pd.read_csv('Final_NetRouteData.csv') # Use the ORIGINAL imbalanced file

# Drop the route identifier and the "Unnamed" columns; names that are not
# present in this CSV are skipped instead of raising KeyError.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
df_clean = df.drop(columns=[c for c in cols_to_drop if c in df.columns])
# One-hot encode the categorical Algorithm_Used column (first level dropped).
df_encoded = pd.get_dummies(df_clean, columns=['Algorithm_Used'], drop_first=True)

# Target is the 'Optimal' column; every remaining column is a feature.
X = df_encoded.drop('Optimal', axis=1)
y = df_encoded['Optimal']

# Train-Test Split (80/20, stratified on the target, done before any resampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create the ADASYN Pipeline with modified parameters
# We reduce n_neighbors to 3 to make it easier for the algorithm to find valid groups
model_pipeline = Pipeline([
    ('adasyn', ADASYN(random_state=42, n_neighbors=3, sampling_strategy='minority')),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Only the fit step is guarded: that is where ADASYN raises ValueError when it has
# no samples to generate. Prediction and scoring live in the `else` branch so that a
# ValueError raised there surfaces as itself instead of being reported as an ADASYN error.
try:
    model_pipeline.fit(X_train, y_train)
except ValueError as e:
    print(f"ADASYN Error: {e}")
    print("Suggestion: If your data is already balanced, skip ADASYN and run the classifier directly.")
else:
    # Predict hard labels and the probability from the second predict_proba column.
    y_pred = model_pipeline.predict(X_test)
    y_prob = model_pipeline.predict_proba(X_test)[:, 1]

    print("--- ADASYN Performance ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

ADASYN Error: No samples will be generated with the provided ratio settings.
Suggestion: If your data is already balanced, skip ADASYN and run the classifier directly.


3. SMOTE + Edited Nearest Neighbors "SMOTE-ENN"

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
# requires: pip install imbalanced-learn
from imblearn.combine import SMOTEENN 
from imblearn.pipeline import Pipeline

# Load the routing dataset from the notebook's working directory.
df = pd.read_csv('Final_NetRouteData.csv')

# Drop the route identifier and the "Unnamed" columns with the same guarded list
# used in the SMOTE and ADASYN cells, so every technique is trained on the same
# feature set. Names that are absent from this CSV are skipped instead of raising KeyError.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
df_prep = df.drop(columns=[c for c in cols_to_drop if c in df.columns])
# One-hot encode the categorical Algorithm_Used column (first level dropped).
df_encoded = pd.get_dummies(df_prep, columns=['Algorithm_Used'], drop_first=True)

# Define Features and Target
X = df_encoded.drop('Optimal', axis=1)
y = df_encoded['Optimal']

# Split data into Training and Testing sets (80/20 split, stratified on the target).
# Splitting happens before resampling so the test set contains only real rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create the SMOTE-ENN Pipeline
# This applies over-sampling (SMOTE) then under-sampling (ENN) to clean the data
model_pipeline = Pipeline([
    ('smote_enn', SMOTEENN(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the Model
model_pipeline.fit(X_train, y_train)

# Predictions: hard labels plus the probability from the second predict_proba column
y_pred = model_pipeline.predict(X_test)
y_prob = model_pipeline.predict_proba(X_test)[:, 1]

# Evaluation metrics on the untouched test split
print("--- SMOTE-ENN + Random Forest Performance ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")
print("\nDetailed Classification Report (includes Precision, Recall, F1, Support):")
print(classification_report(y_test, y_pred))

--- SMOTE-ENN + Random Forest Performance ---
Accuracy: 1.0000
AUC-ROC: 1.0000

Detailed Classification Report (includes Precision, Recall, F1, Support):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00       100

    accuracy                           1.00       198
   macro avg       1.00      1.00      1.00       198
weighted avg       1.00      1.00      1.00       198



4. SMOTE-Tomek "Hybrid Technique"

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
# requires: pip install imbalanced-learn
from imblearn.combine import SMOTETomek 
from imblearn.pipeline import Pipeline

# Load the routing dataset from the notebook's working directory.
df = pd.read_csv('Final_NetRouteData.csv')

# Drop the route identifier and the "Unnamed" columns with the same guarded list
# used in the SMOTE and ADASYN cells, so every technique is trained on the same
# feature set. Names that are absent from this CSV are skipped instead of raising KeyError.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
df_prep = df.drop(columns=[c for c in cols_to_drop if c in df.columns])
# One-hot encode the categorical Algorithm_Used column (first level dropped).
df_encoded = pd.get_dummies(df_prep, columns=['Algorithm_Used'], drop_first=True)

# Define Features and Target
X = df_encoded.drop('Optimal', axis=1)
y = df_encoded['Optimal']

# Split data into Training and Testing sets (80/20 split, stratified on the target).
# Splitting happens before resampling so the test set contains only real rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create the SMOTE-Tomek Pipeline
# This applies SMOTE then removes Tomek Links to sharpen the boundary
model_pipeline = Pipeline([
    ('smote_tomek', SMOTETomek(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the Model
model_pipeline.fit(X_train, y_train)

# Predictions: hard labels plus the probability from the second predict_proba column
y_pred = model_pipeline.predict(X_test)
y_prob = model_pipeline.predict_proba(X_test)[:, 1]

# Evaluation metrics on the untouched test split
print("--- SMOTE-Tomek + Random Forest Performance ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

--- SMOTE-Tomek + Random Forest Performance ---
Accuracy: 1.0000
AUC-ROC: 1.0000

Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00       100

    accuracy                           1.00       198
   macro avg       1.00      1.00      1.00       198
weighted avg       1.00      1.00      1.00       198

