Synthetic Minority Over-sampling Technique "SMOTE" for Original Data


1. SMOTE

In [7]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Load the raw routing dataset from the working directory.
df = pd.read_csv('NetRouteData.csv')

# Remove the route identifier and the 'Unnamed: N' columns so they are not
# treated as features.
# NOTE(review): the 'Unnamed: N' columns look like empty trailing CSV columns — confirm.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
df_clean = df.drop(columns=cols_to_drop)

# One-hot encode the categorical 'Algorithm_Used' column; drop_first=True
# omits the indicator column of the first category.
df_encoded = pd.get_dummies(df_clean, columns=['Algorithm_Used'], drop_first=True)

# Separate the features (X) from the 'Optimal' target (y).
X = df_encoded.drop('Optimal', axis=1)
y = df_encoded['Optimal']

# .to_dict() converts the counts to plain Python ints, so the printed dict is
# not cluttered with NumPy scalar reprs such as np.int64(490).
print(f"Original class distribution: {y.value_counts().to_dict()}")

# Oversample with SMOTE; the fixed random_state keeps the run repeatable.
# NOTE(review): the whole dataset is resampled here (train_test_split is
# imported but no split happens in this cell). If the saved file is split
# later, synthetic rows may land in the test set — confirm downstream usage.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print(f"Resampled class distribution: {pd.Series(y_res).value_counts().to_dict()}")

# Put features and target back into a single frame and write it to disk.
df_smote = pd.concat(
    [pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name='Optimal')],
    axis=1,
)
df_smote.to_csv('NetRouteData_SMOTE_Balanced.csv', index=False)

Original class distribution: {0: np.int64(490), 1: np.int64(10)}
Resampled class distribution: {1: np.int64(490), 0: np.int64(490)}


2. Adaptive Synthetic Sampling "ADASYN"

In [8]:
import pandas as pd
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split

# Load the raw routing dataset from the working directory.
df = pd.read_csv('NetRouteData.csv')

# Remove the route identifier and the 'Unnamed: N' columns so they are not
# treated as features.
# NOTE(review): the 'Unnamed: N' columns look like empty trailing CSV columns — confirm.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
df_clean = df.drop(columns=cols_to_drop)

# One-hot encode the categorical 'Algorithm_Used' column; drop_first=True
# omits the indicator column of the first category.
df_encoded = pd.get_dummies(df_clean, columns=['Algorithm_Used'], drop_first=True)

# Separate the features (X) from the 'Optimal' target (y).
X = df_encoded.drop('Optimal', axis=1)
y = df_encoded['Optimal']

# .to_dict() converts the counts to plain Python ints, so the printed dict is
# not cluttered with NumPy scalar reprs such as np.int64(490).
print(f"Original class distribution: {y.value_counts().to_dict()}")

# Oversample with ADASYN; the fixed random_state keeps the run repeatable.
# NOTE(review): the whole dataset is resampled here (train_test_split is
# imported but no split happens in this cell). If the saved file is split
# later, synthetic rows may land in the test set — confirm downstream usage.
adasyn = ADASYN(random_state=42)
X_res, y_res = adasyn.fit_resample(X, y)

print(f"Resampled class distribution (ADASYN): {pd.Series(y_res).value_counts().to_dict()}")

# Put features and target back into a single frame and write it to disk.
df_adasyn = pd.concat(
    [pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name='Optimal')],
    axis=1,
)
df_adasyn.to_csv('NetRouteData_ADASYN_Balanced.csv', index=False)

print("ADASYN balanced dataset saved as 'NetRouteData_ADASYN_Balanced.csv'")

Original class distribution: {0: np.int64(490), 1: np.int64(10)}
Resampled class distribution (ADASYN): {1: np.int64(490), 0: np.int64(490)}
ADASYN balanced dataset saved as 'NetRouteData_ADASYN_Balanced.csv'


3. SMOTE + Edited Nearest Neighbors "SMOTE-ENN"

In [9]:
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split

# Load the raw routing dataset from the working directory.
df = pd.read_csv('NetRouteData.csv')

# Remove the route identifier and the 'Unnamed: N' columns so they are not
# treated as features.
# NOTE(review): the 'Unnamed: N' columns look like empty trailing CSV columns — confirm.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
df_clean = df.drop(columns=cols_to_drop)

# One-hot encode the categorical 'Algorithm_Used' column; drop_first=True
# omits the indicator column of the first category.
df_encoded = pd.get_dummies(df_clean, columns=['Algorithm_Used'], drop_first=True)

# Separate the features (X) from the 'Optimal' target (y).
X = df_encoded.drop('Optimal', axis=1)
y = df_encoded['Optimal']

# .to_dict() converts the counts to plain Python ints, so the printed dict is
# not cluttered with NumPy scalar reprs such as np.int64(490).
print(f"Original class distribution: {y.value_counts().to_dict()}")

# Resample with the SMOTE-ENN hybrid; the fixed random_state keeps the run
# repeatable. The class counts after this step need not be equal (the recorded
# run printed 480 vs 373), so "balanced" in the messages below is approximate.
# NOTE(review): the whole dataset is resampled here (train_test_split is
# imported but no split happens in this cell). If the saved file is split
# later, synthetic rows may land in the test set — confirm downstream usage.
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X, y)

print(f"Resampled class distribution (SMOTE-ENN): {pd.Series(y_res).value_counts().to_dict()}")

# Put features and target back into a single frame and write it to disk.
df_smote_enn = pd.concat(
    [pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name='Optimal')],
    axis=1,
)
df_smote_enn.to_csv('NetRouteData_SMOTEENN_Balanced.csv', index=False)

print("SMOTE-ENN balanced dataset saved as 'NetRouteData_SMOTEENN_Balanced.csv'")

Original class distribution: {0: np.int64(490), 1: np.int64(10)}
Resampled class distribution (SMOTE-ENN): {1: np.int64(480), 0: np.int64(373)}
SMOTE-ENN balanced dataset saved as 'NetRouteData_SMOTEENN_Balanced.csv'


4. SMOTE-Tomek "Hybrid Technique"

In [10]:
import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split

# Load the raw routing dataset from the working directory.
df = pd.read_csv('NetRouteData.csv')

# Remove the route identifier and the 'Unnamed: N' columns so they are not
# treated as features.
# NOTE(review): the 'Unnamed: N' columns look like empty trailing CSV columns — confirm.
cols_to_drop = ['Route_ID', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
df_clean = df.drop(columns=cols_to_drop)

# One-hot encode the categorical 'Algorithm_Used' column; drop_first=True
# omits the indicator column of the first category.
df_encoded = pd.get_dummies(df_clean, columns=['Algorithm_Used'], drop_first=True)

# Separate the features (X) from the 'Optimal' target (y).
X = df_encoded.drop('Optimal', axis=1)
y = df_encoded['Optimal']

# .to_dict() converts the counts to plain Python ints, so the printed dict is
# not cluttered with NumPy scalar reprs such as np.int64(490).
print(f"Original class distribution: {y.value_counts().to_dict()}")

# Resample with the SMOTE-Tomek hybrid; the fixed random_state keeps the run
# repeatable.
# NOTE(review): the whole dataset is resampled here (train_test_split is
# imported but no split happens in this cell). If the saved file is split
# later, synthetic rows may land in the test set — confirm downstream usage.
smote_tomek = SMOTETomek(random_state=42)
X_res, y_res = smote_tomek.fit_resample(X, y)

print(f"Resampled class distribution (SMOTE-Tomek): {pd.Series(y_res).value_counts().to_dict()}")

# Put features and target back into a single frame and write it to disk.
df_smote_tomek = pd.concat(
    [pd.DataFrame(X_res, columns=X.columns), pd.Series(y_res, name='Optimal')],
    axis=1,
)
df_smote_tomek.to_csv('NetRouteData_SMOTETomek_Balanced.csv', index=False)

print("SMOTE-Tomek balanced dataset saved as 'NetRouteData_SMOTETomek_Balanced.csv'")

Original class distribution: {0: np.int64(490), 1: np.int64(10)}
Resampled class distribution (SMOTE-Tomek): {1: np.int64(490), 0: np.int64(490)}
SMOTE-Tomek balanced dataset saved as 'NetRouteData_SMOTETomek_Balanced.csv'
