In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
import imblearn as imb

## US CRIME DATASET

In [10]:
from imblearn.datasets import fetch_datasets

# Load only the 'us_crime' benchmark. Calling fetch_datasets() without
# `filter_data` loads every bundled dataset just to pick one entry out of the
# returned mapping.
# NOTE: despite its name, `ecoli` holds the 'us_crime' data; the name is kept
# because the following cell reads `ecoli.data` / `ecoli.target`.
ecoli = fetch_datasets(filter_data=('us_crime',))['us_crime']
ecoli.data.shape

(1994, 100)

In [11]:
# Feature matrix and label vector of the dataset loaded above.
X, y = ecoli.data, ecoli.target

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation. Stratifying on `y` and fixing
# the seed makes the split repeatable with the same labels passed to
# `stratify` in every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Reference model: a random forest fitted on the training split as-is,
# with no resampling step in front of it.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print("=== Normal RF ===", classification_report(y_test, y_pred_rf), sep="\n")


=== Normal RF ===
              precision    recall  f1-score   support

          -1       0.95      0.98      0.97       554
           1       0.62      0.40      0.49        45

    accuracy                           0.94       599
   macro avg       0.79      0.69      0.73       599
weighted avg       0.93      0.94      0.93       599


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline

# Borderline-SMOTE followed by a random forest. The pipeline is fitted on
# the training split only; the test split is only ever passed to predict().
pipeline = Pipeline(
    steps=[
        ('smote', BorderlineSMOTE(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
).fit(X_train, y_train)
y_pred_smote = pipeline.predict(X_test)

print(
    "=== RF with Borderline-SMOTE ===",
    classification_report(y_test, y_pred_smote),
    sep="\n",
)


=== RF with Borderline-SMOTE ===
              precision    recall  f1-score   support

          -1       0.96      0.96      0.96       554
           1       0.50      0.51      0.51        45

    accuracy                           0.92       599
   macro avg       0.73      0.73      0.73       599
weighted avg       0.93      0.92      0.93       599


In [16]:
from imblearn.over_sampling import SMOTE

# Same two-step setup as the previous cell, with SMOTE as the sampler.
# `pipeline` and `y_pred_smote` are overwritten by this cell.
pipeline = Pipeline(
    steps=[
        ('smote', SMOTE(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
).fit(X_train, y_train)
y_pred_smote = pipeline.predict(X_test)

print("=== RF with SMOTE ===", classification_report(y_test, y_pred_smote), sep="\n")

=== RF with SMOTE ===
              precision    recall  f1-score   support

          -1       0.96      0.95      0.95       554
           1       0.45      0.53      0.49        45

    accuracy                           0.92       599
   macro avg       0.71      0.74      0.72       599
weighted avg       0.92      0.92      0.92       599


In [18]:

from imblearn.over_sampling import ADASYN

# ADASYN as the oversampler in front of the random forest.
# NOTE: the step label 'smote' and the name `y_pred_smote` are carried over
# from the SMOTE cells even though the sampler here is ADASYN.
pipeline = Pipeline(
    steps=[
        ('smote', ADASYN(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
).fit(X_train, y_train)
y_pred_smote = pipeline.predict(X_test)

print("=== RF with ADASYN ===", classification_report(y_test, y_pred_smote), sep="\n")

=== RF with ADASYN ===
              precision    recall  f1-score   support

          -1       0.96      0.95      0.95       554
           1       0.45      0.56      0.50        45

    accuracy                           0.92       599
   macro avg       0.71      0.75      0.73       599
weighted avg       0.93      0.92      0.92       599


## LETTER_IMG DATASET

In [19]:
# Load only the 'letter_img' benchmark instead of loading every bundled
# dataset a second time and indexing into the result.
limg = fetch_datasets(filter_data=('letter_img',))['letter_img']
limg.data.shape

(20000, 16)

In [22]:
# Rebind X / y and the four split variables to the letter_img data; the
# values from the previous dataset are overwritten from here on.
X, y = limg.data, limg.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [23]:
# Reference model for this dataset: random forest on the training split
# with no resampling step.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("=== Normal RF ===", classification_report(y_test, y_pred_rf), sep="\n")

=== Normal RF ===
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      5780
           1       0.99      0.89      0.94       220

    accuracy                           1.00      6000
   macro avg       0.99      0.95      0.97      6000
weighted avg       1.00      1.00      1.00      6000


In [24]:
# Borderline-SMOTE + random forest, fitted on the current training split
# and evaluated on the current test split.
pipeline = Pipeline(
    steps=[
        ('smote', BorderlineSMOTE(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
).fit(X_train, y_train)
y_pred_smote = pipeline.predict(X_test)

print(
    "=== RF with Borderline-SMOTE ===",
    classification_report(y_test, y_pred_smote),
    sep="\n",
)

=== RF with Borderline-SMOTE ===
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      5780
           1       1.00      0.91      0.95       220

    accuracy                           1.00      6000
   macro avg       1.00      0.96      0.98      6000
weighted avg       1.00      1.00      1.00      6000
