In [1]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from data_download import *
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from utils import *
import warnings
from adacost import AdaCost
from ucimlrepo import fetch_ucirepo

warnings. filterwarnings('ignore')

# Step 1: Generate a synthetic imbalanced dataset
data = fetch_ucirepo(id=30)
X,y = data.data.features, data.data.targets
# Convert to DataFrame for easier manipulation
df = pd.DataFrame(X)
df['target'] = y

# Display the imbalance
print("Initial class distribution:")
print(df['target'].value_counts())

# Step 2: Apply Random Under-sampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# Convert the results back to a DataFrame
df_rus = pd.DataFrame(X_rus)
df_rus['target'] = y_rus

# Step 3: Apply Random Over-sampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

# Convert the results back to a DataFrame
df_ros = pd.DataFrame(X_ros)
df_ros['target'] = y_ros

# Print the class distributions after resampling
print("\nClass distribution after under-sampling:")
print(df_rus['target'].value_counts())

print("\nClass distribution after over-sampling:")
print(df_ros['target'].value_counts())
print(f"Y_Ros: {y_ros.shape}, Y_Rus: {y_rus.shape}")

Initial class distribution:
target
1    629
3    511
2    333
Name: count, dtype: int64

Class distribution after under-sampling:
target
1    333
2    333
3    333
Name: count, dtype: int64

Class distribution after over-sampling:
target
1    629
2    629
3    629
Name: count, dtype: int64
Y_Ros: (1887, 1), Y_Rus: (999, 1)


In [2]:
X_train_ros,X_test_ros,y_train_ros,y_test_ros  = train_test_split(X_ros,y_ros,test_size=0.2,random_state=42)
X_train_rus,X_test_rus,y_train_rus,y_test_rus  = train_test_split(X_rus,y_rus,test_size=0.2,random_state=42)
print(f"{X_train_ros.shape},{y_train_ros.shape},{X_train_rus.shape},{y_train_rus.shape}")

(1509, 9),(1509, 1),(799, 9),(799, 1)


In [3]:
adaclassifier_ros = AdaBoostClassifier(random_state=42)
grboostclassifier_ros = GradientBoostingClassifier(random_state=42)
adaclassifier_ros.fit(X_train_ros,y_train_ros)
grboostclassifier_ros.fit(X_train_ros,y_train_ros)
y_pred_adaboost_ros = adaclassifier_ros.predict(X_test_ros)
y_pred_grboost_ros = grboostclassifier_ros.predict(X_test_ros)



In [4]:
accuracy_adaboost_ros = accuracy_score(y_test_ros, y_pred_adaboost_ros)
precision_adaboost_ros = precision_score(y_test_ros, y_pred_adaboost_ros, average='weighted')
recall_adaboost_ros = recall_score(y_test_ros, y_pred_adaboost_ros, average='weighted')
f1_adaboost_ros = f1_score(y_test_ros, y_pred_adaboost_ros, average='weighted')

# Print the metrics for AdaBoost with oversampling dataset
print("AdaBoost Classifier Metrics:")
print("Accuracy:", accuracy_adaboost_ros)
print("Precision:", precision_adaboost_ros)
print("Recall:", recall_adaboost_ros)
print("Weighted F1 Score:", f1_adaboost_ros)
print(classification_report(y_true=y_test_ros,y_pred=y_pred_adaboost_ros))

AdaBoost Classifier Metrics:
Accuracy: 0.5502645502645502
Precision: 0.5558788795759174
Recall: 0.5502645502645502
Weighted F1 Score: 0.5516795246554543
              precision    recall  f1-score   support

           1       0.61      0.59      0.60       115
           2       0.58      0.52      0.55       141
           3       0.47      0.54      0.50       122

    accuracy                           0.55       378
   macro avg       0.56      0.55      0.55       378
weighted avg       0.56      0.55      0.55       378



In [5]:
accuracy_grboost_ros = accuracy_score(y_test_ros, y_pred_grboost_ros)
precision_grboost_ros = precision_score(y_test_ros, y_pred_grboost_ros, average='weighted')
recall_grboost_ros = recall_score(y_test_ros, y_pred_grboost_ros, average='weighted')
f1_grboost_ros = f1_score(y_test_ros, y_pred_grboost_ros, average='weighted')

# Print the metrics for AdaBoost
print("AdaBoost Classifier Metrics:")
print("Accuracy:", accuracy_grboost_ros)
print("Precision:", precision_grboost_ros)
print("Recall:", recall_grboost_ros)
print("Weighted F1 Score:", f1_grboost_ros)
print(classification_report(y_true=y_test_ros,y_pred=y_pred_grboost_ros))

AdaBoost Classifier Metrics:
Accuracy: 0.6190476190476191
Precision: 0.6300747418944698
Recall: 0.6190476190476191
Weighted F1 Score: 0.621287425915189
              precision    recall  f1-score   support

           1       0.64      0.62      0.63       115
           2       0.71      0.60      0.65       141
           3       0.53      0.64      0.58       122

    accuracy                           0.62       378
   macro avg       0.63      0.62      0.62       378
weighted avg       0.63      0.62      0.62       378



In [6]:
adaclassifier_rus = AdaBoostClassifier(random_state=42)
grboostclassifier_rus = GradientBoostingClassifier(random_state=42)
adaclassifier_rus.fit(X_train_rus,y_train_rus)
grboostclassifier_rus.fit(X_train_rus,y_train_rus)
y_pred_adaboost_rus = adaclassifier_ros.predict(X_test_rus)
y_pred_grboost_rus = grboostclassifier_ros.predict(X_test_rus)

In [7]:
accuracy_adaboost_rus = accuracy_score(y_test_rus, y_pred_adaboost_rus)
precision_adaboost_rus = precision_score(y_test_rus, y_pred_adaboost_rus, average='weighted')
recall_adaboost_rus = recall_score(y_test_rus, y_pred_adaboost_rus, average='weighted')
f1_adaboost_rus = f1_score(y_test_rus, y_pred_adaboost_rus, average='weighted')

# Print the metrics for AdaBoost
print("AdaBoost Classifier Metrics wit RUS:")
print("Accuracy:", accuracy_adaboost_rus)
print("Precision:", precision_adaboost_rus)
print("Recall:", recall_adaboost_rus)
print("Weighted F1 Score:", f1_adaboost_rus)
print(classification_report(y_true=y_test_rus,y_pred=y_pred_adaboost_rus))

AdaBoost Classifier Metrics wit RUS:
Accuracy: 0.55
Precision: 0.5577659266409265
Recall: 0.55
Weighted F1 Score: 0.5514550480506369
              precision    recall  f1-score   support

           1       0.66      0.55      0.60        67
           2       0.53      0.60      0.56        65
           3       0.49      0.50      0.49        68

    accuracy                           0.55       200
   macro avg       0.56      0.55      0.55       200
weighted avg       0.56      0.55      0.55       200



In [8]:
accuracy_grboost_rus = accuracy_score(y_test_rus, y_pred_grboost_rus)
precision_grboost_rus = precision_score(y_test_rus, y_pred_grboost_rus, average='weighted')
recall_grboost_rus = recall_score(y_test_rus, y_pred_grboost_rus, average='weighted')
f1_grboost_rus = f1_score(y_test_rus, y_pred_grboost_rus, average='weighted')

# Print the metrics for AdaBoost
print("GradientBoost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_grboost_rus)
print("Precision:", precision_grboost_rus)
print("Recall:", recall_grboost_rus)
print("Weighted F1 Score:", f1_grboost_rus)
print(classification_report(y_true=y_test_rus,y_pred=y_pred_grboost_rus))

GradientBoost Classifier Metrics with RUS:
Accuracy: 0.59
Precision: 0.6002123689727463
Recall: 0.59
Weighted F1 Score: 0.5912865133649805
              precision    recall  f1-score   support

           1       0.70      0.55      0.62        67
           2       0.61      0.68      0.64        65
           3       0.49      0.54      0.52        68

    accuracy                           0.59       200
   macro avg       0.60      0.59      0.59       200
weighted avg       0.60      0.59      0.59       200



In [9]:
mean_average_precision(y_test_ros,adaclassifier_ros.predict_proba(X_test_ros),num_classes=3)

0.17562720252593492

In [10]:
from sklearn.model_selection import GridSearchCV
ada = AdaCost(algorithm = "SAMME.R",random_state = 100)
cv = GridSearchCV(ada,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  verbose = 1, 
                  n_jobs = 1)
cv.fit(X_train_ros,y_train_ros)
cv.best_params_,cv.best_score_
ada_cv = cv.best_estimator_
print(cv.best_estimator_)
y_pred = ada_cv.predict(X_test_ros)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


AdaCost(learning_rate=0.01, random_state=100)


In [11]:
accuracy_acost_ros = accuracy_score(y_test_ros, y_pred)
precision_acost_ros = precision_score(y_test_ros, y_pred, average='weighted')
recall_acost_ros = recall_score(y_test_ros, y_pred, average='weighted')
f1_acost_ros = f1_score(y_test_ros, y_pred, average='weighted')
print("AdaCost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_acost_ros)
print("Precision:", precision_acost_ros)
print("Recall:", recall_acost_ros)
print("Weighted F1 Score:", f1_acost_ros)
print(classification_report(y_true=y_test_ros,y_pred=y_pred))

AdaCost Classifier Metrics with RUS:
Accuracy: 0.46825396825396826
Precision: 0.33721538235427123
Recall: 0.46825396825396826
Weighted F1 Score: 0.3811501054450836
              precision    recall  f1-score   support

           1       0.40      0.82      0.54       115
           2       0.58      0.59      0.58       141
           3       0.00      0.00      0.00       122

    accuracy                           0.47       378
   macro avg       0.33      0.47      0.37       378
weighted avg       0.34      0.47      0.38       378



In [12]:
from sklearn.model_selection import GridSearchCV
ada_rus = AdaCost(algorithm = "SAMME.R",random_state = 100)
cv = GridSearchCV(ada_rus,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  verbose = 1, 
                  n_jobs = 1)
cv.fit(X_train_rus,y_train_rus)
cv.best_params_,cv.best_score_
ada_cv = cv.best_estimator_
print(cv.best_estimator_)
y_pred_acost_rus = ada_cv.predict(X_test_rus)
accuracy_acost_rus = accuracy_score(y_test_rus, y_pred_acost_rus)
precision_acost_rus = precision_score(y_test_rus, y_pred_acost_rus, average='weighted')
recall_acost_rus = recall_score(y_test_rus, y_pred_acost_rus, average='weighted')
f1_acost_ros = f1_score(y_test_rus, y_pred_acost_rus, average='weighted')
print("AdaCost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_acost_rus)
print("Precision:", precision_acost_rus)
print("Recall:", recall_grboost_rus)
print("Weighted F1 Score:", f1_grboost_rus)
print(classification_report(y_true=y_test_rus,y_pred=y_pred_acost_rus))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
AdaCost(learning_rate=0.01, random_state=100)
AdaCost Classifier Metrics with RUS:
Accuracy: 0.475
Precision: 0.3134958495849585
Recall: 0.59
Weighted F1 Score: 0.5912865133649805
              precision    recall  f1-score   support

           1       0.47      0.70      0.57        67
           2       0.48      0.74      0.58        65
           3       0.00      0.00      0.00        68

    accuracy                           0.47       200
   macro avg       0.32      0.48      0.38       200
weighted avg       0.31      0.47      0.38       200



In [13]:
# Example usage

k = 50  # Consider the top-2 predictions

# Calculate MAP@K
mapk_score = mapk(y_test_rus.to_numpy().T[0,:], ada_cv.predict_proba(X_test_rus), 50)
print(f"MAP@K: {mapk_score:.4f}")


MAP@K: 0.2958
