In [1]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from data_download import *
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from utils import *
import warnings
from adacost import AdaCost
warnings. filterwarnings('ignore')

# Step 1: Generate a synthetic imbalanced dataset
X, y = download_data(filename='wine')
# Convert to DataFrame for easier manipulation
df = pd.DataFrame(X)
df['target'] = y

# Display the imbalance
print("Initial class distribution:")
print(df['target'].value_counts())

# Step 2: Apply Random Under-sampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# Convert the results back to a DataFrame
df_rus = pd.DataFrame(X_rus)
df_rus['target'] = y_rus

# Step 3: Apply Random Over-sampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

# Convert the results back to a DataFrame
df_ros = pd.DataFrame(X_ros)
df_ros['target'] = y_ros

# Print the class distributions after resampling
print("\nClass distribution after under-sampling:")
print(df_rus['target'].value_counts())

print("\nClass distribution after over-sampling:")
print(df_ros['target'].value_counts())
print(f"Y_Ros: {y_ros.shape}, Y_Rus: {y_rus.shape}")

Initial class distribution:
target
2    71
1    59
3    48
Name: count, dtype: int64

Class distribution after under-sampling:
target
1    48
2    48
3    48
Name: count, dtype: int64

Class distribution after over-sampling:
target
1    71
2    71
3    71
Name: count, dtype: int64
Y_Ros: (213, 1), Y_Rus: (144, 1)


In [2]:
X_train_ros,X_test_ros,y_train_ros,y_test_ros  = train_test_split(X_ros,y_ros,test_size=0.2,random_state=42)
X_train_rus,X_test_rus,y_train_rus,y_test_rus  = train_test_split(X_rus,y_rus,test_size=0.2,random_state=42)
print(f"{X_train_ros.shape},{y_train_ros.shape},{X_train_rus.shape},{y_train_rus.shape}")

(170, 13),(170, 1),(115, 13),(115, 1)


In [3]:
adaclassifier_ros = AdaBoostClassifier(random_state=42)
grboostclassifier_ros = GradientBoostingClassifier(random_state=42)
adaclassifier_ros.fit(X_train_ros,y_train_ros)
grboostclassifier_ros.fit(X_train_ros,y_train_ros)
y_pred_adaboost_ros = adaclassifier_ros.predict(X_test_ros)
y_pred_grboost_ros = grboostclassifier_ros.predict(X_test_ros)



In [4]:
accuracy_adaboost_ros = accuracy_score(y_test_ros, y_pred_adaboost_ros)
precision_adaboost_ros = precision_score(y_test_ros, y_pred_adaboost_ros, average='weighted')
recall_adaboost_ros = recall_score(y_test_ros, y_pred_adaboost_ros, average='weighted')
f1_adaboost_ros = f1_score(y_test_ros, y_pred_adaboost_ros, average='weighted')

# Print the metrics for AdaBoost with oversampling dataset
print("AdaBoost Classifier Metrics:")
print("Accuracy:", accuracy_adaboost_ros)
print("Precision:", precision_adaboost_ros)
print("Recall:", recall_adaboost_ros)
print("Weighted F1 Score:", f1_adaboost_ros)
print(classification_report(y_true=y_test_ros,y_pred=y_pred_adaboost_ros))

AdaBoost Classifier Metrics:
Accuracy: 0.9302325581395349
Precision: 0.9425444596443229
Recall: 0.9302325581395349
Weighted F1 Score: 0.930950594791555
              precision    recall  f1-score   support

           1       1.00      0.82      0.90        11
           2       0.82      1.00      0.90        14
           3       1.00      0.94      0.97        18

    accuracy                           0.93        43
   macro avg       0.94      0.92      0.92        43
weighted avg       0.94      0.93      0.93        43



In [5]:
accuracy_grboost_ros = accuracy_score(y_test_ros, y_pred_grboost_ros)
precision_grboost_ros = precision_score(y_test_ros, y_pred_grboost_ros, average='weighted')
recall_grboost_ros = recall_score(y_test_ros, y_pred_grboost_ros, average='weighted')
f1_grboost_ros = f1_score(y_test_ros, y_pred_grboost_ros, average='weighted')

# Print the metrics for AdaBoost
print("AdaBoost Classifier Metrics:")
print("Accuracy:", accuracy_grboost_ros)
print("Precision:", precision_grboost_ros)
print("Recall:", recall_grboost_ros)
print("Weighted F1 Score:", f1_grboost_ros)
print(classification_report(y_true=y_test_ros,y_pred=y_pred_grboost_ros))

AdaBoost Classifier Metrics:
Accuracy: 0.9767441860465116
Precision: 0.9786821705426355
Recall: 0.9767441860465116
Weighted F1 Score: 0.9768190839980526
              precision    recall  f1-score   support

           1       0.92      1.00      0.96        11
           2       1.00      0.93      0.96        14
           3       1.00      1.00      1.00        18

    accuracy                           0.98        43
   macro avg       0.97      0.98      0.97        43
weighted avg       0.98      0.98      0.98        43



In [6]:
adaclassifier_rus = AdaBoostClassifier(random_state=42)
grboostclassifier_rus = GradientBoostingClassifier(random_state=42)
adaclassifier_rus.fit(X_train_rus,y_train_rus)
grboostclassifier_rus.fit(X_train_rus,y_train_rus)
y_pred_adaboost_rus = adaclassifier_ros.predict(X_test_rus)
y_pred_grboost_rus = grboostclassifier_ros.predict(X_test_rus)

In [7]:
accuracy_adaboost_rus = accuracy_score(y_test_rus, y_pred_adaboost_rus)
precision_adaboost_rus = precision_score(y_test_rus, y_pred_adaboost_rus, average='weighted')
recall_adaboost_rus = recall_score(y_test_rus, y_pred_adaboost_rus, average='weighted')
f1_adaboost_rus = f1_score(y_test_rus, y_pred_adaboost_rus, average='weighted')

# Print the metrics for AdaBoost
print("AdaBoost Classifier Metrics wit RUS:")
print("Accuracy:", accuracy_adaboost_rus)
print("Precision:", precision_adaboost_rus)
print("Recall:", recall_adaboost_rus)
print("Weighted F1 Score:", f1_adaboost_rus)
print(classification_report(y_true=y_test_rus,y_pred=y_pred_adaboost_rus))

AdaBoost Classifier Metrics wit RUS:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
Weighted F1 Score: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11
           3       1.00      1.00      1.00         9

    accuracy                           1.00        29
   macro avg       1.00      1.00      1.00        29
weighted avg       1.00      1.00      1.00        29



In [8]:
accuracy_grboost_rus = accuracy_score(y_test_rus, y_pred_grboost_rus)
precision_grboost_rus = precision_score(y_test_rus, y_pred_grboost_rus, average='weighted')
recall_grboost_rus = recall_score(y_test_rus, y_pred_grboost_rus, average='weighted')
f1_grboost_rus = f1_score(y_test_rus, y_pred_grboost_rus, average='weighted')

# Print the metrics for AdaBoost
print("GradientBoost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_grboost_rus)
print("Precision:", precision_grboost_rus)
print("Recall:", recall_grboost_rus)
print("Weighted F1 Score:", f1_grboost_rus)
print(classification_report(y_true=y_test_rus,y_pred=y_pred_grboost_rus))

GradientBoost Classifier Metrics with RUS:
Accuracy: 0.9655172413793104
Precision: 0.9689655172413794
Recall: 0.9655172413793104
Weighted F1 Score: 0.9656036643332468
              precision    recall  f1-score   support

           1       0.90      1.00      0.95         9
           2       1.00      0.91      0.95        11
           3       1.00      1.00      1.00         9

    accuracy                           0.97        29
   macro avg       0.97      0.97      0.97        29
weighted avg       0.97      0.97      0.97        29



In [9]:
mean_average_precision(y_test_ros,adaclassifier_ros.predict_proba(X_test_ros),num_classes=3)

0.20818377201733637

In [10]:
from sklearn.model_selection import GridSearchCV
ada = AdaCost(algorithm = "SAMME.R",random_state = 100)
cv = GridSearchCV(ada,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  verbose = 1, 
                  n_jobs = 1)
cv.fit(X_train_ros,y_train_ros)
cv.best_params_,cv.best_score_
ada_cv = cv.best_estimator_
print(cv.best_estimator_)
y_pred = ada_cv.predict(X_test_ros)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
AdaCost(learning_rate=0.01, n_estimators=100, random_state=100)


In [11]:
accuracy_acost_ros = accuracy_score(y_test_ros, y_pred)
precision_acost_ros = precision_score(y_test_ros, y_pred, average='weighted')
recall_acost_ros = recall_score(y_test_ros, y_pred, average='weighted')
f1_acost_ros = f1_score(y_test_ros, y_pred, average='weighted')
print("AdaCost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_acost_ros)
print("Precision:", precision_acost_ros)
print("Recall:", recall_grboost_ros)
print("Weighted F1 Score:", f1_grboost_ros)
print(classification_report(y_true=y_test_ros,y_pred=y_pred))

AdaCost Classifier Metrics with RUS:
Accuracy: 0.8372093023255814
Precision: 0.8758669930640555
Recall: 0.9767441860465116
Weighted F1 Score: 0.9768190839980526
              precision    recall  f1-score   support

           1       0.92      1.00      0.96        11
           2       0.68      0.93      0.79        14
           3       1.00      0.67      0.80        18

    accuracy                           0.84        43
   macro avg       0.87      0.87      0.85        43
weighted avg       0.88      0.84      0.84        43



In [12]:
from sklearn.model_selection import GridSearchCV
ada_rus = AdaCost(algorithm = "SAMME.R",random_state = 100)
cv = GridSearchCV(ada_rus,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  verbose = 1, 
                  n_jobs = 1)
cv.fit(X_train_rus,y_train_rus)
cv.best_params_,cv.best_score_
ada_cv = cv.best_estimator_
print(cv.best_estimator_)
y_pred_acost_rus = ada_cv.predict(X_test_rus)
accuracy_acost_rus = accuracy_score(y_test_rus, y_pred_acost_rus)
precision_acost_rus = precision_score(y_test_rus, y_pred_acost_rus, average='weighted')
recall_acost_rus = recall_score(y_test_rus, y_pred_acost_rus, average='weighted')
f1_acost_ros = f1_score(y_test_rus, y_pred_acost_rus, average='weighted')
print("AdaCost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_acost_rus)
print("Precision:", precision_acost_rus)
print("Recall:", recall_grboost_rus)
print("Weighted F1 Score:", f1_grboost_rus)
print(classification_report(y_true=y_test_rus,y_pred=y_pred_acost_rus))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
AdaCost(learning_rate=0.05, n_estimators=100, random_state=100)
AdaCost Classifier Metrics with RUS:
Accuracy: 0.896551724137931
Precision: 0.9057471264367816
Recall: 0.9655172413793104
Weighted F1 Score: 0.9656036643332468
              precision    recall  f1-score   support

           1       0.90      1.00      0.95         9
           2       0.83      0.91      0.87        11
           3       1.00      0.78      0.88         9

    accuracy                           0.90        29
   macro avg       0.91      0.90      0.90        29
weighted avg       0.91      0.90      0.90        29



In [13]:
# Example usage

k = 2  # Consider the top-2 predictions

# Calculate MAP@K
mapk_score = mapk(y_test_rus.to_numpy().T[0,:], ada_cv.predict_proba(X_test_rus), k)
print(f"MAP@K: {mapk_score:.4f}")

MAP@K: 0.2414
