In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from utils import *
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
from adacost import AdaCost


In [2]:
covertype = fetch_covtype() # https://archive.ics.uci.edu/dataset/31/covertype
X = covertype.data
y = covertype.target
print(X.shape,y.shape)

(581012, 54) (581012,)


In [3]:
X_train,X_test,y_train,y_test  = train_test_split(X,y,test_size=0.2,random_state=42)
print(f"{X_train.shape},{y_train.shape},{X_train.shape},{y_train.shape}")

(464809, 54),(464809,),(464809, 54),(464809,)


In [4]:
df = pd.DataFrame(X)
df['target'] = y

# Display the imbalance
print("Initial class distribution:")
print(df['target'].value_counts())

# Step 2: Apply Random Under-sampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# Convert the results back to a DataFrame
df_rus = pd.DataFrame(X_rus)
df_rus['target'] = y_rus

# Step 3: Apply Random Over-sampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
df_ros = pd.DataFrame(X_ros)
df_ros['target'] = y_ros

# Print the class distributions after resampling
print("\nClass distribution after under-sampling:")
print(df_rus['target'].value_counts())

print("\nClass distribution after over-sampling:")
print(df_ros['target'].value_counts())
print(f"Y_Ros: {y_ros.shape}, Y_Rus: {y_rus.shape}")
# Convert the results back to a DataFrame

Initial class distribution:
target
2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: count, dtype: int64

Class distribution after under-sampling:
target
1    2747
2    2747
3    2747
4    2747
5    2747
6    2747
7    2747
Name: count, dtype: int64

Class distribution after over-sampling:
target
5    283301
2    283301
1    283301
7    283301
3    283301
6    283301
4    283301
Name: count, dtype: int64
Y_Ros: (1983107,), Y_Rus: (19229,)


In [5]:
X_train_ros,X_test_ros,y_train_ros,y_test_ros  = train_test_split(X_ros,y_ros,test_size=0.2,random_state=42)
X_train_rus,X_test_rus,y_train_rus,y_test_rus  = train_test_split(X_rus,y_rus,test_size=0.2,random_state=42)
print(f"{X_train_ros.shape},{y_train_ros.shape},{X_train_rus.shape},{y_train_rus.shape}")

(1586485, 54),(1586485,),(15383, 54),(15383,)


In [6]:
from sklearn.model_selection import GridSearchCV
ada_ros = AdaCost(algorithm = "SAMME.R",random_state = 100)
cv = GridSearchCV(ada_ros,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  verbose = 1, 
                  n_jobs = 1)
cv.fit(X_train_ros,y_train_ros)
cv.best_params_,cv.best_score_
ada_cv_ros = cv.best_estimator_
print(cv.best_estimator_)
y_pred_ros = ada_cv_ros.predict(X_test_ros)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [None]:
accuracy_acost_ros = accuracy_score(y_test_ros, y_pred_ros)
precision_acost_ros = precision_score(y_test_ros, y_pred_ros, average='weighted')
recall_acost_ros = recall_score(y_test_ros, y_pred_ros, average='weighted')
f1_acost_ros = f1_score(y_test_ros, y_pred_ros, average='weighted')
print("AdaCost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_acost_ros)
print("Precision:", precision_acost_ros)
print("Recall:", recall_acost_ros)
print("Weighted F1 Score:", recall_acost_ros)
print(classification_report(y_true=y_test_ros,y_pred=y_pred_ros))

In [None]:
from sklearn.model_selection import GridSearchCV
ada = AdaCost(algorithm = "SAMME.R",random_state = 100)
cv = GridSearchCV(ada,
                  param_grid = {'learning_rate':[0.01,0.05,0.1,0.25,0.5,1],'n_estimators':[10,20,50,100,200]},
                  verbose = 1, 
                  n_jobs = 1)
cv.fit(X_train,y_train)
cv.best_params_,cv.best_score_
ada_cv = cv.best_estimator_
print(cv.best_estimator_)
y_pred = ada_cv.predict(X_test)

In [None]:
accuracy_acost = accuracy_score(y_test, y_pred)
precision_acost = precision_score(y_test, y_pred, average='weighted')
recall_acost = recall_score(y_test, y_pred, average='weighted')
f1_acost = f1_score(y_test, y_pred, average='weighted')
print("AdaCost Classifier Metrics with RUS:")
print("Accuracy:", accuracy_acost)
print("Precision:", precision_acost)
print("Recall:", recall_acost)
print("Weighted F1 Score:", recall_acost)
print(classification_report(y_true=y_test,y_pred=y_pred))