In [1]:
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
# Generate the dataset with 2 features to keep it simple
X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
                          n_redundant=0, weights=[0.9, 0.1], random_state=41)

In [3]:
print("Positive class: ", y.tolist().count(1))
print("Negative class: ", y.tolist().count(0))

Positive class:  517
Negative class:  4483


In [4]:
#Apply random under-sampling
rus = RandomUnderSampler()
X_RUS, y_RUS = rus.fit_resample(X, y)

In [5]:
#Apply random over-sampling
ros = RandomOverSampler()
X_ROS, y_ROS = ros.fit_resample(X, y)

In [6]:
#Apply Synthetic Minority Oversampling Technique
sm = SMOTE()
X_SMOTE, y_SMOTE = sm.fit_resample(X, y)

In [7]:
# Split into training and test datasets
X_RUS_train, X_RUS_test, y_RUS_train, y_RUS_test = train_test_split(X_RUS, y_RUS, test_size=0.3, random_state=42)

X_ROS_train, X_ROS_test, y_ROS_train, y_ROS_test = train_test_split(X_ROS, y_ROS, test_size=0.3, random_state=42)

X_SMOTE_train, X_SMOTE_test, y_SMOTE_train, y_SMOTE_test = train_test_split(X_SMOTE, y_SMOTE, test_size=0.3, random_state=42)


In [8]:
# Build a decision tree classifier
clf = tree.DecisionTreeClassifier(random_state=2017)
clf_rus = clf.fit(X_RUS_train, y_RUS_train)
clf_ros = clf.fit(X_ROS_train, y_ROS_train)
clf_smote = clf.fit(X_SMOTE_train, y_SMOTE_train)

In [9]:
# Evaluate model performance
print ("\nRUS - Train AUC : ",metrics.roc_auc_score(y_RUS_train, clf.predict(X_RUS_train)))
print ("RUS - Test AUC : ",metrics.roc_auc_score(y_RUS_test, clf.predict(X_RUS_test)))
print ("ROS - Train AUC : ",metrics.roc_auc_score(y_ROS_train, clf.predict(X_ROS_train)))
print ("ROS - Test AUC : ",metrics.roc_auc_score(y_ROS_test, clf.predict(X_ROS_test)))
print ("\nSMOTE - Train AUC : ",metrics.roc_auc_score(y_SMOTE_train, clf.predict(X_SMOTE_train)))
print ("SMOTE - Test AUC : ",metrics.roc_auc_score(y_SMOTE_test, clf.predict(X_SMOTE_test)))


RUS - Train AUC :  0.9819117984840364
RUS - Test AUC :  0.9616517338642774
ROS - Train AUC :  0.9912000000000001
ROS - Test AUC :  0.9308070957555494

SMOTE - Train AUC :  1.0
SMOTE - Test AUC :  0.9042975228542238
