# Lecture 13 (b)
In this example, we will build decision tree classifiers (with and without **sampling methods**) to classify an **imbalanced dataset** of iris flowers into three species (setosa, versicolor, or virginica) based on the length and width of the petals and sepals.

In [None]:
# Install the imbalanced-learn package (it provides the `imblearn` module), e.g. from an Anaconda prompt:
# run 'pip install -U imbalanced-learn'

In [None]:
# Load libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from sklearn import tree
from sklearn import metrics
from imblearn.datasets import make_imbalance
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [None]:
# Load seaborn's example iris dataset and preview its first five rows
data = sns.load_dataset(name="iris")
data.head(n=5)

In [None]:
# Count the number of samples per species (the class label column)
data.loc[:, 'species'].value_counts()

In [None]:
# Create an imbalanced dataset by keeping only a fixed number of samples
# per species (50 setosa, 20 versicolor, 10 virginica)
x, y = make_imbalance(
    data.loc[:, ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']],
    data.loc[:, 'species'],
    sampling_strategy=dict(virginica=10, versicolor=20, setosa=50),
    random_state=9,
)
# Show the resulting class distribution
Counter(y)

In [None]:
# Partition dataset into training (75%) and test (25%) sets.
# stratify=y keeps the class proportions the same in both partitions; with only
# 10 virginica samples, a purely random split could leave the test set with
# too few (or even no) minority-class samples to evaluate on.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=0
)
# Show the class distribution of the training set
Counter(y_train)

In [None]:
# Standardize the features: learn the scaling parameters from the training
# set only, then apply the same transformation to the test set
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Train a decision tree (entropy split criterion) on the training set
model = tree.DecisionTreeClassifier(random_state=0, criterion="entropy")
model.fit(X=x_train, y=y_train)

In [None]:
# Use the trained decision tree to predict the species of each test sample
y_pred = model.predict(X=x_test)

In [None]:
# Compute the confusion matrix (rows: actual classes, columns: predicted
# classes) for the test set and print it
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

In [None]:
# Plot confusion matrix as an annotated heatmap.
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_test or y_pred, so the same ordering is used to label the axes.
class_labels = sorted(set(y_test) | set(y_pred))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",  # entries are integer sample counts, not fractions
    square=True,
    cmap=plt.cm.Blues,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.tight_layout()

In [None]:
# Balance the training set by random undersampling: every class except the
# minority class is randomly reduced in size
undersample = RandomUnderSampler(random_state=0, sampling_strategy="not minority")
x_balanced, y_balanced = undersample.fit_resample(X=x_train, y=y_train)
# Show the class distribution after resampling
Counter(y_balanced)

In [None]:
# Train a fresh decision tree (entropy split criterion) on the resampled,
# balanced training set
model = tree.DecisionTreeClassifier(random_state=0, criterion="entropy")
model.fit(X=x_balanced, y=y_balanced)

In [None]:
# Predict the species of each test sample with the tree trained on the balanced data
y_pred = model.predict(X=x_test)

In [None]:
# Compute the confusion matrix (rows: actual classes, columns: predicted
# classes) for the test set and print it
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

In [None]:
# Plot confusion matrix as an annotated heatmap.
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_test or y_pred, so the same ordering is used to label the axes.
class_labels = sorted(set(y_test) | set(y_pred))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",  # entries are integer sample counts, not fractions
    square=True,
    cmap=plt.cm.Blues,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.tight_layout()

In [None]:
# Balance the training set by random oversampling: every class except the
# majority class is randomly resampled to increase its size
oversample = RandomOverSampler(random_state=0, sampling_strategy="not majority")
x_balanced, y_balanced = oversample.fit_resample(X=x_train, y=y_train)
# Show the class distribution after resampling
Counter(y_balanced)

In [None]:
# Train a fresh decision tree (entropy split criterion) on the resampled,
# balanced training set
model = tree.DecisionTreeClassifier(random_state=0, criterion="entropy")
model.fit(X=x_balanced, y=y_balanced)

In [None]:
# Predict the species of each test sample with the tree trained on the balanced data
y_pred = model.predict(X=x_test)

In [None]:
# Compute the confusion matrix (rows: actual classes, columns: predicted
# classes) for the test set and print it
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

In [None]:
# Plot confusion matrix as an annotated heatmap.
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_test or y_pred, so the same ordering is used to label the axes.
class_labels = sorted(set(y_test) | set(y_pred))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",  # entries are integer sample counts, not fractions
    square=True,
    cmap=plt.cm.Blues,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.tight_layout()