In [None]:
# Costco Location Ensemble Techniques

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Template placeholder: names of the CSV columns to keep when loading the data.
columns = ["", ""]

# Template placeholder: name of the target column.
target = [""]

In [None]:
# Load the data
# NOTE(review): the path is a template placeholder and must be replaced with the real CSV path.
file_path = Path('../Resources/.CSV OR DATABASE CONNECTION/PATH')
# skiprows=1 skips the first line of the file; [:-2] discards the last two rows.
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop every row that still contains a null
df = df.dropna()

# Keep only the rows whose target value differs from the value on the right-hand side.
# NOTE(review): both strings are template placeholders; fill in the real column name and value.
issued_mask = df['? TARGET COLUMN = COSTCO HEARING AID LOCATION ?'] != '? TARGET COLUMN THAT HAS A COSTCO HEARING AID LOCATION'
df = df.loc[issued_mask]

# Convert a percentage-formatted text column to a numeric fraction: strip the
# '%' sign, cast to float, divide by 100. Read and write use the SAME column key.
# NOTE(review): '   ' is a template placeholder for the real column name.
df['   '] = df['   '].str.replace('%', '').astype('float') / 100

# Relabel 'YES' values as 'YES_LOCATION' (applied to every column of df).
# NOTE(review): only 'YES' is mapped here; add a mapping for the negative class
# if a 'NO_LOCATION' label is wanted as well.
x = {'YES': 'YES_LOCATION'}
df = df.replace(x)

# TODO: convert the age groups being evaluated to one value/column.
# Example of the pattern (left over from another dataset):
#x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')
#df = df.replace(x)

# Renumber the rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)

df.head()

# Split the Data into Training and Testing

In [None]:
# Features: every column except the target, with categorical columns one-hot encoded.
X = pd.get_dummies(df.drop(columns="COSTCO HEARING AID LOCATION"))

# Target: the location column on its own.
y = df["COSTCO HEARING AID LOCATION"]

In [None]:
# Summary statistics for the feature columns of X
X.describe()

In [None]:
# Check the balance of our target values: number of rows per distinct value of y
y.value_counts()

In [None]:
# Split features and target into train and test sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importances sorted in descending order (most important feature to least important) along with each feature's score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [None]:
# Build the Balanced Random Forest classifier from imbalanced-learn.
# A plain RandomForestClassifier does no class balancing, which contradicts this
# section's title and printed labels, so the imbalanced-learn estimator is used.
# random_state=1 keeps results consistent between runs.
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [None]:
# Train the forest on the training split (fit returns the fitted estimator)
rf_model = rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict a class for every row of the held-out test features, then show the result
predictions = rf_model.predict(X=X_test)
predictions

In [None]:
# Balanced accuracy of the forest's predictions against the true test labels
print("Balanced Random Forest Classifier - Accuracy Score:")
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

In [None]:
# Confusion matrix of the forest's test predictions, shown as a labelled table
print("Balanced Random Forest Classifier - Confusion Matrix")

cm = confusion_matrix(y_test, predictions)

# Rows are the actual classes, columns the predicted classes.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

In [None]:
# Print the imbalanced classification report.
# Uses imbalanced-learn's classification_report_imbalanced rather than sklearn's
# classification_report, as the notebook instructions ask for the imbalanced-learn report.
print("Balanced Random Forest Classifier - Classification Report")
print(classification_report_imbalanced(y_test, predictions))

In [None]:
# Read the per-feature importance scores from the fitted forest and display them.
# The array is in the same order as the feature columns the model was trained on.
importances = rf_model.feature_importances_
importances

In [None]:
# Pair each importance score with its feature name and list the pairs from
# highest score to lowest
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

Instead of having a single, complex tree like the ones created by decision trees, a random forest algorithm will sample the data and build several smaller, simpler decision trees. Each tree is simpler because it is built from a random subset of features.

Random forest algorithms are beneficial because they:

- Are robust against overfitting, because each of the weak learners is trained on a different piece of the data.
- Can be used to rank the importance of input variables in a natural way.
- Can handle thousands of input variables without variable deletion.
- Are robust to outliers and nonlinear data.
- Run efficiently on large datasets.

### Easy Ensemble AdaBoost Classifier

In [None]:
# Build the Easy Ensemble classifier, fit it on the training split,
# and predict classes for the test split
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

In [None]:
# Calculate the balanced accuracy score for the Easy Ensemble model.
# Score y_pred_eec (this model's predictions), not `predictions`, which holds
# the random-forest output from the previous section.
print("ADABoost EasyEnsembleClassifier - Accuracy Score:")
acc_score = balanced_accuracy_score(y_test, y_pred_eec)
acc_score

In [None]:
# Compute the confusion matrix for the Easy Ensemble model.
# Use y_pred_eec (this model's predictions), not the random-forest `predictions`.
cm = confusion_matrix(y_test, y_pred_eec)

In [None]:
# Wrap the confusion matrix `cm` in a labelled table:
# rows are the actual classes, columns the predicted classes.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

In [None]:
# Print the imbalanced classification report for the Easy Ensemble model.
# Report on y_pred_eec (this model's predictions), not the random-forest
# `predictions`, and use imbalanced-learn's classification_report_imbalanced
# as the notebook instructions ask.
print("ADABoost EasyEnsembleClassifier - Classification Report")
print(classification_report_imbalanced(y_test, y_pred_eec))

In AdaBoost, a model is trained then evaluated. After evaluating the errors of the first model, another model is trained. This time, however, the model gives extra weight to the errors from the previous model. The purpose of this weighting is to minimize similar errors in subsequent models. Then, the errors from the second model are given extra weight for the third model. This process is repeated until the error rate is minimized: