# Costco Location Resampling Techniques

In [None]:
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings from the libraries used below will also be hidden.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
#from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

In [None]:
# import os
# # Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# # For example:
# # spark_version = 'spark-3.0.3'
# spark_version = 'spark-3.2.2'
# os.environ['SPARK_VERSION']=spark_version

# # Install Spark and Java
# !apt-get update
# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
# !tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
# !pip install -q findspark

# # Set Environment Variables
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# # Start a SparkSession
# import findspark
# findspark.init()

In [None]:
# Not using postgres - using S3 bucket and csv file.  DO WE NEED THIS?
#!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

In [None]:
# Not using postgres - DO WE NEED THIS?
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# url="https://<bucket name>.s3.amazonaws.com/____.csv"  # Enter your S3 bucket URL (can be found in the summary of your bucket item in AWS)
# spark.sparkContext.addFile(url)
# _df = spark.read.csv(SparkFiles.get("____.csv"), sep=",", header=True, inferSchema=True)

# # Show DataFrame
# _df.show()

In [None]:
# Names of the columns to keep from the loaded file; the load cell selects
# them with `df.loc[:, columns]`.
# TODO: these are empty placeholders and must be replaced with real column names.
columns = [
    "", "", 
]

# Placeholder for the target column name(s).
# NOTE(review): `target` is not referenced by any cell visible in this notebook.
target = [""]

In [None]:
# Load the data
# NOTE: the file path, the column names and the target labels used in this
# cell are still template placeholders; fill them in before running.
file_path = Path('../Resources/.CSV OR DATABASE CONNECTION/PATH')
# skiprows=1 skips the first line of the file; [:-2] drops the last two rows.
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `TARGET COLUMN - COSTCO LOCATION` WITH "YES" HAS A LOCATION status
# (the mask keeps only the rows whose value differs from the placeholder label)
issued_mask = df['? TARGET COLUMN = COSTCO HEARING AID LOCATION ?'] != '? TARGET COLUMN THAT HAS A COSTCO HEARING AID LOCATION'
df = df.loc[issued_mask]

# convert ANY DECIMAL COLUMNS to numerical
# Strip the percent sign, then turn the SAME column into a fraction.
# Both statements must use the identical column name.
df['   '] = df['   '].str.replace('%', '')
df['   '] = df['   '].astype('float') / 100


# ?? Convert the target column values to YES_LOCATION and NO_LOCATION based on their values
# NOTE(review): `replace` is applied to the whole frame, so a 'YES' value in
# any column is rewritten, not only in the target column.
x = {'YES': 'YES_LOCATION'}
df = df.replace(x)

# ?? CONVERT AGE GROUPS EVALUATING TO ONE VALUE/COLUMN?
#x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
#df = df.replace(x)

# Renumber the rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)

df.head()

In [None]:
# List the columns whose dtype is neither int64 nor float64
column_dtypes = df.dtypes
column_dtypes[(column_dtypes != 'int64') & (column_dtypes != 'float64')]

In [None]:
# Split the frame into the target series and the feature matrix.
# String-valued feature columns are converted to numeric indicator columns
# with get_dummies.
y = df["?? COSTCO HEARING AID LOCATION"]
X = pd.get_dummies(df.drop(columns="?? COSTCO HEARING AID LOCATION"))


In [None]:
# Summary statistics of the encoded feature matrix
X.describe()

# Scale the Data

In [None]:
# Create a Standard Scaler instance (only constructed here, not fitted yet)
data_scaler = StandardScaler()

In [None]:
# Fit the scaler on the encoded features and scale them, so that
# `df_encoded_scaled` exists on a fresh Restart & Run All.
# NOTE(review): the scaler is fitted on the full feature matrix and the result
# is only previewed here; the train/test split below still uses the unscaled `X`.
df_encoded_scaled = data_scaler.fit_transform(X)

# Preview the scaled data
df_encoded_scaled[:5]


# Split the Data into Training and Testing

In [None]:
# Check the balance of our target values.
# `y` is a single column selected from `df`, so value_counts() is called on it
# directly rather than through a column lookup such as y['TARGET'].

y.value_counts()

In [None]:
# Split features and target into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Show the shape of each split, one per line: X_train, X_test, y_train, y_test
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the following steps:

1. View the count of the target classes using `Counter` from the collections library.
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [None]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
# Only the training split is resampled: fitting the sampler on the full X, y
# would let test rows (and synthetic points derived from them) leak into the
# data the model is trained on, inflating the scores computed on X_test.
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# View the count of the target classes after resampling
Counter(y_resampled)

In [None]:
# Fit a logistic regression classifier on the resampled data.
# fit() returns the estimator itself, so the fitted model is bound in one step
# and then echoed as the cell output.
model = LogisticRegression(random_state=1, solver='lbfgs').fit(X_resampled, y_resampled)
model

In [None]:
# Predict the labels for X_test, then report the balanced accuracy of those predictions
y_pred = model.predict(X_test)
print("SMOTEENN Over and Under Sample Model - Balanced Accuracy Score")
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the confusion matrix of the test labels against the model's predictions
confusion_matrix(y_test, y_pred)

In [None]:
# Wrap the confusion matrix in a DataFrame so rows and columns carry readable labels
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]

smoteenn_undersample = confusion_matrix(y_test, y_pred)
smoteenn_undersample_df = pd.DataFrame(
    smoteenn_undersample,
    index=row_labels,
    columns=column_labels,
)

print("SMOTEENN Over and Under Sample Model - Confusion Matrix")
smoteenn_undersample_df

In [None]:
# Build the imbalanced classification report for the test predictions, then print it under a heading
smoteenn_report = classification_report_imbalanced(y_test, y_pred)
print("SMOTEENN Over and Under Sample Model - Classification Report")
print(smoteenn_report)

SMOTEENN combines oversampling and undersampling. It balances SMOTE's reliance on the immediate neighbors of a data point against the loss of data that comes with downsampling. It works in two steps:
1. Oversample the minority class with SMOTE.
2. Clean the resulting data with an undersampling strategy. If the two nearest neighbors of a data point belong to two different classes, that data point is dropped.