# Project: Discount Classification
Objective: Build a model to classify whether a course will be discounted <br> 
Models: Decision Trees, Random Forest, Logistic Regression, Naive Bayes, Support Vector Machines <br>
Evaluation Metrics: Classification Accuracy, Receiver Operating Characteristic (ROC) Curve 

## Packages

In [41]:
# Data Handling
import pandas as pd
import numpy as np

# Preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import LocalOutlierFactor

# Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC
from optbinning import OptimalBinning

# Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

## Import data

In [2]:
# Load the cleaned Udemy course data; the first CSV column becomes the row index
df_raw = pd.read_csv(filepath_or_buffer="./Data/Udemy_Clean.csv", index_col=0)
# Preview the first five rows
df_raw.head(n=5)

Unnamed: 0,Title,Overall_Rating,Best_Rating,Worst_Rating,No_of_Ratings,Category,Subcategory,Topic,Instructor,Language,SkillsFuture,No_of_Practice_Test,No_of_Articles,No_of_Coding_Exercises,Video_Duration_Hr,No_of_Additional_Resources,Bestseller,Price,Discounted_Price
0,Complete Hypnotherapy & Hypnosis Certification...,4.7,5,0.5,3524,Lifestyle,Esoteric Practices,Hypnotherapy,Dr Karen E Wells,English,False,0,4,0,3.0,0,Yes,104.98,26.0
1,Pinterest Marketing for Wedding Professionals ...,5.0,5,0.5,1,Marketing,Social Media Marketing,Pinterest Marketing,Staci Nichols,English,False,0,0,0,0.6,2,No,29.98,22.0
2,Master the Telephone Sales- Cold calling Secrets,4.5,5,0.5,3,Marketing,Product Marketing,Marketing Strategy,Sanjay Bhasin,English,False,0,0,0,0.733333,0,No,29.98,22.0
3,5 Practical Management concepts you MUST know,5.0,5,0.5,2,Personal Development,Leadership,Management Skills,Vasudev Murthy,English,False,0,0,0,2.0,0,No,49.98,22.0
4,Fermented Foods Mastery,4.5,5,0.5,187,Health & Fitness,Nutrition,Fermented Foods,Kale Brock,English,False,0,3,0,1.5,12,No,68.98,24.0


In [3]:
# Work on a copy so df_raw keeps the data exactly as it was loaded
df_preprocessing = df_raw.copy()

# Binary target: 1 when the discounted price differs from the list price, 0 otherwise
mapper = {True: 1, False: 0}
df_preprocessing["Discounted"] = (
    df_preprocessing["Discounted_Price"].ne(df_preprocessing["Price"]).map(mapper)
)

# Remove the columns that are not used as features
df_preprocessing = df_preprocessing.drop(columns=["Title", "Discounted_Price"])

# Remove every row whose Category is "Leadership & Management"
df_preprocessing = df_preprocessing.drop(
    index=df_preprocessing.loc[
        df_preprocessing["Category"].eq("Leadership & Management")
    ].index
)

# Preview the first five rows of the prepared frame
df_preprocessing.head(n=5)

Unnamed: 0,Overall_Rating,Best_Rating,Worst_Rating,No_of_Ratings,Category,Subcategory,Topic,Instructor,Language,SkillsFuture,No_of_Practice_Test,No_of_Articles,No_of_Coding_Exercises,Video_Duration_Hr,No_of_Additional_Resources,Bestseller,Price,Discounted
0,4.7,5,0.5,3524,Lifestyle,Esoteric Practices,Hypnotherapy,Dr Karen E Wells,English,False,0,4,0,3.0,0,Yes,104.98,1
1,5.0,5,0.5,1,Marketing,Social Media Marketing,Pinterest Marketing,Staci Nichols,English,False,0,0,0,0.6,2,No,29.98,1
2,4.5,5,0.5,3,Marketing,Product Marketing,Marketing Strategy,Sanjay Bhasin,English,False,0,0,0,0.733333,0,No,29.98,1
3,5.0,5,0.5,2,Personal Development,Leadership,Management Skills,Vasudev Murthy,English,False,0,0,0,2.0,0,No,49.98,1
4,4.5,5,0.5,187,Health & Fitness,Nutrition,Fermented Foods,Kale Brock,English,False,0,3,0,1.5,12,No,68.98,1


## Preprocessing

In [4]:
# Category reduction
# Keep only the 20 most frequent levels of Subcategory, Topic and Instructor;
# every other level is collapsed into the single label "Others".
# NOTE(review): the top-20 lists are computed on the full dataset, i.e. before
# the train/test split performed further down, so held-out rows influence
# which levels are kept -- confirm this is acceptable.
subcategory_reduced = df_preprocessing["Subcategory"].value_counts().head(20).index.to_list()
topic_reduced = df_preprocessing["Topic"].value_counts().head(20).index.to_list()
instructor_reduced = df_preprocessing["Instructor"].value_counts().head(20).index.to_list()


# Defining functions
def _keep_or_others(value, kept_levels):
    """Return ``value`` unchanged if it is in ``kept_levels``, else ``"Others"``."""
    return value if value in kept_levels else "Others"


def subcategory_reduction(subcategory):
    """Return ``subcategory`` if it is in ``subcategory_reduced``, else ``"Others"``."""
    return _keep_or_others(subcategory, subcategory_reduced)


def topic_reduction(topic):
    """Return ``topic`` if it is in ``topic_reduced``, else ``"Others"``."""
    return _keep_or_others(topic, topic_reduced)


def instructor_reduction(instructor):
    """Return ``instructor`` if it is in ``instructor_reduced``, else ``"Others"``."""
    return _keep_or_others(instructor, instructor_reduced)


def _collapse_rare_levels(column, kept_levels):
    """Return a copy of ``column`` with values outside ``kept_levels`` set to ``"Others"``.

    Column-wise equivalent of the per-value ``*_reduction`` functions: one
    ``isin`` membership test for the whole Series instead of a Python call
    per row. Missing values are not in ``kept_levels`` and so also become
    ``"Others"``, exactly as with the per-value functions.
    """
    return column.where(column.isin(kept_levels), "Others")


df_preprocessing["Subcategory"] = _collapse_rare_levels(df_preprocessing["Subcategory"], subcategory_reduced)
df_preprocessing["Topic"] = _collapse_rare_levels(df_preprocessing["Topic"], topic_reduced)
df_preprocessing["Instructor"] = _collapse_rare_levels(df_preprocessing["Instructor"], instructor_reduced)

In [5]:
# Separate the feature columns (X) from the target column (y)
X = df_preprocessing.drop(columns="Discounted")
y = df_preprocessing["Discounted"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Object and bool columns are treated as categorical, everything else as numeric
categorical_dtypes = ["object", "bool"]
X_train_categorical = X_train.select_dtypes(include=categorical_dtypes)
X_train_numeric = X_train.select_dtypes(exclude=categorical_dtypes)
X_test_categorical = X_test.select_dtypes(include=categorical_dtypes)
X_test_numeric = X_test.select_dtypes(exclude=categorical_dtypes)

### Categorical Variables

In [12]:
# Encoding
# Map every categorical level to a numeric code. The codes are learned from
# the training split only and then re-used for the test split.
# With the default handle_unknown="error", transform() raises as soon as the
# test split contains a level that never occurred in the training split;
# such unseen levels are encoded as the sentinel -1 instead.
# NOTE: estimators that require non-negative features need the -1 sentinel
# handled before fitting.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_train_categorical_encoded = pd.DataFrame(ordinal_encoder.fit_transform(X_train_categorical),
                                           columns=X_train_categorical.columns, index=X_train_categorical.index)
X_test_categorical_encoded = pd.DataFrame(ordinal_encoder.transform(X_test_categorical),
                                          columns=X_test_categorical.columns, index=X_test_categorical.index)

In [13]:
# Preview the first five rows of the encoded training categoricals
X_train_categorical_encoded.head(n=5)

Unnamed: 0,Category,Subcategory,Topic,Instructor,Language,SkillsFuture,Bestseller
10047,1.0,15.0,5.0,13.0,0.0,0.0,0.0
234,5.0,15.0,5.0,13.0,0.0,1.0,0.0
11431,1.0,15.0,5.0,13.0,0.0,0.0,1.0
987,12.0,10.0,5.0,13.0,0.0,0.0,0.0
14492,5.0,14.0,5.0,13.0,0.0,0.0,0.0


In [15]:
# Re-assemble the feature matrices side by side: numeric columns first,
# followed by the encoded categorical columns
X_train_processed = pd.concat(
    objs=[X_train_numeric, X_train_categorical_encoded],
    axis="columns",
)
X_test_processed = pd.concat(
    objs=[X_test_numeric, X_test_categorical_encoded],
    axis="columns",
)

### Outliers

In [51]:
# Outlier Detection
# Fit LocalOutlierFactor on the training features and store its per-row label
# in a frame that shares the training index; rows labelled -1 are the ones
# removed below.
lof = LocalOutlierFactor()
yhat = pd.DataFrame(
    data={"Outlier_d": lof.fit_predict(X_train_processed)},
    index=X_train_processed.index,
)
outlier_index = yhat[yhat["Outlier_d"].eq(-1)].index

# Outlier Removal
# Both drops mutate the objects in place, so re-running this cell without
# rebuilding X_train_processed / y_train first fits LOF on the already-pruned
# data and removes further rows.
X_train_processed.drop(index=outlier_index, inplace=True)
y_train.drop(index=outlier_index, inplace=True)