# Project: Discount Classification
Objective: Build a model to classify whether a course will be discounted <br> 
Models: Decision Trees, Random Forest, Logistic Regression, Naive Bayes, Support Vector Machines <br>
Evaluation Metrics: Classification Accuracy, ROC Area Under Curve

## Packages

In [10]:
# Data Handing 
import pandas as pd
import numpy as np

# Preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import LocalOutlierFactor

# Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC
from optbinning import OptimalBinning

# Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

## Import data

In [11]:
# Import data
df_raw = pd.read_csv("./Data/Udemy_Clean.csv", index_col=0)
df_raw.head()

Unnamed: 0,Title,Overall_Rating,Best_Rating,Worst_Rating,No_of_Ratings,Category,Subcategory,Topic,Instructor,Language,SkillsFuture,No_of_Practice_Test,No_of_Articles,No_of_Coding_Exercises,Video_Duration_Hr,No_of_Additional_Resources,Bestseller,Price,Discounted_Price
0,Complete Hypnotherapy & Hypnosis Certification...,4.7,5,0.5,3524,Lifestyle,Esoteric Practices,Hypnotherapy,Dr Karen E Wells,English,False,0,4,0,3.0,0,Yes,104.98,26.0
1,Pinterest Marketing for Wedding Professionals ...,5.0,5,0.5,1,Marketing,Social Media Marketing,Pinterest Marketing,Staci Nichols,English,False,0,0,0,0.6,2,No,29.98,22.0
2,Master the Telephone Sales- Cold calling Secrets,4.5,5,0.5,3,Marketing,Product Marketing,Marketing Strategy,Sanjay Bhasin,English,False,0,0,0,0.733333,0,No,29.98,22.0
3,5 Practical Management concepts you MUST know,5.0,5,0.5,2,Personal Development,Leadership,Management Skills,Vasudev Murthy,English,False,0,0,0,2.0,0,No,49.98,22.0
4,Fermented Foods Mastery,4.5,5,0.5,187,Health & Fitness,Nutrition,Fermented Foods,Kale Brock,English,False,0,3,0,1.5,12,No,68.98,24.0


In [12]:
df_preprocessing = df_raw.copy()

# Transform discount 
df_preprocessing["Discount_d"] = (df_preprocessing["Discounted_Price"] != df_preprocessing["Price"])
mapper = {True: 1, False: 0}
df_preprocessing["Discounted"] = df_preprocessing["Discount_d"].map(mapper)

# Drop irrelevant columns and rows
df_preprocessing.drop(columns=["Title", "Discounted_Price", "Discount_d"], inplace=True)
df_preprocessing.drop(df_preprocessing[df_preprocessing["Category"] == "Leadership & Management"].index, inplace=True)


df_preprocessing.head()

Unnamed: 0,Overall_Rating,Best_Rating,Worst_Rating,No_of_Ratings,Category,Subcategory,Topic,Instructor,Language,SkillsFuture,No_of_Practice_Test,No_of_Articles,No_of_Coding_Exercises,Video_Duration_Hr,No_of_Additional_Resources,Bestseller,Price,Discounted
0,4.7,5,0.5,3524,Lifestyle,Esoteric Practices,Hypnotherapy,Dr Karen E Wells,English,False,0,4,0,3.0,0,Yes,104.98,1
1,5.0,5,0.5,1,Marketing,Social Media Marketing,Pinterest Marketing,Staci Nichols,English,False,0,0,0,0.6,2,No,29.98,1
2,4.5,5,0.5,3,Marketing,Product Marketing,Marketing Strategy,Sanjay Bhasin,English,False,0,0,0,0.733333,0,No,29.98,1
3,5.0,5,0.5,2,Personal Development,Leadership,Management Skills,Vasudev Murthy,English,False,0,0,0,2.0,0,No,49.98,1
4,4.5,5,0.5,187,Health & Fitness,Nutrition,Fermented Foods,Kale Brock,English,False,0,3,0,1.5,12,No,68.98,1


## Preprocessing

In [13]:
# Category reduction
subcategory_reduced = df_preprocessing["Subcategory"].value_counts()[:20].index.to_list()
topic_reduced = df_preprocessing["Topic"].value_counts()[:20].index.to_list()
instructor_reduced = df_preprocessing["Instructor"].value_counts()[:20].index.to_list()

# Defining functions
def subcategory_reduction(subcategory): 
    if subcategory not in subcategory_reduced: 
        return "Others"
    else: 
        return subcategory

def topic_reduction(topic): 
    if topic not in topic_reduced: 
        return "Others"
    else: 
        return topic

def instructor_reduction(instructor): 
    if instructor not in instructor_reduced: 
        return "Others"
    else: 
        return instructor

df_preprocessing["Subcategory"] = df_preprocessing["Subcategory"].apply(lambda x: subcategory_reduction(x))
df_preprocessing["Topic"] = df_preprocessing["Topic"].apply(lambda x: topic_reduction(x))
df_preprocessing["Instructor"] = df_preprocessing["Instructor"].apply(lambda x: instructor_reduction(x))

In [14]:
# Define response and explanatory variables
y = df_preprocessing["Discounted"]
X = df_preprocessing.drop(columns=["Discounted"])

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Split into categorical and numeric for preprocessing 
X_train_categorical = X_train.select_dtypes(include=["object","bool"])
X_test_categorical = X_test.select_dtypes(include=["object","bool"])
X_train_numeric = X_train.select_dtypes(exclude=["object","bool"])
X_test_numeric = X_test.select_dtypes(exclude=["object","bool"])

### Categorical Variables

In [15]:
# Encoding 
ordinal_encoder = OrdinalEncoder()
X_train_categorical_encoded = pd.DataFrame(ordinal_encoder.fit_transform(X_train_categorical), 
                                           columns=X_train_categorical.columns, index=X_train_categorical.index)
X_test_categorical_encoded = pd.DataFrame(ordinal_encoder.transform(X_test_categorical), 
                                          columns=X_test_categorical.columns, index=X_test_categorical.index)

In [16]:
X_train_categorical_encoded.head()

Unnamed: 0,Category,Subcategory,Topic,Instructor,Language,SkillsFuture,Bestseller
10047,1.0,15.0,5.0,13.0,0.0,0.0,0.0
234,5.0,15.0,5.0,13.0,0.0,1.0,0.0
11431,1.0,15.0,5.0,13.0,0.0,0.0,1.0
987,12.0,9.0,5.0,13.0,0.0,0.0,0.0
14492,5.0,14.0,5.0,13.0,0.0,0.0,0.0


In [17]:
# Combine back both dataframes 
X_train_processed = pd.concat([X_train_numeric, X_train_categorical_encoded], axis=1)
X_test_processed = pd.concat([X_test_numeric, X_test_categorical_encoded], axis=1)

### Outliers

In [18]:
# Outlier Detection
lof = LocalOutlierFactor()
yhat = pd.DataFrame({"Outlier_d": lof.fit_predict(X_train_processed)}, index=X_train_processed.index)
outlier_index = yhat.loc[yhat["Outlier_d"]==-1, :].index

# Outlier Removal 
X_train_processed.drop(outlier_index, inplace=True)
y_train = y_train.drop(outlier_index)

## Model Development 

### (1) Decision Tree Classifier

In [19]:
# Initial model
model_DTC = DecisionTreeClassifier(random_state=1)
model_DTC.fit(X_train_processed, y_train)
predictions_DTC = model_DTC.predict(X_test_processed)
auc_score_DTC = roc_auc_score(y_test, predictions_DTC)
print("Score: {0:0.5f}".format(auc_score_DTC))

Score: 0.63639


In [20]:
# 1st model improvement 
for alpha in range(0, 11, 1): 
    alpha = alpha / 10000
    model_DTC1 = DecisionTreeClassifier(ccp_alpha=alpha, random_state=1)
    model_DTC1.fit(X_train_processed, y_train)
    predictions_DTC1 = model_DTC1.predict(X_test_processed)
    auc_score_DTC1 = roc_auc_score(y_test, predictions_DTC1)
    print("alpha: {0}, Score: {1:0.5f}".format(alpha, auc_score_DTC1))

alpha: 0.0, Score: 0.63639
alpha: 0.0001, Score: 0.64104
alpha: 0.0002, Score: 0.62745
alpha: 0.0003, Score: 0.61418
alpha: 0.0004, Score: 0.60875
alpha: 0.0005, Score: 0.60875
alpha: 0.0006, Score: 0.60875
alpha: 0.0007, Score: 0.59814
alpha: 0.0008, Score: 0.59814
alpha: 0.0009, Score: 0.58886
alpha: 0.001, Score: 0.58886


Optimal alpha = 0.0001

In [21]:
# Final Model 
DTC_model = DecisionTreeClassifier(ccp_alpha=0.0001, random_state=1)
DTC_model.fit(X_train_processed, y_train)
DTC_predictions = DTC_model.predict(X_test_processed)

### (2) Random Forest Classifier

In [22]:
# Inital model
model_RFC = RandomForestClassifier(random_state=0)
model_RFC.fit(X_train_processed, y_train)
predictions_RFC = model_RFC.predict(X_test_processed)
auc_score_RFC = roc_auc_score(y_test, predictions_RFC)
print("Score: {0:0.5f}".format(auc_score_RFC))

Score: 0.60578


In [24]:
# First model improvement
for n in range(1, 11, 1):
    model_RFC1 = RandomForestClassifier(n_estimators=n, random_state=0)
    model_RFC1.fit(X_train_processed, y_train)
    predictions_RFC1 = model_RFC1.predict(X_test_processed)
    auc_score_RFC1 = roc_auc_score(y_test, predictions_RFC1)
    print("n_esimators: {0}, Score: {1:0.5f}".format(n, auc_score_RFC1))

n_esimators: 1, Score: 0.61104
n_esimators: 2, Score: 0.65464
n_esimators: 3, Score: 0.62253
n_esimators: 4, Score: 0.63882
n_esimators: 5, Score: 0.61885
n_esimators: 6, Score: 0.62551
n_esimators: 7, Score: 0.60779
n_esimators: 8, Score: 0.61554
n_esimators: 9, Score: 0.60856
n_esimators: 10, Score: 0.61265


Optimal n_estimators = 2

In [32]:
# Final Model 
RFC_model = RandomForestClassifier(n_estimators=2, random_state=0)
RFC_model.fit(X_train_processed, y_train)
RFC_predictions = RFC_model.predict(X_test_processed)

### (3) Logistic Regression

In [33]:
# Inital Model 
model_LR = LogisticRegression(solver="liblinear", random_state=1)
model_LR.fit(X_train_processed, y_train)
predictions_LR = model_LR.predict(X_test_processed)
auc_score_LR = roc_auc_score(y_test, predictions_LR)
print("Score: {0:0.5f}".format(auc_score_LR))

Score: 0.50000


In [73]:
# Final model 
LR_model = LogisticRegression(solver="liblinear", random_state=1)
LR_model.fit(X_train_processed, y_train)
LR_predictions = LR_model.predict(X_test_processed)

### (4) Naive Bayes

In [74]:
# Gaussian Naive Bayes
model_GNB = GaussianNB()
model_GNB.fit(X_train_processed, y_train)
predictions_GNB = model_GNB.predict(X_test_processed)
auc_score_GNB = roc_auc_score(y_test, predictions_GNB)
print("Score: {0:0.5f}".format(auc_score_GNB))

Score: 0.53322


In [75]:
# Multinomial Naive Bayes 
model_MNB = MultinomialNB()
model_MNB.fit(X_train_processed, y_train)
predictions_MNB = model_MNB.predict(X_test_processed)
auc_score_MNB = roc_auc_score(y_test, predictions_MNB)
print("Score: {0:0.5f}".format(auc_score_MNB))

Score: 0.54897


In [76]:
# Complement Naive Bayes 
model_CoNB = ComplementNB()
model_CoNB.fit(X_train_processed, y_train)
predictions_CoNB = model_CoNB.predict(X_test_processed)
auc_score_CoNB = roc_auc_score(y_test, predictions_CoNB)
print("Score: {0:0.5f}".format(auc_score_CoNB))

Score: 0.54754


In [77]:
# Bernoulli Naive Bayes 
model_BNB = BernoulliNB()
model_BNB.fit(X_train_processed, y_train)
predictions_BNB = model_BNB.predict(X_test_processed)
auc_score_BNB = roc_auc_score(y_test, predictions_BNB)
print("Score: {0:0.5f}".format(auc_score_BNB))

Score: 0.50663


In [78]:
# Final model 
NB_model = MultinomialNB()
NB_model.fit(X_train_processed, y_train)
NB_predictions = NB_model.predict(X_test_processed)

### (5) Support Vector Classification

In [79]:
# Initial model 
model_SVC = SVC(random_state=1)
model_SVC.fit(X_train_processed, y_train)
predictions_SVC = model_SVC.predict(X_test_processed)
auc_score_SVC = roc_auc_score(y_test, predictions_SVC)
print("Score: {0:0.5f}".format(auc_score_SVC))

Score: 0.50000


In [80]:
# First model improvement
model_SVC1 = SVC(gamma="auto", random_state=1)
model_SVC1.fit(X_train_processed, y_train)
predictions_SVC1 = model_SVC1.predict(X_test_processed)
auc_score_SVC1 = roc_auc_score(y_test, predictions_SVC1)
print("Score: {0:0.5f}".format(auc_score_SVC1))

Score: 0.54100


In [81]:
# Final model 
SVC_model = SVC(gamma="auto", random_state=2)
SVC_model.fit(X_train_processed, y_train)
SVC_predictions = SVC_model.predict(X_test_processed)

## Model Evaluation

We will be using the accuracy score to calculate how accurate the classification of the predictions compare with the actual results. 

In [82]:
# Calculate score 
DTC_score = accuracy_score(y_test, DTC_predictions)
RFC_score = accuracy_score(y_test, RFC_predictions)
LR_score = accuracy_score(y_test, LR_predictions)
NB_score = accuracy_score(y_test, NB_predictions)
SVC_score = accuracy_score(y_test, SVC_predictions)

# Comparison of scores
print("Model: Decision Tree, Score: {0:0.5f}".format(DTC_score))
print("Model: Random Forest, Score: {0:0.5f}".format(RFC_score))
print("Model: Logistic Regression, Score: {0:0.5f}".format(LR_score))
print("Model: Naive Bayes, Score: {0:0.5f}".format(NB_score))
print("Model: Support Vector, Score: {0:0.5f}".format(SVC_score))

Model: Decision Tree, Score: 0.90992
Model: Random Forest, Score: 0.85190
Model: Logistic Regression, Score: 0.92351
Model: Naive Bayes, Score: 0.23210
Model: Support Vector, Score: 0.92960


The model with the most accurate predictions is the Support Vector Classifier, followed by the Logistic Regression model, and then the Decision Tree Classifier. <br> 
The highest accuracy score attained is 92.96% which is relatively good. 