In [None]:
# import statements
import pandas as pd
import numpy as np
from pandas import value_counts
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import os

In [None]:
# read the coupon data set into a data frame
df = pd.read_csv("coupon.csv")

# drop 'car' because it is almost entirely empty (the original note records
# 108 vs. 12576 rows), and 'direction_opp' because it is the inverse of
# 'direction_same' and therefore redundant
df = df.drop(columns=['car', 'direction_opp'])

# drop duplicate rows
df = df.drop_duplicates()

# list of columns with blank values
blank_columns = ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

# delete the rows in which every one of these columns is null
# (42 rows according to the original note)
df = df.dropna(subset=blank_columns, how='all')

# impute the remaining nulls by sampling from each column's observed class
# distribution, so the imputed values follow the same proportions as the known ones
for column in blank_columns:
    weights = df[column].value_counts(normalize=True)
    missing = df[column].isna()
    if missing.any():
        # Assign through a boolean mask: the frame's index has gaps after the
        # drops above, so filling from a freshly indexed Series would align on
        # labels and silently leave some rows unfilled.
        df.loc[missing, column] = np.random.choice(
            weights.index.to_numpy(),
            size=int(missing.sum()),
            p=weights.to_numpy(),
        )

# every blank column must be fully populated before encoding
assert not df[blank_columns].isnull().any().any(), "imputation left null values behind"

df.isnull().sum()
# Ordinal Encoding

# visit-frequency scale shared by the five "how often do you go to ..." columns
amount_visited_order = ['never','less1','1~3','4~8','gt8']

# (column name, categories from lowest to highest) pairs
ordinal_columns = [
    ('temperature', ['30','55','80']),
    ('time', ['7AM', '10AM', '2PM', '6PM', '10PM']),
    ('expiration', ['2h', '1d']),
    # gender only has two values, so an ordinal code is enough
    ('gender', ['Male', 'Female']),
    ('age', ['below21','21','26','31','36','41','46','50plus']),
    ('education', ['Some High School', 'High School Graduate', 'Some college - no degree', 'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)']),
    ('income', ['Less than $12500','$12500 - $24999','$25000 - $37499','$37500 - $49999','$50000 - $62499','$62500 - $74999','$75000 - $87499','$87500 - $99999','$100000 or More']),
    ('Bar', amount_visited_order),
    ('CoffeeHouse', amount_visited_order),
    ('CarryAway', amount_visited_order),
    ('RestaurantLessThan20', amount_visited_order),
    ('Restaurant20To50', amount_visited_order),
]

# replace every ordinal column with the position of its value in the ordered list
for name, ordered_values in ordinal_columns:
    encoder = OrdinalEncoder(categories=[ordered_values], dtype=np.uint8)
    df[name] = encoder.fit_transform(df[[name]])

# to check if it worked
# df.to_csv(os.path.join('PreprocessingData',"coupon_processed_2.csv"), index=False)
# Nominal Encoding

nomimal_columns = ['destination', 'passanger', 'weather', 'coupon', 'maritalStatus', 'occupation']

# one-hot encode all nominal columns in a single call; the untouched columns
# stay in front and the indicator columns follow in the order listed above
df = pd.get_dummies(df, columns=nomimal_columns) # type: ignore

# to check if it worked
# df.to_csv(os.path.join('PreprocessingData',"coupon_processed_2.csv"), index=False)

# the models below work on arrays: take the target column "Y" out as y ...
y = df.loc[:, "Y"].to_numpy()

# ... remove it from the frame ...
df = df.drop(labels=["Y"], axis="columns")

# ... and use every remaining column as the feature matrix x
x = df.to_numpy()

## Decision Tree Classifier

In [None]:
# tree implementation used for the baseline model (and by the tuning cells further down)
from sklearn.tree import DecisionTreeClassifier

# hold out 20% of the rows for testing; the other 80% are used for training
(x_train, x_test, y_train, y_test) = train_test_split(x, y, test_size=0.2)

# build the decision tree (ccp_alpha=0.001) and fit it on the training rows
classifier = DecisionTreeClassifier(ccp_alpha=0.001).fit(x_train, y_train)

# predictions for the held-out rows
y_predicted = classifier.predict(x_test)

# accuracy on both partitions
print(f"training accuracy {classifier.score(x_train, y_train)}")
print(f"testing accuracy {classifier.score(x_test, y_test)}")

# size of the fitted tree
print(f"node count {classifier.tree_.node_count}")
print(f"depth {classifier.get_depth()}")
print(f"number of leaves {classifier.get_n_leaves()}")

## Hyperparameter Tuning

In [None]:
#tune

def tune(start, stop, step, classifier, x, y, tuning_range=None, n_runs=50, test_size=0.2, random_state=None):
    """Sweep one hyper-parameter and average tree size and accuracy over repeated random splits.

    For every candidate value, `n_runs` models are built with `classifier(value)`,
    each one fitted and scored on a fresh random train/test split of `x` and `y`.

    Parameters
    ----------
    start, stop, step : numbers
        Bounds of the sweep, used only when `tuning_range` is None. A `range`
        is built when all three are integers, otherwise `np.arange` is used.
        `step == 0` additionally switches the progress messages off.
    classifier : callable
        Factory taking one candidate value and returning an unfitted model that
        provides `fit`, `score` and `tree_.node_count`.
    x, y : array-like
        Features and targets handed to `train_test_split`.
    tuning_range : iterable, optional
        Explicit candidate values (for example strings); overrides
        `start`/`stop`/`step` for the sweep itself.
    n_runs : int, default 50
        Number of random splits averaged per candidate value.
    test_size : float or int, default 0.2
        Forwarded to `train_test_split`.
    random_state : int, optional
        Seed for the sequence of splits. None keeps the previous behaviour
        (global NumPy random state). Randomness inside the models themselves is
        controlled by `classifier`, not by this argument.

    Returns
    -------
    list of [value, mean node count, mean training accuracy, mean testing accuracy]

    Raises
    ------
    ValueError
        If `n_runs` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    if tuning_range is None:
        # range() only accepts integers, so fall back to arange as soon as any bound is not one
        all_integers = all(isinstance(bound, (int, np.integer)) for bound in (start, stop, step))
        tuning_range = range(start, stop, step) if all_integers else np.arange(start, stop, step)
    values = list(tuning_range)
    total = len(values)

    # One shared generator yields a different but reproducible split on every run
    split_rng = None if random_state is None else np.random.RandomState(random_state)

    data = []
    current_percent = 10
    start_time = time.time()
    for index, i in enumerate(values):
        node_count = []
        testing_accuracy = []
        training_accuracy = []
        for _ in range(n_runs):
            # create a fresh model for this candidate value
            classifier_tune = classifier(i)

            # split data
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=test_size, random_state=split_rng
            )

            # train the model
            classifier_tune.fit(x_train, y_train)

            # record tree size plus training and testing accuracy
            node_count.append(classifier_tune.tree_.node_count)
            training_accuracy.append(classifier_tune.score(x_train, y_train))
            testing_accuracy.append(classifier_tune.score(x_test, y_test))

        # average over the runs
        node_count = sum(node_count) / len(node_count)
        training_accuracy = sum(training_accuracy) / len(training_accuracy)
        testing_accuracy = sum(testing_accuracy) / len(testing_accuracy)

        data.append([i, node_count, training_accuracy, testing_accuracy])

        # Progress is derived from the position in the sweep rather than from
        # arithmetic on the value, so non-numeric candidates cannot break it.
        if step != 0 and index > total * current_percent / 100:
            print(f"{current_percent}% done at {(time.time()-start_time)/60} minutes")
            current_percent += 10

    return data


def plot(data, title):
    """Draw training and testing accuracy as functions of the node count.

    `data` holds rows of (param, node_count, training_accuracy,
    testing_accuracy); the param column is not plotted. `title` becomes the
    title of the figure, which is shown immediately.
    """
    frame = pd.DataFrame(data, columns=["param", "node_count", "training_accuracy", "testing_accuracy"])

    # keep only the columns that end up on the chart
    curves = frame[["node_count", "training_accuracy", "testing_accuracy"]]

    # node count on the x axis, one line per accuracy column
    axes = curves.plot(x="node_count", y=["training_accuracy", "testing_accuracy"])
    axes.set_xlabel("node count")
    axes.set_ylabel("accuracy")
    axes.set_title(title)
    plt.show()

def save_to_csv(model, name, directory="TuningData"):
    """Store tuning results as `<directory>/<name>.csv` and return them as a data frame.

    Parameters
    ----------
    model : list of rows
        Rows of (parameter value, node_count, training_accuracy,
        testing_accuracy), as produced by the tuning sweep.
    name : str
        Name of the tuned parameter; used both as the header of the first
        column and as the file name (without extension).
    directory : str, default "TuningData"
        Folder the CSV is written to. It is created, including parents, when
        it does not exist yet.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to disk.
    """
    results = pd.DataFrame(model, columns=[name, "node_count", "training_accuracy", "testing_accuracy"])

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(directory, exist_ok=True)

    results.to_csv(os.path.join(directory, name + ".csv"), index=False)
    return results

In [None]:
# min_samples_split: sweep the values 2, 7, 12, ... below 2000, then plot and store the results
min_sample_split = tune(
    start=2,
    stop=2000,
    step=5,
    classifier=lambda value: DecisionTreeClassifier(min_samples_split=value),
    x=x,
    y=y,
)
plot(data=min_sample_split, title="min sample split")

min_sample_split_df = save_to_csv(model=min_sample_split, name="min_sample_split")


In [None]:
# criterion: one of "gini", "entropy", "log_loss"
# The values are categorical, so they are passed as an explicit list; the
# zeroed start/stop/step are placeholders and step=0 keeps progress output off.
criterion = tune(
    start=0,
    stop=0,
    step=0,
    classifier=lambda value: DecisionTreeClassifier(criterion=value),
    x=x,
    y=y,
    tuning_range=["gini", "entropy", "log_loss"],
)

criterion_df = save_to_csv(model=criterion, name="criterion")

criterion_df.head()

In [None]:
# splitter: one of "best", "random"
# The values are categorical, so they are passed as an explicit list; the
# zeroed start/stop/step are placeholders and step=0 keeps progress output off.
# Each option is listed exactly once so no value is evaluated (and saved) twice.
splitter = tune(0, 0, 0, lambda i: DecisionTreeClassifier(splitter=i), x, y, ["best", "random"])

splitter_df = save_to_csv(splitter, "splitter")

splitter_df.head()

In [None]:
# max_depth: sweep the odd depths 1, 3, ..., 59
# (reference value noted for default params: 41)
max_depth = tune(
    start=1,
    stop=60,
    step=2,
    classifier=lambda value: DecisionTreeClassifier(max_depth=value),
    x=x,
    y=y,
)
plot(data=max_depth, title="max depth")

max_depth_df = save_to_csv(model=max_depth, name="max_depth")

In [None]:
# min_samples_leaf
# The minimum number of samples required to be at a leaf node.
# Sweep the values 2, 12, 22, ... below 5000.
min_samples_leaf = tune(
    start=2,
    stop=5000,
    step=10,
    classifier=lambda value: DecisionTreeClassifier(min_samples_leaf=value),
    x=x,
    y=y,
)
plot(data=min_samples_leaf, title="min samples leaf")

min_samples_leaf_df = save_to_csv(model=min_samples_leaf, name="min_samples_leaf")


In [None]:
# min_weight_fraction_leaf
# Minimum weighted fraction of the total sample weight that must end up in a
# leaf node. All samples weigh the same when no sample_weight is provided.
# Sweep from 0.0 up to (but excluding) 0.1 in steps of 0.0001.
min_weight_fraction_leaf = tune(
    start=0.0,
    stop=0.1,
    step=0.0001,
    classifier=lambda value: DecisionTreeClassifier(min_weight_fraction_leaf=value),
    x=x,
    y=y,
)
plot(data=min_weight_fraction_leaf, title="min_weight_fraction_leaf")

min_weight_fraction_leaf_df = save_to_csv(model=min_weight_fraction_leaf, name="min_weight_fraction_leaf")

In [None]:
# max_features
# Given as an int, it is the number of features considered at each split.
# Try every count from 1 up to the number of columns in x.
max_features = tune(
    start=1,
    stop=x.shape[1]+1,
    step=1,
    classifier=lambda value: DecisionTreeClassifier(max_features=value),
    x=x,
    y=y,
)
plot(data=max_features, title="max_features")

max_features_df = save_to_csv(model=max_features, name="max_features")

In [None]:
# max_leaf_nodes: sweep the values 2, 12, 22, ... below 3000
# (reference value noted for default params: 2716)
max_leaf_nodes = tune(
    start=2,
    stop=3000,
    step=10,
    classifier=lambda value: DecisionTreeClassifier(max_leaf_nodes=value),
    x=x,
    y=y,
)
plot(data=max_leaf_nodes, title="max_leaf_nodes")

max_leaf_nodes_df = save_to_csv(model=max_leaf_nodes, name="max_leaf_nodes")

In [None]:
# min_impurity_decrease, evaluated with the entropy criterion:
# sweep from 0.0 up to (but excluding) 0.005 in steps of 0.00001
min_impurity_decrease = tune(
    start=0.0,
    stop=0.005,
    step=0.00001,
    classifier=lambda value: DecisionTreeClassifier(criterion='entropy', min_impurity_decrease=value),
    x=x,
    y=y,
)
plot(data=min_impurity_decrease, title="min_impurity_decrease")

min_impurity_decrease_df = save_to_csv(model=min_impurity_decrease, name="min_impurity_decrease (entropy)")

In [None]:
# ccp_alpha: sweep from 0.0 up to (but excluding) 0.01 in steps of 0.00001
ccp_alpha = tune(
    start=0.0,
    stop=0.01,
    step=0.00001,
    classifier=lambda value: DecisionTreeClassifier(ccp_alpha=value),
    x=x,
    y=y,
)
plot(data=ccp_alpha, title="ccp_alpha")

ccp_alpha_df = save_to_csv(model=ccp_alpha, name="ccp_alpha")

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

# NOTE: the former `max = 0` assignment was removed: the name was never read
# and it shadowed the builtin max() for every cell executed afterwards.

# Create a decision tree classifier object with every tuned hyper-parameter set explicitly
dtc_tuned = DecisionTreeClassifier(
                                criterion='entropy',
                                splitter='best', 
                                max_depth=60, 
                                min_samples_split=160, 
                                min_samples_leaf=80, 
                                min_weight_fraction_leaf=0.007,
                                max_features=60, 
                                max_leaf_nodes=190, 
                                min_impurity_decrease=0.00065,
                                ccp_alpha=0.00031
                        )

# evaluate the tuned tree with 10-fold cross-validation on the full data set
cv_results = cross_validate(dtc_tuned, x, y, cv=10)

# mean test score across the 10 folds
print(np.mean(cv_results['test_score']) )
