In [1]:
# import statements
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import value_counts
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# read the raw coupon data into a data frame
df = pd.read_csv("coupon.csv")

print(df.shape)

# list every column with its non-null count and dtype
# (summary statistics such as mean/std/percentiles would come from df.describe() instead)
df.info()

(12684, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12

In [3]:
# drop car as it is mostly blank (only 108 of the 12684 rows have a value, 12576 are missing)
df = df.drop(columns=['car'])

# drop direction_opp as it is the inverse of direction_same, so it is redundant
df = df.drop(columns=['direction_opp'])

# print number of duplicates
print("num duplicates", df.duplicated().sum())

# number of missing values in each remaining column
df.isnull().sum()

num duplicates 74


destination               0
passanger                 0
weather                   0
temperature               0
time                      0
coupon                    0
expiration                0
gender                    0
age                       0
maritalStatus             0
has_children              0
education                 0
occupation                0
income                    0
Bar                     107
CoffeeHouse             217
CarryAway               151
RestaurantLessThan20    130
Restaurant20To50        189
toCoupon_GEQ5min          0
toCoupon_GEQ15min         0
toCoupon_GEQ25min         0
direction_same            0
Y                         0
dtype: int64

# Blanks and Duplicates
|column|number of blanks|most common class|
|------|----------------|-----------------|
|bar|107|never (5k)|
|CoffeeHouse|217|less1 (3.3k), 1~3 (3.2k)|
|CarryAway|151|1~3 (4.6k), 4~8 (4.3k)|
|RestaurantLessThan20|130|1~3(5.3k), 4~8 (3.5k)|
|Restaurant20To50|189|1~3 (5.3k), 4~8 (3.5k)|

### Other observations
- There are 74 duplicate rows
- There are 42 rows in which all 5 of these columns are blank

In [4]:
# columns that still contain missing values
blank_columns = ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']

# remove exact duplicate rows first ...
df = df.drop_duplicates()

# ... then remove the rows in which every one of those columns is missing at once
df = df.dropna(subset=blank_columns, how='all')

# missing values that are left in each column
df.isnull().sum()

destination               0
passanger                 0
weather                   0
temperature               0
time                      0
coupon                    0
expiration                0
gender                    0
age                       0
maritalStatus             0
has_children              0
education                 0
occupation                0
income                    0
Bar                      65
CoffeeHouse             175
CarryAway               108
RestaurantLessThan20     87
Restaurant20To50        147
toCoupon_GEQ5min          0
toCoupon_GEQ15min         0
toCoupon_GEQ25min         0
direction_same            0
Y                         0
dtype: int64

In [5]:
# null impute: replace each missing value with a class drawn at random, weighted by how often
# that class occurs among the observed values of the same column
rng = np.random.default_rng(42)  # fixed seed so the imputed values are identical on every run
for column in blank_columns:
    weights = df[column].value_counts(normalize=True)
    missing = df[column].isna()
    if not missing.any():
        continue
    # Draw exactly one value per missing row and write it through the boolean row mask.
    # Filling from a Series with a fresh 0..n-1 index would align on index labels, and df's
    # labels have gaps after rows were dropped, so some missing rows could stay unfilled.
    # Assigning via df.loc also avoids an in-place fillna on a column selection.
    df.loc[missing, column] = rng.choice(
        weights.index.to_numpy(), size=int(missing.sum()), p=weights.to_numpy()
    )

df.isnull().sum()

destination             0
passanger               0
weather                 0
temperature             0
time                    0
coupon                  0
expiration              0
gender                  0
age                     0
maritalStatus           0
has_children            0
education               0
occupation              0
income                  0
Bar                     0
CoffeeHouse             0
CarryAway               0
RestaurantLessThan20    0
Restaurant20To50        0
toCoupon_GEQ5min        0
toCoupon_GEQ15min       0
toCoupon_GEQ25min       0
direction_same          0
Y                       0
dtype: int64

# Encoding
| Column Name | Data Type |
|-------------|-----------|
| destination | nominal   |
| passanger   | nominal   |
| weather     | nominal   |
| temperature | ordinal   |
| time        | ordinal   |
| coupon      | nominal   |
| expiration  | ordinal   |
| gender      | nominal   |
| age         | ordinal   |
| maritalStatus | nominal |
| has_children | encoded  |
| education   | ordinal   |
| occupation  | nominal   |
| income      | ordinal   |
| Bar         | ordinal   |
| CoffeeHouse | ordinal   |
| CarryAway   | ordinal   |
| RestaurantLessThan20 | ordinal |
| Restaurant20To50 | ordinal |
| toCoupon_GEQ5min | encoded |
| toCoupon_GEQ15min | encoded |
| toCoupon_GEQ25min | encoded |
| direction_same | encoded  |

In [6]:
# Ordinal Encoding

# visit-frequency classes from lowest to highest, shared by the five "how often" columns
amount_visited_order = ['never','less1','1~3','4~8','gt8']

# ordinal columns with their categories in order
ordinal_columns = [('temperature', [30, 55, 80]), # numeric labels, because the column is read as int64
                   ('time', ['7AM', '10AM', '2PM', '6PM', '10PM']), 
                   ('expiration', ['2h', '1d']),
                   ('gender', ['Male', 'Female']), # only 2 values, so a single 0/1 column is enough
                   ('age', ['below21','21','26','31','36','41','46','50plus']), 
                   ('education', ['Some High School', 'High School Graduate', 'Some college - no degree', 'Associates degree', 'Bachelors degree', 'Graduate degree (Masters or Doctorate)']),
                   ('income', ['Less than $12500','$12500 - $24999','$25000 - $37499','$37500 - $49999','$50000 - $62499','$62500 - $74999','$75000 - $87499','$87500 - $99999','$100000 or More']),
                   ('Bar', amount_visited_order),
                   ('CoffeeHouse', amount_visited_order),
                   ('CarryAway', amount_visited_order),
                   ('RestaurantLessThan20', amount_visited_order),
                   ('Restaurant20To50', amount_visited_order)]

# apply the ordinal encoding, one column at a time so each column gets its own category order
for column, categories in ordinal_columns:
    encoder = OrdinalEncoder(categories=[categories])
    # fit_transform returns an (n_rows, 1) array; flatten it before storing it as a column
    df[column] = encoder.fit_transform(df[[column]]).ravel()

# to check if it worked
# df.to_csv('coupon_processed.csv')

In [7]:
# class frequencies of occupation, to see how many classes this nominal column has
df['occupation'].value_counts()

Unemployed                                   1849
Student                                      1575
Computer & Mathematical                      1372
Sales & Related                              1088
Education&Training&Library                    939
Management                                    815
Office & Administrative Support               638
Arts Design Entertainment Sports & Media      627
Business & Financial                          537
Retired                                       493
Food Preparation & Serving Related            298
Healthcare Practitioners & Technical          244
Healthcare Support                            242
Community & Social Services                   239
Legal                                         219
Transportation & Material Moving              218
Architecture & Engineering                    175
Personal Care & Service                       175
Protective Service                            174
Life Physical Social Science                  169


In [8]:
# Nominal Encoding

# columns whose classes have no natural order: each class becomes its own indicator column
nomimal_columns = ['destination', 'passanger', 'weather', 'coupon', 'maritalStatus', 'occupation']

# one call expands all of them; the indicator columns are appended in the order listed above
df = pd.get_dummies(df, columns=nomimal_columns) # type: ignore

# to check if it worked
# df.to_csv('coupon_processed_2.csv', index=False)


In [9]:
# list the columns, non-null counts and dtypes of the encoded data frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12568 entries, 0 to 12683
Data columns (total 63 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   temperature                                           12568 non-null  float64
 1   time                                                  12568 non-null  float64
 2   expiration                                            12568 non-null  float64
 3   gender                                                12568 non-null  float64
 4   age                                                   12568 non-null  float64
 5   has_children                                          12568 non-null  int64  
 6   education                                             12568 non-null  float64
 7   income                                                12568 non-null  float64
 8   Bar                                                   12

## Problems:
- occupation with OHE causes too many columns

# Classification

In [10]:
# the classifiers below work on arrays: pull the target column out as y first
y = df.loc[:, "Y"].to_numpy()

# then remove the target from the frame so that only the features are left
df = df.drop("Y", axis=1)

# the remaining columns form the feature matrix x
x = df.to_numpy()

## Decision Tree Classifier

In [11]:
# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# decision tree with default hyperparameters, seeded so that a re-run builds the same tree
dtc = DecisionTreeClassifier(random_state=42)

# train decision tree classifier
dtc.fit(x_train, y_train)

# predict the response for test dataset
y_predicted = dtc.predict(x_test)

# check accuracy on both splits, to compare how well the tree fits seen and unseen rows
print(f"training accuracy {dtc.score(x_train, y_train)}")
print(f"testing accuracy {dtc.score(x_test, y_test)}")

# size of the fitted tree
print(f"node count {dtc.tree_.node_count}")
print(f"depth {dtc.get_depth()}")
print(f"number of leaves {dtc.get_n_leaves()}")

training accuracy 0.9988064451959419
testing accuracy 0.6813842482100239
node count 5275
depth 26
number of leaves 2638


In [19]:
#tune

def _parameter_grid(start, stop, step):
    """Return the values start, start + step, ... that lie below stop, as a list."""
    if all(isinstance(bound, int) for bound in (start, stop, step)):
        return list(range(start, stop, step))
    # range() rejects floats; np.arange offers the same half-open stepping for them
    return np.arange(start, stop, step).tolist()


def tune(start, stop, step, dtc, x, y, n_runs=50, test_size=0.2):
    """Sweep one hyperparameter and record the average tree size and accuracy per value.

    For every value in the half-open grid ``start, start + step, ... < stop`` a
    new model is built with ``dtc(value)`` and fitted ``n_runs`` times, each
    time on a fresh random train/test split of ``x`` and ``y``.

    Parameters
    ----------
    start, stop, step : int or float
        Bounds and increment of the grid; ``stop`` is excluded. Floats are
        allowed, so fractional hyperparameters can be swept as well.
    dtc : callable
        Factory that takes one grid value and returns an unfitted model. The
        model must provide ``fit``, ``score`` and, once fitted,
        ``tree_.node_count``.
    x, y : array-like
        Feature matrix and target passed to ``train_test_split``.
    n_runs : int, default 50
        Number of random splits averaged for every grid value.
    test_size : float, default 0.2
        Share of the rows held out for testing in every split.

    Returns
    -------
    list of [node_count, training_accuracy, testing_accuracy]
        One row of averages per grid value, in grid order.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    values = _parameter_grid(start, stop, step)
    data = []
    next_report = 10
    start_time = time.time()
    for done, value in enumerate(values, start=1):
        node_count = []
        testing_accuracy = []
        training_accuracy = []
        for _ in range(n_runs):
            # a new model and a new random split for every repetition
            dtc_tune = dtc(value)
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
            dtc_tune.fit(x_train, y_train)

            node_count.append(dtc_tune.tree_.node_count)
            training_accuracy.append(dtc_tune.score(x_train, y_train))
            testing_accuracy.append(dtc_tune.score(x_test, y_test))

        # average over the repetitions
        data.append([
            sum(node_count) / n_runs,
            sum(training_accuracy) / n_runs,
            sum(testing_accuracy) / n_runs,
        ])

        # report progress whenever another 10% step has been completed; short grids may
        # pass several steps at once, in which case only the highest one is printed
        percent_done = done * 100 // len(values)
        if percent_done >= next_report:
            reached = percent_done // 10 * 10
            print(f"{reached}% done at {(time.time()-start_time)/60} minutes")
            next_report = reached + 10

    return data


def plot(data, title):
    """Draw training and testing accuracy against the average node count.

    ``data`` is a sequence of ``[node_count, training_accuracy,
    testing_accuracy]`` rows; ``title`` is used as the figure title.
    """
    curves = pd.DataFrame(data, columns=["node_count", "training_accuracy", "testing_accuracy"])

    # one line per accuracy column, sharing the node-count axis
    ax = curves.plot(x="node_count", y=["training_accuracy", "testing_accuracy"])
    ax.set_xlabel("node count")
    ax.set_ylabel("accuracy")
    ax.set_title(title)
    plt.show()

In [21]:
# min_samples_split: sweep the values 2, 7, 12, ... below 2000
min_sample_split = tune(
    start=2,
    stop=2000,
    step=5,
    dtc=lambda n_samples: DecisionTreeClassifier(min_samples_split=n_samples),
    x=x,
    y=y,
)
plot(min_sample_split, "min sample split")


10% done at 3.299119504292806 minutes
20% done at 5.610114181041718 minutes
30% done at 8.827525556087494 minutes
40% done at 10.698523497581482 minutes
50% done at 12.435900684197744 minutes
60% done at 14.392653294404347 minutes


In [None]:
# criterion{“gini”, “entropy”, “log_loss”}

In [None]:
# splitter{“best”, “random”}

In [None]:
# max_depth: sweep depth limits 1 to 39
# with default params: 41
max_depth = tune(
    start=1,
    stop=40,
    step=1,
    dtc=lambda depth: DecisionTreeClassifier(max_depth=depth),
    x=x,
    y=y,
)
plot(max_depth, "max depth")

In [None]:
# min_samples_leaf
# The minimum number of samples required to be at a leaf node
# (the sweep tries 2, 7, 12, ... below 1000)
min_samples_leaf = tune(2, 1000, 5, lambda i: DecisionTreeClassifier(min_samples_leaf=i), x, y)
plot(min_samples_leaf, "min samples leaf")

In [None]:
# min_weight_fraction_leaf
# The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. 
# Samples have equal weight when sample_weight is not provided.
#
# scikit-learn only accepts fractions between 0.0 and 0.5 for this parameter, so the sweep
# stops at 0.5. It steps over the integers 0..10 and converts each to i / 20, which gives
# exactly 0.0, 0.05, ..., 0.5 and keeps the bounds passed to tune() integral.
min_weight_fraction_leaf = tune(0, 11, 1, lambda i: DecisionTreeClassifier(min_weight_fraction_leaf=i / 20), x, y)
plot(min_weight_fraction_leaf, "min_weight_fraction_leaf")

In [None]:
# max_features int, float or {“auto”, “sqrt”, “log2”},

In [None]:
# max_leaf_nodes
# with default params: 2716
#
# max_leaf_nodes has to be an integer of at least 2, so sweep leaf budgets from 2 up to
# roughly the leaf count of the unrestricted tree noted above, in steps of 50
max_leaf_nodes = tune(2, 2717, 50, lambda i: DecisionTreeClassifier(max_leaf_nodes=i), x, y)
plot(max_leaf_nodes, "max_leaf_nodes")

## KNN

In [None]:
# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# create KNN classifier object with default hyperparameters
knn = KNeighborsClassifier()

# train KNN classifier
knn.fit(x_train, y_train)

# predict the response for test dataset
y_predicted = knn.predict(x_test)

# check accuracy
print(f"training accuracy {knn.score(x_train, y_train)}")
print(f"testing accuracy {knn.score(x_test, y_test)}")

## Naive Bayes

In [None]:
# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# create Gaussian Naive Bayes classifier object
gnb = GaussianNB()

# train Gaussian Naive Bayes classifier
gnb.fit(x_train, y_train)

# predict the response for test dataset
y_predicted = gnb.predict(x_test)

# check accuracy
print(f"training accuracy {gnb.score(x_train, y_train)}")
print(f"testing accuracy {gnb.score(x_test, y_test)}")