In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the hackathon dataset from a CSV file resolved relative to the
# current working directory.
df = pd.read_csv('hackathon.csv')

In [None]:
# Display the loaded DataFrame for a first look at its rows and columns.
df

In [None]:
# Encode the 'Yes'/'No' flag columns as 1/0 so they can be averaged and used
# as a model label/feature.
# The dtype guard makes this cell safe to re-run: once a column is numeric it
# is left alone, whereas mapping it a second time would turn every value into
# NaN because 1 and 0 are not keys of the mapping.
yes_no_to_int = {'Yes': 1, 'No': 0}
for flag_column in ['Revenue', 'Weekend']:
    if not pd.api.types.is_numeric_dtype(df[flag_column]):
        df[flag_column] = df[flag_column].map(yes_no_to_int)

# Checking missing values

In [None]:
# Fraction of missing entries in each column (0.0 means the column is complete).
df.isna().mean()

# Exploring categorical variables

In [None]:
# Names of the columns whose dtype is `object`, in their original order.
object_columns = list(
    filter(lambda name: df[name].dtype == np.dtype('object'), df.columns)
)
object_columns

In [None]:
# Show the frequency table of every object-typed column, one after another.
for name in object_columns:
    counts = df[name].value_counts()
    display(counts)

<font color='red'><b>Question:</b></font> In which month is "Revenue" most likely and least likely to be positive, respectively?

In [None]:
# Mean of the 0/1 'Revenue' column per month, i.e. the share of positive rows.
df.groupby('Month')[['Revenue']].mean()

<font color='red'><b>Question:</b></font> For which visitor type is "Revenue" most likely and least likely to be positive, respectively?

In [None]:
# Mean of the 0/1 'Revenue' column per visitor type, i.e. the share of positive rows.
df.groupby('VisitorType')[['Revenue']].mean()

# Exploring numeric variables

In [None]:
# Numeric candidate features for the plots below: every numeric column except
# the label. The label is excluded by name rather than by position, so the
# result does not depend on 'Revenue' being the last numeric column (which
# stops being true if this cell is re-run after more numeric columns have been
# appended to df). Selecting by dtype also avoids computing summary statistics
# only to read the column names.
numeric_columns = df.select_dtypes(include='number').columns.drop('Revenue')
numeric_columns

<font color='red'><b>Question:</b></font> Is there any variable that shows good linear separability to differentiate "Revenue"? If so, which?

In [None]:
# Side-by-side box plots of one numeric variable, split by the value of the label.

column = numeric_columns[7]
labels = [0, 1]
y = [df.loc[df['Revenue'] == label, column] for label in labels]

fig, ax = plt.subplots()
ax.boxplot(y, labels=labels)
ax.set_title(column)
plt.show()

In [None]:
# Scatter plot of the same numeric variable (x-axis) against the 0/1 label
# (y-axis), as an alternative view to the box plot.

column = numeric_columns[7]

plt.scatter(df[column], df['Revenue'])
plt.title(column)
plt.show()

# One-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Categorical columns that will be one-hot encoded, and the sub-frame holding them.
raw_column_names = ['Month', 'VisitorType']
raw_columns = df.loc[:, raw_column_names]

In [None]:
# Fit the encoder on the categorical columns; drop='first' omits the first
# category of each column from the generated indicators.
enc = OneHotEncoder(drop='first').fit(raw_columns)

# Generated indicator names and the dense indicator matrix.
encoded_column_names = enc.get_feature_names_out()
encoded_columns = enc.transform(raw_columns).toarray()

In [None]:
# Attach the one-hot indicator columns to df under the encoder's generated
# feature names. The raw categorical columns are not dropped here.
df[encoded_column_names] = encoded_columns

# Prepare features and label
Use the first 10000 rows as the training set and the remaining rows as the test set (the split is read from the `Set` column, whose values are `train` and `test`).

In [None]:
# Split the rows according to the value recorded in the 'Set' column.
is_train = df['Set'].eq('train')
is_test = df['Set'].eq('test')

train_df = df.loc[is_train]
test_df = df.loc[is_test]

In [None]:
# Every column is a feature except the label, the split marker and the raw
# categorical columns (their one-hot indicators are used instead).
label = 'Revenue'
excluded_features = [label, 'Set', *raw_column_names]
features = df.columns[~df.columns.isin(excluded_features)].tolist()

In [None]:
# Feature matrix and label vector for each split.
train_x, train_y = train_df[features], train_df[label]
test_x, test_y = test_df[features], test_df[label]

In [None]:
# Display the training feature matrix to check which columns were selected.
train_x

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

In [None]:
# Build a shallow tree (depth 2) so the fitted rules stay readable in the plot.
# random_state is fixed because scikit-learn permutes the features at every
# split even with the default 'best' splitter, so ties between equally good
# splits could otherwise be broken differently from run to run.
model = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_split=100, random_state=0)

# Fit model on training data
model.fit(train_x, train_y)

# Visualize the decision tree, labelling split nodes with the feature names
feature_names = train_x.columns.tolist()
plt.figure(figsize=(8, 5))
plot_tree(model, filled=True, feature_names=feature_names)
plt.show()

In [None]:
from sklearn.tree import export_text

In [None]:
# Print the fitted tree as indented text rules; show_weights=True also prints
# the per-class weights at each leaf.
print(export_text(model, feature_names=feature_names, show_weights=True))

In [None]:
# Predicted labels from the fitted model for the training and test features.
train_yhat, test_yhat = (model.predict(split_x) for split_x in (train_x, test_x))

In [None]:
# F1 score of the predictions on the training split.
metrics.f1_score(y_true=train_y, y_pred=train_yhat)

In [None]:
# F1 score of the predictions on the test split.
metrics.f1_score(y_true=test_y, y_pred=test_yhat)

In [None]:
# Sweep the maximum tree depth from 1 to 15 and record train/test F1 for each.
# - Rows are collected in dt_f1_rows so that dt_f1_df is only ever a DataFrame
#   (it is no longer a list first and a DataFrame afterwards).
# - Each tree is bound to depth_model and its predictions are scored inline,
#   so the sweep does not overwrite the notebook-level `model`, `train_yhat`
#   and `test_yhat` variables.
# - random_state is fixed because scikit-learn permutes the features at every
#   split, so equally good splits could otherwise be chosen differently
#   between runs.
dt_f1_rows = []

for max_depth in range(1, 16):
    depth_model = DecisionTreeClassifier(
        criterion='gini', max_depth=max_depth, min_samples_split=200, random_state=0
    )
    depth_model.fit(train_x, train_y)

    dt_f1_rows.append({
        'max_depth': max_depth,
        'train_f1': metrics.f1_score(train_y, depth_model.predict(train_x)),
        'test_f1': metrics.f1_score(test_y, depth_model.predict(test_x)),
    })

dt_f1_df = pd.DataFrame(dt_f1_rows, columns=['max_depth', 'train_f1', 'test_f1'])
dt_f1_df

In [None]:
# Train vs. test F1 as a function of tree depth. The curves are labelled and
# the axes named so the figure can be read on its own; plt.show() also keeps
# the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(dt_f1_df['max_depth'], dt_f1_df['train_f1'], label='train')
ax.plot(dt_f1_df['max_depth'], dt_f1_df['test_f1'], label='test')
ax.set_xlabel('max_depth')
ax.set_ylabel('F1 score')
ax.set_title('Decision tree F1 vs. max_depth')
ax.legend()
plt.show()

# k-nearest neighbors algorithm

In [None]:
# Scatter a small set of hand-written 2-D points.
points = [
    (1, 3), (2, 4), (3, 5), (4, 6), (5, 5), (6, 4),
    (7, 3), (6, 2), (5, 1), (4, 0), (3, 1), (2, 2),
]
x = [px for px, _ in points]
y = [py for _, py in points]
plt.scatter(x, y)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Display the training features again before they are scaled.
train_x

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both splits.
normalizer = MinMaxScaler().fit(train_x)

train_x_norm, test_x_norm = (
    normalizer.transform(split_x) for split_x in (train_x, test_x)
)

In [None]:
def _knn_f1_scores(k, fit_x, eval_x):
    """Fit a k-NN classifier and return its (train F1, test F1).

    k      -- number of neighbors.
    fit_x  -- training features; the classifier is fitted on these with
              train_y and also scored on them against train_y.
    eval_x -- test features, scored against test_y.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(fit_x, train_y)
    train_score = metrics.f1_score(train_y, knn.predict(fit_x))
    test_score = metrics.f1_score(test_y, knn.predict(eval_x))
    return train_score, test_score


# Compare k-NN on the raw features (model1) with k-NN on the min-max scaled
# features (model2) for several k. The shared helper replaces two copies of
# the fit/predict/score code, the rows are gathered in knn_f1_rows so that
# knn_f1_df is only ever a DataFrame, and the loop no longer overwrites the
# notebook-level `train_yhat` / `test_yhat` variables.
knn_f1_rows = []

for k in [5, 7, 9, 15, 25]:
    model1_train_f1, model1_test_f1 = _knn_f1_scores(k, train_x.values, test_x.values)
    model2_train_f1, model2_test_f1 = _knn_f1_scores(k, train_x_norm, test_x_norm)

    knn_f1_rows.append([k, model1_train_f1, model1_test_f1, model2_train_f1, model2_test_f1])

knn_f1_df = pd.DataFrame(
    knn_f1_rows,
    columns=['k', 'model1_train_f1_score', 'model1_test_f1_score', 'model2_train_f1_score', 'model2_test_f1_score'],
)
knn_f1_df

# Naive Bayes

In [None]:
# Candidate feature sets: six leave-one-out subsets of the base features
# (dropping each feature in turn, in order), followed by the full set.
nb_base_features = ['Administrative', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Region', 'TrafficType']

feature_sets = [
    [name for name in nb_base_features if name != left_out]
    for left_out in nb_base_features
]
feature_sets.append(list(nb_base_features))

In [None]:
from sklearn.naive_bayes import CategoricalNB

In [None]:
# Fit one CategoricalNB per candidate feature set and record train/test F1.
# Feature sets are numbered from 1 in the order they appear in feature_sets.
# Scores are gathered in nb_f1_rows so that nb_f1_df is only ever a DataFrame,
# and the classifier is bound to nb_model with predictions scored inline, so
# the sweep does not overwrite the notebook-level `model`, `train_yhat` and
# `test_yhat` variables.
# NOTE(review): CategoricalNB treats each feature as categorical codes -
# confirm that all of these columns hold non-negative integer codes and that
# the test split contains no category value unseen during fitting.
nb_f1_rows = []

for set_number, feature_set in enumerate(feature_sets, start=1):
    train_x_nb = train_x[feature_set]
    test_x_nb = test_x[feature_set]

    nb_model = CategoricalNB()
    nb_model.fit(train_x_nb, train_y)

    nb_f1_rows.append({
        'Feature Set': set_number,
        'train_f1': metrics.f1_score(train_y, nb_model.predict(train_x_nb)),
        'test_f1': metrics.f1_score(test_y, nb_model.predict(test_x_nb)),
    })

nb_f1_df = pd.DataFrame(nb_f1_rows, columns=['Feature Set', 'train_f1', 'test_f1'])
nb_f1_df

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on the full feature set, with the solver's iteration
# cap set to 1000.
model = LogisticRegression(max_iter=1000).fit(train_x, train_y)

train_yhat, test_yhat = model.predict(train_x), model.predict(test_x)

train_f1 = metrics.f1_score(y_true=train_y, y_pred=train_yhat)
test_f1 = metrics.f1_score(y_true=test_y, y_pred=test_yhat)

# Show both scores as a (train, test) pair.
(train_f1, test_f1)

# Best Model

In [None]:
# Refit the depth-1 decision tree and report its train/test F1.
# random_state is fixed because scikit-learn permutes the features at every
# split, so a tie between equally good splits could otherwise be resolved
# differently between runs and change the reported scores.
# NOTE(review): presumably this configuration was picked from the depth sweep;
# if it was chosen by test F1, the test score is an optimistic estimate -
# confirm how the selection was made.
model = DecisionTreeClassifier(criterion='gini', max_depth=1, min_samples_split=200, random_state=0)
model.fit(train_x, train_y)

train_yhat = model.predict(train_x)
test_yhat = model.predict(test_x)

train_f1 = metrics.f1_score(train_y, train_yhat)
test_f1 = metrics.f1_score(test_y, test_yhat)

# Show both scores as a (train, test) pair.
train_f1, test_f1