In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

## Setup Train/Test Data

In [2]:
# Read the training and testing CSV files into DataFrames (train first, then test).
df_train, df_test = map(pd.read_csv, ('data/trainClean.csv', 'data/testClean.csv'))

## Setup Features and Target

In [3]:
# Split the training frame: 'Transported' is the label, every other column is a feature.
y_train = df_train.loc[:, 'Transported']
X_train = df_train.drop('Transported', axis=1)

## Running Default DTC

In [19]:
# Baseline model: a decision tree with scikit-learn's default hyperparameters.
# random_state is pinned because the tree permutes the candidate features at
# every split, so ties between equally good splits would otherwise be broken
# differently on each run and the baseline predictions could not be reproduced.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)

# Predicted class labels for the test frame; the bare name displays them as the
# cell output.
pred = dtc.predict(df_test)
pred

array([0, 0, 1, ..., 1, 0, 0], dtype=int64)

In [20]:
# Lookup that turns the predicted 0/1 labels into booleans for the output file.
tfMap = {0: False, 1: True}

# Assemble the two-column output frame in one step: the ids come straight from
# the test frame, the labels from the current predictions.
kaggleCSV = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': [tfMap[label] for label in pred],
})

kaggleCSV.to_csv('results/DTC_results_before.csv', index=False)

## Setup Grid Search

In [35]:
# Fresh, unfitted decision tree with default settings; rebinding `dtc` discards
# the model fitted earlier. This instance is the base estimator handed to
# GridSearchCV in the next cell.
dtc = DecisionTreeClassifier()

In [38]:
# Parameter grid with hyperparameters to tune.
#
# Notes on what is (and is not) searched:
# - 'log_loss' is omitted from 'criterion': in scikit-learn it is the same
#   Shannon information gain as 'entropy', so listing both only repeats fits.
# - 'random_state' is not searched. A seed is not a hyperparameter; picking the
#   "best" seed selects on noise. It is fixed once on the estimator instead so
#   that every candidate is compared under the same randomness.
# - 'min_weight_fraction_leaf' includes the default 0.0. Without it every
#   candidate requires at least 10% of the samples in each leaf, which on any
#   training set larger than a few hundred rows overrides the
#   'min_samples_split' / 'min_samples_leaf' values below.
#   NOTE(review): dataset size is not visible in this cell - confirm.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [2, 6, 10, 14, 18],
    'min_samples_split': [2, 6, 10, 14, 18],
    'min_samples_leaf': [2, 6, 10, 14, 18],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.3, 0.5],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive 5-fold cross-validated search scored on accuracy.
# set_params returns the estimator itself, so `dtc` is seeded in place before
# GridSearchCV clones it for each candidate. n_jobs=-1 spreads the fits
# (4,000 candidates x 5 folds) over all available cores.
gs = GridSearchCV(
    estimator=dtc.set_params(random_state=0),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

In [39]:
# Tabulate every candidate evaluated by the grid search, ordered by its test-score
# rank, and save the whole table to disk.
gs_results_df = pd.DataFrame(gs.cv_results_).sort_values('rank_test_score')
gs_results_df.to_csv('DTC_Results/dtc_gs_results.csv')

## Running Best DTC

In [45]:
# Decision tree configured with the settings this notebook labels "best".
# NOTE(review): presumably the top-ranked row of the saved grid-search results;
# the values are hard-coded here, so verify them against that file.
dtc = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=14,
    min_samples_split=18,
    min_samples_leaf=18,
    min_weight_fraction_leaf=0.1,
    max_features="log2",
    random_state=6,
).fit(X_train, y_train)  # fit returns the estimator itself

# Predicted class labels for the test frame, displayed as the cell output.
pred = dtc.predict(df_test)
pred

array([1, 0, 1, ..., 1, 0, 1], dtype=int64)

In [10]:
# Lookup that turns the predicted 0/1 labels into booleans for the output file.
tfMap = {0: False, 1: True}

# Assemble the two-column output frame in one step: the ids come straight from
# the test frame, the labels from the current predictions.
kaggleCSV = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': [tfMap[label] for label in pred],
})

kaggleCSV.to_csv('results/DTC_results_after.csv', index=False)

## Running Worst DTC

In [4]:
# Decision tree configured with the settings this notebook labels "worst".
# NOTE(review): presumably the bottom-ranked row of the saved grid-search
# results; the values are hard-coded here, so verify them against that file.
dtc = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    max_depth=2,
    min_samples_split=6,
    min_samples_leaf=14,
    min_weight_fraction_leaf=0.5,
    max_features="sqrt",
    random_state=2,
).fit(X_train, y_train)  # fit returns the estimator itself

# Predicted class labels for the test frame, displayed as the cell output.
pred = dtc.predict(df_test)
pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [5]:
# Lookup that turns the predicted 0/1 labels into booleans for the output file.
tfMap = {0: False, 1: True}

# Assemble the two-column output frame in one step: the ids come straight from
# the test frame, the labels from the current predictions.
kaggleCSV = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Transported': [tfMap[label] for label in pred],
})

kaggleCSV.to_csv('results/DTC_results_after_worst.csv', index=False)