In [1]:
%matplotlib qt 
import datetime
import requests
import json
import pickle

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt

from decision_tree import DecisionTree
from judge import Judge

In [2]:
def custom_scatter(x, y):
    """Show y against x as faint black dots on a 15x15 figure.

    Points are drawn with the '.' marker, no connecting line, and 10%
    opacity (alpha=0.1). The figure is displayed immediately via plt.show().
    """
    point_style = {
        'color': 'black',
        'marker': '.',
        'linestyle': 'none',
        'alpha': 0.1,
    }

    plt.figure(figsize=(15, 15))
    plt.plot(x, y, **point_style)
    plt.show()

def create_features(datestrs):
    """Build weekend and season indicator features for a series of dates.

    Parameters
    ----------
    datestrs : iterable of str
        Dates formatted as 'YYYY-MM-DD'. Lists and other re-iterable
        containers are used as the index as-is; one-shot iterators
        (e.g. generators) are materialized into a list first.

    Returns
    -------
    pandas.DataFrame
        One row per input date, indexed by the date strings, with the
        columns 'Saturday', 'Sunday', 'winter', 'spring', 'summer' and
        'autumn' (in that order). Each value is 1.0 or 0.0. For empty
        input the frame has no rows but still carries these columns.

    Raises
    ------
    ValueError
        If a date string does not match the '%Y-%m-%d' format
        (raised by datetime.strptime).
    """
    season_names = ('winter', 'spring', 'summer', 'autumn')
    columns = ['Saturday', 'Sunday'] + list(season_names)

    # An iterator would be exhausted by the loop below, leaving nothing to
    # build the index from, so turn it into a list up front.
    if iter(datestrs) is datestrs:
        datestrs = list(datestrs)

    feature_data = []
    for datestr in datestrs:
        current_date = datetime.datetime.strptime(datestr, '%Y-%m-%d').date()

        # date.weekday(): Monday = 0 ... Saturday = 5, Sunday = 6
        current_weekday = current_date.weekday()

        # Season 0 = winter (Dec-Feb), 1 = spring (Mar-May),
        # 2 = summer (Jun-Aug), 3 = autumn (Sep-Nov).
        # `% 12` maps December to 0 so it lands in the winter bucket.
        current_season = (current_date.month % 12) // 3

        feature_set = {
            'Saturday': float(current_weekday == 5),
            'Sunday': float(current_weekday == 6),
        }
        for season_index, season_name in enumerate(season_names):
            feature_set[season_name] = float(season_index == current_season)

        feature_data.append(feature_set)

    # Passing `columns` pins the column order and keeps the schema stable
    # even when there are no rows.
    features = pd.DataFrame(data=feature_data, index=datestrs, columns=columns)
    return features

In [3]:
# Read the previously saved trips object from a local pickle file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# load pickle files that come from a trusted source.
with open('trips.pickle', 'rb') as f:
    trips = pickle.load(f)

In [4]:
# Put the unpickled trips into a DataFrame and preview the first rows;
# the preview below shows `dep` and `arr` timestamp columns.
trips_df = pd.DataFrame(trips)
trips_df.head()

Unnamed: 0,dep,arr
0,2015-05-01 06:39:48,2015-05-01 07:03:15
1,2015-05-01 06:43:36,2015-05-01 07:08:41
2,2015-05-01 06:51:15,2015-05-01 07:15:16
3,2015-05-01 06:59:46,2015-05-01 07:25:21
4,2015-05-01 07:08:17,2015-05-01 07:32:14


In [5]:
# Load the arrival-times table. In the preview below every column is a date
# ('YYYY-MM-DD') and the index is a run of integers starting at -60.
# NOTE(review): presumably the index and values are minute offsets relative
# to a target time — confirm against the code that produced this file.
# pd.read_pickle unpickles the file, so only use it on trusted files.
arrival_times_df = pd.read_pickle('arrival_times.pickle')
arrival_times_df.head()

Unnamed: 0,2015-05-01,2015-05-02,2015-05-03,2015-05-04,2015-05-05,2015-05-06,2015-05-07,2015-05-08,2015-05-09,2015-05-10,...,2018-04-18,2018-04-19,2018-04-20,2018-04-23,2018-04-24,2018-04-25,2018-04-26,2018-04-27,2018-04-30,2018-05-01
-60,-22.0,-26.0,-18.0,-24.0,-23.0,-22.0,-23.0,-21.0,-26.0,-18.0,...,-13.0,-20.0,-21.0,-8.0,-15.0,-17.0,-23.0,-22.0,0.0,-21.0
-59,-22.0,-26.0,-18.0,-24.0,-23.0,-22.0,-23.0,-21.0,-26.0,-18.0,...,-13.0,-20.0,-21.0,-8.0,-15.0,-17.0,-23.0,-22.0,0.0,-21.0
-58,-22.0,-26.0,-18.0,-15.0,-23.0,-22.0,-23.0,-21.0,-26.0,-18.0,...,-13.0,-20.0,-13.0,-8.0,-15.0,-17.0,-23.0,-22.0,0.0,-11.0
-57,-22.0,-26.0,-18.0,-15.0,-23.0,-22.0,-9.0,-21.0,-26.0,-18.0,...,-13.0,-20.0,-13.0,-8.0,-15.0,-17.0,-15.0,-22.0,0.0,-11.0
-56,-22.0,-26.0,-18.0,-15.0,-17.0,-22.0,-9.0,-21.0,-26.0,-18.0,...,-13.0,-10.0,-13.0,-8.0,-15.0,-7.0,-15.0,-11.0,0.0,-11.0


In [6]:
# Build one row of weekend/season features per date; the dates are taken
# from the column labels of arrival_times_df.
features_df = create_features(list(arrival_times_df.columns))
features_df.head()

# Advice from Brandon on features:
# Use as few features as you can get away with. More features do give the model more sources of
# variability to draw on, and any of them might influence what we are trying to predict, so our
# predictions may improve. But there is a cost to this: the more features we have, the more data we need
# to avoid overfitting.

# If a model with 100 features performs only slightly better than a model with 10 features, that's no good,
# because the cost of training and maintaining the model also increases by a lot.

# More features also often mean that we need more sources for data.

Unnamed: 0,Saturday,Sunday,winter,spring,summer,autumn
2015-05-01,0.0,0.0,0.0,1.0,0.0,0.0
2015-05-02,1.0,0.0,0.0,1.0,0.0,0.0
2015-05-03,0.0,1.0,0.0,1.0,0.0,0.0
2015-05-04,0.0,0.0,0.0,1.0,0.0,0.0
2015-05-05,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# Make last adjustment to the arrival_times_df: replace missing values with 30
# (per the original note, 30 = 9.30 am, or 30 mins late).
# The result is reassigned rather than filled with inplace=True, so the step
# reads as an explicit transformation and does not mutate the loaded object.
LATE_FILL_VALUE = 30
arrival_times_df = arrival_times_df.fillna(value=LATE_FILL_VALUE)

In [8]:
# Construct the Judge from the arrival-times table (missing values were
# filled in the previous cell). Its find_total_absolute_deviation method is
# passed to the decision tree as the error function in the next cell.
judge = Judge(arrival_times_df)

In [9]:
# Create a decision tree that uses the judge's find_total_absolute_deviation
# as its error function, then train it on the calendar features.
# NOTE(review): train() receives only the features; presumably the arrival
# times are reached through err_fn — confirm in decision_tree.py.
tree = DecisionTree(err_fn=judge.find_total_absolute_deviation)
tree.train(features_df)

In [10]:
# Render the trained tree (render() is defined in decision_tree.py, not shown here).
# NOTE(review): the saved output below echoes a source line from the tree
# implementation, which looks like a warning emitted during rendering — TODO investigate.
tree.render()

  feature_name = self.feature_names[node.split_feature]
