In [1]:
import numpy as np
import scipy.misc
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from tpot import TPOTClassifier

In [2]:
culled_data = pd.read_csv('culled_data.csv.gz', dtype={'line':'category', 
                                                       'vehicle':'category', 
                                                       'precipType':'category',
                                                       'late':'category',
                                                       'dayOfWeek':'category',
                                                       'hourOfDay':'category'})

In [3]:
culled_data['delay'].fillna(culled_data['delay'].mean(), inplace=True)

In [4]:
#culled_data.groupby('vehicle').delay.mean()

In [5]:
culled_data.head()

Unnamed: 0,line,delay,latitude,longitude,speed,time,vehicle,apparentTemperature,cloudCover,dewPoint,...,pressure,temperature,uvIndex,visibility,windBearing,windGust,windSpeed,late,dayOfWeek,hourOfDay
0,4,50.0,60.1688,24.80383,0.0,1507685150,2076,6.28,0.83,7.13,...,993.75,7.72,0.0,8.64,45.0,7.77,8.29,0,2,4
1,4,50.0,60.1688,24.80383,0.0,1507685150,2076,6.28,0.83,7.13,...,993.75,7.72,0.0,8.64,45.0,7.77,8.29,0,2,4
2,4,80.0,60.20982,25.07738,0.0,1507685180,4712,6.28,0.83,7.13,...,993.75,7.72,0.0,8.64,45.0,7.77,8.29,1,2,4
3,4,80.0,60.20982,25.07738,0.0,1507685180,4712,6.28,0.83,7.13,...,993.75,7.72,0.0,8.64,45.0,7.77,8.29,1,2,4
4,4,67.0,60.17083,24.81079,0.0,1507685227,2076,6.28,0.83,7.13,...,993.75,7.72,0.0,8.64,45.0,7.77,8.29,1,2,4


In [6]:
#unique_late_vals = culled_data.groupby('vehicle').late.apply(lambda x: x.nunique())
#outlier_vehicles = unique_late_vals[unique_late_vals < 2]

#culled_data = culled_data[~culled_data['vehicle'].isin(list(outlier_vehicles.index))]

In [7]:
#mean_delays = culled_data.groupby('vehicle').delay.mean()
#outlier_vehicles = mean_delays[mean_delays > 60]

#culled_data = culled_data[~culled_data['vehicle'].isin(list(outlier_vehicles.index))]

In [8]:
#mean_delays = culled_data.groupby('vehicle').delay.mean()
#outlier_vehicles = mean_delays[mean_delays < -60]

#culled_data = culled_data[~culled_data['vehicle'].isin(list(outlier_vehicles.index))]

# Split into training and test data

In [9]:
targets = culled_data['late']

# These give away the answer
data_vect = culled_data.drop(['late'], axis=1)
data_vect = data_vect.drop(['delay'], axis=1)

# Meaningless data that interferes with our models
#data_vect = data_vect.drop(['latitude'], axis=1)
#data_vect = data_vect.drop(['longitude'], axis=1)
#data_vect = data_vect.drop(['time'], axis=1)

#data_vect = data_vect.drop(['vehicle'], axis=1)

data_vect.head()

Unnamed: 0,line,latitude,longitude,speed,time,vehicle,apparentTemperature,cloudCover,dewPoint,humidity,...,precipType,pressure,temperature,uvIndex,visibility,windBearing,windGust,windSpeed,dayOfWeek,hourOfDay
0,4,60.1688,24.80383,0.0,1507685150,2076,6.28,0.83,7.13,0.96,...,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
1,4,60.1688,24.80383,0.0,1507685150,2076,6.28,0.83,7.13,0.96,...,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
2,4,60.20982,25.07738,0.0,1507685180,4712,6.28,0.83,7.13,0.96,...,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
3,4,60.20982,25.07738,0.0,1507685180,4712,6.28,0.83,7.13,0.96,...,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
4,4,60.17083,24.81079,0.0,1507685227,2076,6.28,0.83,7.13,0.96,...,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4


In [10]:
data_vect.shape

(574573, 23)

In [11]:
culled_data.shape

(574573, 25)

In [12]:
training_data, test_data, train_target, test_target = train_test_split(np.array(data_vect), 
                                                                       np.array(targets), 
                                                                       train_size=0.8)
print('training_data size = ', len(training_data))
print('test_data size = ', len(test_data))



training_data size =  459658
test_data size =  114915


# Baseline (We have to beat this score)
This is the success rate if you always guess that the bus is late

In [13]:
dummy = DummyClassifier('most_frequent')
dummy.fit(training_data, train_target)
dummy.score(test_data, test_target)

0.52896488709045819

# Logistic Regression

In [14]:
model = LogisticRegression()
model.fit(training_data, train_target)
model.score(test_data, test_target)

0.52896488709045819

# Decision Trees

In [15]:
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(training_data, train_target)
tree_model.score(test_data, test_target)

0.93397728756037068

In [16]:
df = pd.DataFrame(tree_model.feature_importances_, index=data_vect.columns, columns=['importance'])
df.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
vehicle,0.271155
time,0.159677
longitude,0.134082
latitude,0.099188
ozone,0.031733
windSpeed,0.031116
windGust,0.030285
windBearing,0.029432
hourOfDay,0.028464
apparentTemperature,0.025696


# Random Forest

In [17]:
forest_model = RandomForestClassifier(n_estimators=100, n_jobs=4)
forest_model.fit(training_data, train_target)
forest_model.score(test_data, test_target)

0.94919723273724055

In [19]:
forest_model.decision_path

<bound method BaseForest.decision_path of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>

In [18]:
import geopandas as gpd

In [20]:
forest_model.feature_importances_

array([ 0.05336412,  0.15818282,  0.18173865,  0.00369525,  0.15017751,
        0.23348151,  0.0194818 ,  0.01391006,  0.02062305,  0.01182585,
        0.02163887,  0.00160266,  0.00115499,  0.00092871,  0.02037312,
        0.01859003,  0.00220856,  0.00733802,  0.01842683,  0.01944217,
        0.01954881,  0.00578591,  0.0164807 ])

In [21]:
df = pd.DataFrame(forest_model.feature_importances_, index=data_vect.columns, columns=['importance'])
df.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
vehicle,0.233482
longitude,0.181739
latitude,0.158183
time,0.150178
line,0.053364
ozone,0.021639
dewPoint,0.020623
pressure,0.020373
windSpeed,0.019549
apparentTemperature,0.019482
