In [1]:
import numpy as np
import scipy.misc
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from tpot import TPOTClassifier

In [2]:
culled_data = pd.read_csv('culled_data.csv.gz', dtype={'line':'category', 'vehicle':'category', 'precipType':'category'})

In [3]:
culled_data['line'] = culled_data['line'].cat.codes
culled_data['vehicle'] = culled_data['vehicle'].cat.codes
culled_data['precipType'] = culled_data['precipType'].cat.codes

In [4]:
culled_data['latitude'].fillna(culled_data['latitude'].mean(), inplace=True)
culled_data['longitude'].fillna(culled_data['longitude'].mean(), inplace=True)
culled_data['visibility'].fillna(culled_data['visibility'].mean(), inplace=True)
culled_data['delay'].fillna(culled_data['delay'].mean(), inplace=True)

# Split into training and test data

In [26]:
targets = culled_data['late']

# These give away the answer
data_vect = culled_data.drop(['late'], axis=1)
data_vect = data_vect.drop(['delay'], axis=1)

# Meaningless data that interferes with our models
data_vect = data_vect.drop(['latitude'], axis=1)
data_vect = data_vect.drop(['longitude'], axis=1)
data_vect = data_vect.drop(['time'], axis=1)

data_vect.head()

Unnamed: 0,line,speed,vehicle,apparentTemperature,cloudCover,dewPoint,humidity,ozone,precipIntensity,precipProbability,precipType,pressure,temperature,uvIndex,visibility,windBearing,windGust,windSpeed,dayOfWeek,hourOfDay
0,1,0.0,1127,6.28,0.83,7.13,0.96,315.98,0.0051,0.27,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
1,1,0.0,1127,6.28,0.83,7.13,0.96,315.98,0.0051,0.27,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
2,1,0.0,2514,6.28,0.83,7.13,0.96,315.98,0.0051,0.27,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
3,1,0.0,2514,6.28,0.83,7.13,0.96,315.98,0.0051,0.27,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4
4,1,0.0,1127,6.28,0.83,7.13,0.96,315.98,0.0051,0.27,1,993.75,7.72,0.0,8.64,45.0,7.77,8.29,2,4


In [27]:
training_data, test_data, train_target, test_target = train_test_split(np.array(data_vect), np.array(targets), train_size=0.1)
print('training_data size = ', len(training_data))
print('test_data size = ', len(test_data))

training_data size =  34072
test_data size =  306651




# Baseline (We have to beat this score)
This is the success rate if you always guess that the bus is late

In [28]:
dummy = DummyClassifier('most_frequent')
dummy.fit(training_data, train_target)
dummy.score(test_data, test_target)

0.75792350261372043

# Logistic Regression

In [29]:
model = LogisticRegression()
model.fit(training_data, train_target)
model.score(test_data, test_target)

0.76658155362284808

In [89]:
# Fit with standardized parameters
model.fit(training_data / np.std(training_data, 0), train_target)

coefficients = list(model.coef_[0])

magnitudes = []
for h in coefficients:
    magnitudes.append(h*h)
    
df = pd.DataFrame(coefficients, index=data_vect.columns, columns=['coefficient'])
df['magnitude'] = magnitudes

df.sort_values(by='magnitude', ascending=False)

Unnamed: 0,coefficient,magnitude
speed,-0.316824,0.1003773
hourOfDay,0.187639,0.03520841
temperature,-0.175835,0.03091809
windGust,0.174056,0.03029556
dayOfWeek,0.163291,0.02666411
humidity,0.132286,0.01749951
precipProbability,0.114924,0.01320754
uvIndex,0.094615,0.008952089
dewPoint,-0.085327,0.007280763
ozone,-0.082151,0.006748787


# Decision Trees

In [8]:
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(training_data, train_target)
tree_model.score(test_data, test_target)

0.85621835791327316

In [86]:
df = pd.DataFrame(tree_model.feature_importances_, index=data_vect.columns, columns=['importance'])
df.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
vehicle,0.449046
speed,0.0704
ozone,0.050903
hourOfDay,0.050763
line,0.042297
windBearing,0.041408
windSpeed,0.040601
pressure,0.037224
apparentTemperature,0.035833
windGust,0.033929


# Random Forest

In [9]:
forest_model = RandomForestClassifier()
forest_model.fit(training_data, train_target)
forest_model.score(test_data, test_target)

0.85613031036759846

# TPOT
pip install tpot

In [16]:
tpot = TPOTClassifier(generations=3, population_size=5, verbosity=2, n_jobs=4, max_eval_time_mins=1)
tpot.fit(training_data, train_target)
print(tpot.score(test_data, test_target))



Optimization Progress:  60%|██████    | 12/20 [02:00<06:21, 47.67s/pipeline]

Generation 1 - Current best internal CV score: 0.7681088329430948


Optimization Progress:  85%|████████▌ | 17/20 [02:50<01:32, 30.73s/pipeline]

Generation 2 - Current best internal CV score: 0.7727166997470358


                                                                            

Generation 3 - Current best internal CV score: 0.7727166997470358

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.35, min_samples_leaf=12, min_samples_split=16, n_estimators=100)
0.771897042566
