In [1]:
import matplotlib.pyplot as plt
from matplotlib import cm
import pandas as pd
import os

In [5]:
# Load the CSV (path is relative to the notebook's working directory)
# and preview the first rows.
csv_path = 'intersection_aggregated.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,location,x_coordinate,y_coordinate,street_1,street_2,dayofmonth,dayofweek,minchunk,nrides
0,POINT(-73.8704944 40.7736351),-73.870494,40.773635,Departures,Central Terminal Drive,5,6,85,35
1,POINT(-74.0010659 40.7570801),-74.001066,40.75708,West 36th Street,11th Avenue,11,0,87,27
2,POINT(-73.8704944 40.7736351),-73.870494,40.773635,Departures,Central Terminal Drive,26,6,90,25
3,POINT(-73.8704944 40.7736351),-73.870494,40.773635,Departures,Central Terminal Drive,12,6,90,24
4,POINT(-73.8704944 40.7736351),-73.870494,40.773635,Departures,Central Terminal Drive,17,6,90,23


In [6]:
# Print a summary of the frame: index range, per-column dtypes and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3643048 entries, 0 to 3643047
Data columns (total 9 columns):
location        object
x_coordinate    float64
y_coordinate    float64
street_1        object
street_2        object
dayofmonth      int64
dayofweek       int64
minchunk        int64
nrides          int64
dtypes: float64(2), int64(4), object(3)
memory usage: 250.1+ MB


In [10]:
# TRAINING

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# split dataset into (train, test)
def split(dataset, train_fraction, seed=None):
    """Randomly partition ``dataset`` row-wise into a train and a test subset.

    Every row is assigned to the train subset independently with probability
    ``train_fraction``; all remaining rows go to the test subset. The subset
    sizes are therefore only approximately proportional to ``train_fraction``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Any object supporting ``len()`` and boolean-mask indexing works.
    train_fraction : float
        Probability, in ``[0, 1]``, that a given row lands in the train subset.
    seed : int, optional
        When given, the assignment mask is drawn from a private
        ``numpy.random.RandomState(seed)`` so the split is reproducible and
        the global RNG state is left untouched. When ``None`` (the default)
        the mask is drawn from NumPy's global RNG, exactly as before, so a
        prior ``np.random.seed(...)`` call still governs the result.

    Returns
    -------
    tuple
        ``(train, test)`` -- two disjoint subsets that together contain every
        row of ``dataset``.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not within ``[0, 1]`` (this includes NaN).
    """
    # The chained comparison is False for NaN, so NaN is rejected as well.
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            "train_fraction must be within [0, 1], got {!r}".format(train_fraction)
        )

    # Both the ``np.random`` module and ``RandomState`` expose ``rand``.
    rng = np.random if seed is None else np.random.RandomState(seed)
    msk = rng.rand(len(dataset)) < train_fraction
    return dataset[msk], dataset[~msk]

# Seed NumPy's global RNG immediately before the random split so that the
# train/test partition is the same every time this cell is run.
np.random.seed(42)
train, test = split(dataset, .7)

# Columns used as model inputs.
features = ['x_coordinate', 'y_coordinate', 'dayofmonth', 'dayofweek', 'minchunk']


In [19]:
# models
# random_state is fixed so that the fitted forest, and therefore the feature
# importances printed below, are repeatable across runs on the same training data.
randf = RandomForestClassifier(n_estimators=40, random_state=42)
randf.fit(train[features], train['nrides'])

# Importances are printed in the same order as the `features` list.
print(randf.feature_importances_)

[0.15770212 0.16241711 0.30221429 0.07910128 0.29856519]


In [16]:
# predictions of the fitted forest for the held-out rows
test_inputs = test[features]
randf_pred = randf.predict(test_inputs)

# observed 'nrides' values for those same rows
expect = test['nrides']

In [17]:
# Cross-tabulate observed against predicted values
# (rows labelled 'Actual', columns labelled 'Predicted').
confusion = pd.crosstab(
    index=expect,
    columns=randf_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
confusion

Predicted,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,20,23
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,906897,26384,4299,1095,368,137,50,32,17,10,10,4,2,0,4,2,2,0,0,0
2,105884,9435,2261,693,288,130,45,36,15,6,8,4,3,1,0,0,0,0,0,0
3,18613,3447,1156,449,182,109,50,40,13,9,1,0,0,2,1,1,0,1,0,0
4,4326,1343,557,232,151,70,49,23,6,9,6,2,2,2,1,1,0,0,0,0
5,1255,504,298,158,90,67,28,17,15,6,5,2,2,0,0,1,0,0,0,0
6,411,232,127,90,55,34,29,14,11,8,2,0,2,1,0,1,0,0,0,0
7,146,101,71,48,42,22,24,10,7,4,1,0,0,1,0,0,0,0,0,0
8,85,66,49,25,23,18,10,9,7,6,4,3,0,1,2,2,0,1,0,0
9,45,21,20,20,18,12,10,9,4,1,1,3,1,1,1,0,0,0,0,0
10,20,12,6,15,12,5,9,9,9,5,0,3,0,1,1,0,0,0,0,0


In [18]:
from sklearn.metrics import mean_squared_error, accuracy_score

# Materialize the observed values as a NumPy array before scoring.
expected = np.array(expect)

# Compute both scores first, then report them.
randf_mse = mean_squared_error(expected, randf_pred)
randf_acc = accuracy_score(expected, randf_pred)

print("RANDF mean_squared_error: {}".format(randf_mse))
print("RANDF accuracy: {}".format(randf_acc))
print()

RANDF mean_squared_error: 0.3723321965090656
RANDF accuracy: 0.8392606680137882

