In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import matplotlib.path as mplPath
#import rtree
#import fiona.crs
#import geopandas as gpd
#import pyproj
#import shapely.geometry as geom
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
%matplotlib inline

In [2]:
january2013=pd.read_csv("datasets/yellow_tripdata_2013-01.csv")

In [3]:
january2013 = january2013.loc[(january2013['pickup_longitude'] > -74.06) & (january2013['pickup_longitude'] < -73.77) & (january2013['pickup_latitude'] > 40.61) &  (january2013['pickup_latitude'] < 40.91)]
#january2013 = january2013.loc[(january2013['passenger_count'] > 0) & (january2013['passenger_count'] < 7)]
january2013 = january2013.reset_index()
january2013 = january2013.drop('index', 1)
january2013 = january2013.drop('vendor_id',1)
january2013 = january2013.drop('rate_code',1)
january2013 = january2013.drop('store_and_fwd_flag',1)
january2013 = january2013.drop('fare_amount',1)
january2013 = january2013.drop('surcharge',1)
january2013 = january2013.drop('mta_tax',1)
january2013 = january2013.drop('tolls_amount',1)
january2013 = january2013.drop('dropoff_latitude',1)
january2013 = january2013.drop('dropoff_longitude',1)
january2013 = january2013.drop('passenger_count',1)
january2013 = january2013.drop('payment_type',1)

In [4]:
def add_data(df):
    df_timestamp = pd.to_datetime(pd.Series(df['pickup_datetime']))
    df['trip_distance']*0.621371 # convert to miles
    df['weekday'] = df_timestamp.dt.weekday_name
    #df['month'] = df_timestamp.dt.month
    df['hour'] = df_timestamp.dt.hour
    #df['day'] = df_timestamp.dt.day
    #df['minutes'] = (df_timestamp.dt.hour)*60 + df_timestamp.dt.minute
    time_spent = pd.to_datetime(df['dropoff_datetime']) - pd.to_datetime(df['pickup_datetime'])
    df['time_spent'] = pd.to_datetime(time_spent).dt.minute
    df['pickup'] = df['pickup_latitude'].map(str) +','+df['pickup_longitude'].map(str)
    return df

In [5]:
january2013 = add_data(january2013)

In [6]:
january2013 = january2013.drop('pickup_datetime',1)
january2013 = january2013.drop('dropoff_datetime',1)
january2013 = january2013.drop('pickup_longitude',1)
january2013 = january2013.drop('pickup_latitude',1)

In [7]:
january2013.head()

Unnamed: 0,trip_distance,tip_amount,total_amount,weekday,hour,time_spent,pickup
0,1.0,0.0,7.0,Tuesday,15,6,"40.757977,-73.978165"
1,1.5,0.0,7.0,Sunday,0,4,"40.731781,-74.00668"
2,1.1,0.0,7.0,Saturday,18,4,"40.73777,-74.004711"
3,0.7,0.0,6.0,Monday,23,4,"40.759945,-73.97459999999998"
4,2.1,0.0,10.5,Monday,23,9,"40.748528,-73.976252"


In [8]:
# Look into the dataframe 
# First by weekday, then hour then block
# and we will know the average tip amount
# for each weekday > hour > block
# eg. On (day of week) at (hour) on (lat,long) avg tip is $number
def get_avg_tips(df):
    avg_tips = df.groupby(['weekday','hour','pickup']).mean()
    avg_tips = avg_tips.reset_index()
    return avg_tips

In [9]:
january2013 = get_avg_tips(january2013)

In [10]:
january2013.shape

(14256773, 7)

In [11]:
january2013.head()

Unnamed: 0,weekday,hour,pickup,trip_distance,tip_amount,total_amount,time_spent
0,Friday,0,"40.613763,-73.972592",0.0,7.7,38.5,0.0
1,Friday,0,"40.615297,-73.96423699999998",12.22,0.0,36.5,27.0
2,Friday,0,"40.61622,-73.97454999999998",3.7,1.625,16.375,13.0
3,Friday,0,"40.616755,-73.944947",1.05,0.0,6.0,3.0
4,Friday,0,"40.617259,-73.96366999999998",1.4,0.0,7.5,4.0


In [13]:
# Create indicator variables for the hours and days of the week
#and drop the categorical values
Xnames = ['weekday','hour','pickup','trip_distance',
          'tip_amount','total_amount','time_spent']
X = january2013[Xnames]
X = X.join(pd.get_dummies(X['hour']))
X = X.join(pd.get_dummies(X['weekday']))
X = X.drop(['hour','weekday'], axis=1)

# Split off the target (which will be the logarithm of the number of pickups (+1))
y = january2013['tip_amount']

In [14]:
itrain, itest = train_test_split(range(january2013.shape[0]), train_size=0.8)
mask=np.ones(january2013.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)
mask[:10]

array([ True, False,  True,  True,  True,  True,  True, False,  True,  True], dtype=bool)

In [15]:
Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
n_samples = Xtrain.shape[0]
n_features = Xtrain.shape[1]
Xtrain.head()

Unnamed: 0,pickup,trip_distance,tip_amount,total_amount,time_spent,0,1,2,3,4,...,21,22,23,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,"40.613763,-73.972592",0.0,7.7,38.5,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"40.61622,-73.97454999999998",3.7,1.625,16.375,13.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"40.616755,-73.944947",1.05,0.0,6.0,3.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"40.617259,-73.96366999999998",1.4,0.0,7.5,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"40.617353,-73.944545",0.88,0.0,6.0,3.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Create a k-Nearest Neighbors Regression estimator
knn_estimator = KNeighborsRegressor()

In [None]:
Xtrain_mean = Xtrain.mean()
Xtrain_std_dev = Xtrain.std()
Xtrain_normalized = (Xtrain - Xtrain_mean)/Xtrain_std_dev
Xtest_normalized = (Xtest - Xtrain_mean)/Xtrain_std_dev