In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. Загрузка и обработка данных

Загрузим данные поездок за май 2016 года и посмотрим на них:

In [5]:
%%time
data_filename = 'yellow_tripdata_2016-05.csv'
data = pd.read_csv('Data/' + data_filename)

CPU times: user 22.3 s, sys: 1.19 s, total: 23.5 s
Wall time: 23.5 s


In [6]:
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-05-01 00:00:00,2016-05-01 00:17:31,1,3.6,-73.985901,40.76804,1,N,-73.983986,40.730099,1,15.0,0.5,0.5,1.5,0.0,0.3,17.8
1,2,2016-05-01 00:00:00,2016-05-01 00:07:31,1,1.68,-73.991577,40.744751,1,N,-73.9757,40.765469,1,7.5,0.5,0.5,0.88,0.0,0.3,9.68
2,2,2016-05-01 00:00:00,2016-05-01 00:07:01,6,1.09,-73.993073,40.741573,1,N,-73.980995,40.744633,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-05-01 00:00:00,2016-05-01 00:19:47,1,4.21,-73.991943,40.684601,1,N,-74.002258,40.733002,1,17.0,0.5,0.5,3.66,0.0,0.3,21.96
4,2,2016-05-01 00:00:00,2016-05-01 00:06:39,1,0.56,-74.00528,40.740192,1,N,-73.997498,40.737564,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76


In [7]:
data.shape

(11836853, 19)

In [8]:
data.dropna(inplace = True)
data.shape

(11836853, 19)

Произведем фильтрацию данных: удалим поездки с нулевым временем, нулевым расстоянием, нулевым количеством пассажиров и координатами начала за пределами Нью-Йорка

In [9]:
west = -74.25559
east = -73.70001
south = 40.49612
nord = 40.91553

In [10]:
bad_indices = ((data['tpep_pickup_datetime'] == data['tpep_dropoff_datetime']) | (data['passenger_count'] == 0) |\
             (data['trip_distance'] == 0) | (data['pickup_longitude'] < west) | (data['pickup_longitude'] > east) |\
               (data['pickup_latitude'] < south) | (data['pickup_latitude'] > nord)).nonzero()
print type(bad_indices), len(bad_indices), len(bad_indices[0])

<type 'tuple'> 1 210332


In [11]:
data.drop(bad_indices[0], inplace = True)
data.shape

(11626521, 19)

In [12]:
assert (data.dropna().shape == data.shape)

In [13]:
data['tpep_pickup_datetime'] = pd.Series([(str(x))[:13] for x in data['tpep_pickup_datetime'].values], index = data.index)

In [14]:
assert (data.dropna().shape == data.shape)

In [15]:
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-05-01 00,2016-05-01 00:17:31,1,3.6,-73.985901,40.76804,1,N,-73.983986,40.730099,1,15.0,0.5,0.5,1.5,0.0,0.3,17.8
1,2,2016-05-01 00,2016-05-01 00:07:31,1,1.68,-73.991577,40.744751,1,N,-73.9757,40.765469,1,7.5,0.5,0.5,0.88,0.0,0.3,9.68
2,2,2016-05-01 00,2016-05-01 00:07:01,6,1.09,-73.993073,40.741573,1,N,-73.980995,40.744633,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-05-01 00,2016-05-01 00:19:47,1,4.21,-73.991943,40.684601,1,N,-74.002258,40.733002,1,17.0,0.5,0.5,3.66,0.0,0.3,21.96
4,2,2016-05-01 00,2016-05-01 00:06:39,1,0.56,-74.00528,40.740192,1,N,-73.997498,40.737564,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76


In [16]:
%%time
data.to_csv('Data/Prepared/' + data_filename, sep = ',', index = False)

CPU times: user 1min 4s, sys: 1.42 s, total: 1min 5s
Wall time: 1min 9s


In [17]:
wrote = pd.read_csv('Data/Prepared/' + data_filename)
wrote.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-05-01 00,2016-05-01 00:17:31,1,3.6,-73.985901,40.76804,1,N,-73.983986,40.730099,1,15.0,0.5,0.5,1.5,0.0,0.3,17.8
1,2,2016-05-01 00,2016-05-01 00:07:31,1,1.68,-73.991577,40.744751,1,N,-73.9757,40.765469,1,7.5,0.5,0.5,0.88,0.0,0.3,9.68
2,2,2016-05-01 00,2016-05-01 00:07:01,6,1.09,-73.993073,40.741573,1,N,-73.980995,40.744633,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-05-01 00,2016-05-01 00:19:47,1,4.21,-73.991943,40.684601,1,N,-74.002258,40.733002,1,17.0,0.5,0.5,3.66,0.0,0.3,21.96
4,2,2016-05-01 00,2016-05-01 00:06:39,1,0.56,-74.00528,40.740192,1,N,-73.997498,40.737564,1,6.0,0.5,0.5,1.46,0.0,0.3,8.76


In [18]:
assert(wrote.dropna().shape == wrote.shape and wrote.shape == data.shape)

# Агрегация данных

In [25]:
def area(lng, ltt):
    x = ((lng - west) / ((east - west) / 50.)).astype('int')
    y = ((ltt - south) / ((nord - south) / 50.)).astype('int')
    return x * 50 + y + 1

In [26]:
data['area'] = area(data['pickup_longitude'], data['pickup_latitude'])

In [None]:
from scipy import stats

In [86]:
statistic, x_edge, y_edge, binnumber = stats.binned_statistic_2d(data['pickup_longitude'], data['pickup_latitude'], None,\
statistic = 'count', bins = [np.linspace(west, east, 51), np.linspace(south, nord, 51)], expand_binnumbers = True)

In [87]:
print statistic.shape, x_edge.shape, y_edge.shape, binnumber.shape

(50, 50) (51,) (51,) (2, 11626521)


In [89]:
print statistic
print binnumber

[[ 0.  0.  0. ...,  0.  5.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
[[25 24 24 ..., 23 25 28]
 [33 30 30 ..., 27 32 11]]


In [97]:
bn = 50 * binnumber[0] + binnumber[1]
binnumber = binnumber.T
print bn, bn.min(), bn.max()
print binnumber

[1283 1230 1230 ..., 1177 1282 1411] 65 2537
[[25 33]
 [24 30]
 [24 30]
 ..., 
 [23 27]
 [25 32]
 [28 11]]


In [65]:
print data['area'].min(), data['area'].max()

15 2487


In [61]:
assert(statistic.sum() == data.shape[0])

In [33]:
ESB_coord = (-73.985126, 40.748527)
print area(np.array([ESB_coord[0]]), np.array([ESB_coord[1]]))

[1231]
