In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. Загрузка и обработка данных

Загрузим данные поездок за июнь 2016 года и посмотрим на них:

In [53]:
%%time
data_filename = 'yellow_tripdata_2016-06.csv'
data = pd.read_csv('Data/' + data_filename)

CPU times: user 21 s, sys: 1.55 s, total: 22.6 s
Wall time: 22.6 s


In [54]:
# Preview the first five rows to check the columns and their raw values.
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-06-09 21:06:36,2016-06-09 21:13:08,2,0.79,-73.98336,40.760937,1,N,-73.977463,40.753979,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3
1,2,2016-06-09 21:06:36,2016-06-09 21:35:11,1,5.22,-73.98172,40.736668,1,N,-73.981636,40.670242,1,22.0,0.5,0.5,4.0,0.0,0.3,27.3
2,2,2016-06-09 21:06:36,2016-06-09 21:13:10,1,1.26,-73.994316,40.751072,1,N,-74.004234,40.742168,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-06-09 21:06:36,2016-06-09 21:36:10,1,7.39,-73.982361,40.773891,1,N,-73.929466,40.85154,1,26.0,0.5,0.5,1.0,0.0,0.3,28.3
4,2,2016-06-09 21:06:36,2016-06-09 21:23:23,1,3.1,-73.987106,40.733173,1,N,-73.985909,40.766445,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76


In [55]:
# (rows, columns) of the raw frame -- the baseline for the filtering below.
data.shape

(11135470, 19)

In [56]:
# Remove, in place, every row that contains at least one missing value,
# then show the resulting (rows, columns) to see how many rows were lost.
data.dropna(inplace = True)
data.shape

(11135470, 19)

Произведем фильтрацию данных: удалим поездки с нулевым временем, нулевым расстоянием, нулевым количеством пассажиров и координатами начала за пределами Нью-Йорка

In [57]:
# Bounding box used to decide whether a pickup point lies inside New York.
# Longitude limits (degrees; negative values are west of Greenwich).
west, east = -74.25559, -73.70001
# Latitude limits (degrees). `nord` is the northern limit; the name is kept
# as is because the filtering cell refers to it.
south, nord = 40.49612, 40.91553

In [58]:
# Build a boolean mask of trips to discard. A trip is "bad" when any of the
# following holds:
#   * pickup and dropoff timestamps are identical (zero duration),
#   * it carried no passengers,
#   * it covered zero distance,
#   * its pickup point lies outside the west/east/south/nord bounding box.
pickup_lon = data['pickup_longitude']
pickup_lat = data['pickup_latitude']
zero_duration = data['tpep_pickup_datetime'] == data['tpep_dropoff_datetime']
no_passengers = data['passenger_count'] == 0
zero_distance = data['trip_distance'] == 0
outside_box = ((pickup_lon < west) | (pickup_lon > east) |
               (pickup_lat < south) | (pickup_lat > nord))
bad_mask = zero_duration | no_passengers | zero_distance | outside_box

# Series.nonzero() was deprecated in pandas 0.24 and removed in 1.0, so go
# through the underlying NumPy array instead. The result has the same form
# as before: a 1-tuple holding the POSITIONAL offsets of the bad rows.
bad_indices = bad_mask.values.nonzero()
# Single-argument print works the same under Python 2 and Python 3.
print('%s %d %d' % (type(bad_indices), len(bad_indices), len(bad_indices[0])))

<type 'tuple'> 1 199424


In [59]:
# bad_indices[0] holds positional offsets, whereas DataFrame.drop expects
# index labels. The two only coincide while the frame still has its default
# 0..n-1 index; translating positions to labels through data.index keeps the
# right rows being removed even if earlier filtering has left gaps in the index.
data.drop(data.index[bad_indices[0]], inplace = True)
# (rows, columns) after filtering.
data.shape

(10936046, 19)

In [60]:
# Sanity check: the filtered frame must not contain any missing values.
assert not data.isnull().values.any()

In [61]:
# Keep only the first 13 characters of each pickup timestamp, i.e.
# 'YYYY-MM-DD HH' (hour resolution). The vectorised .str accessor replaces a
# Python-level loop over every row; astype(str) mirrors the former str(x)
# call, and the result keeps the index of `data`.
s = data['tpep_pickup_datetime'].astype(str).str[:13]
# Shapes of the new series, the frame, and the series without missing values
# should all agree on the row count. Single-argument print works the same
# under Python 2 and Python 3.
print('%s %s %s' % (s.shape, data.shape, s.dropna().shape))

(10936046,) (10936046, 19) (10936046,)


In [62]:
%%time
# Overwrite the pickup timestamp column with the truncated strings held in `s`
# (assignment aligns on the index, which `s` shares with `data`).
data['tpep_pickup_datetime'] = s

CPU times: user 250 ms, sys: 94 µs, total: 250 ms
Wall time: 248 ms


In [63]:
# Sanity check: replacing the pickup column must not have introduced any
# missing values.
assert data.notnull().values.all()

In [64]:
# Preview the frame again to inspect the rewritten tpep_pickup_datetime column.
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-06-09 21,2016-06-09 21:13:08,2,0.79,-73.98336,40.760937,1,N,-73.977463,40.753979,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3
1,2,2016-06-09 21,2016-06-09 21:35:11,1,5.22,-73.98172,40.736668,1,N,-73.981636,40.670242,1,22.0,0.5,0.5,4.0,0.0,0.3,27.3
2,2,2016-06-09 21,2016-06-09 21:13:10,1,1.26,-73.994316,40.751072,1,N,-74.004234,40.742168,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-06-09 21,2016-06-09 21:36:10,1,7.39,-73.982361,40.773891,1,N,-73.929466,40.85154,1,26.0,0.5,0.5,1.0,0.0,0.3,28.3
4,2,2016-06-09 21,2016-06-09 21:23:23,1,3.1,-73.987106,40.733173,1,N,-73.985909,40.766445,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76


In [65]:
%%time
data.to_csv('Data/Prepared/' + data_filename, sep = ',', index = False)

CPU times: user 1min, sys: 1.43 s, total: 1min 2s
Wall time: 1min 5s


In [66]:
# Read the prepared file back from disk and preview it to verify the export.
wrote = pd.read_csv('Data/Prepared/{}'.format(data_filename))
wrote.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-06-09 21,2016-06-09 21:13:08,2,0.79,-73.98336,40.760937,1,N,-73.977463,40.753979,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3
1,2,2016-06-09 21,2016-06-09 21:35:11,1,5.22,-73.98172,40.736668,1,N,-73.981636,40.670242,1,22.0,0.5,0.5,4.0,0.0,0.3,27.3
2,2,2016-06-09 21,2016-06-09 21:13:10,1,1.26,-73.994316,40.751072,1,N,-74.004234,40.742168,1,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-06-09 21,2016-06-09 21:36:10,1,7.39,-73.982361,40.773891,1,N,-73.929466,40.85154,1,26.0,0.5,0.5,1.0,0.0,0.3,28.3
4,2,2016-06-09 21,2016-06-09 21:23:23,1,3.1,-73.987106,40.733173,1,N,-73.985909,40.766445,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76


In [67]:
# Round-trip check: the re-read file must contain no missing values ...
assert wrote.dropna().shape == wrote.shape
# ... and must have exactly the same dimensions as the in-memory frame.
assert wrote.shape == data.shape