In [1]:
import logging
import warnings
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import folium
from shapely.geometry import box
from keplergl import KeplerGl
import folium
from branca import colormap as cm

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# INFO-level logging with a "dd/mm HH:MM:SS" timestamp; the level name is cut to 7 characters.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname).7s] %(message)s',
    datefmt='%d/%m %H:%M:%S',
)

In [2]:
# Grid definition: one row per region, indexed by region id, with west/east/south/north edges.
regions = pd.read_csv('../data/regions.csv', sep=';', index_col=0)

# Lookup from a cell's (west, south) corner to its region id.
region_mapping = {
    (cell['west'], cell['south']): region_id
    for region_id, cell in regions.iterrows()
}

# Sorted, de-duplicated cell edges along each axis; used as bin boundaries in map_region.
bins_lon = sorted(
    set(regions['west'].unique().tolist()).union(regions['east'].unique().tolist())
)
bins_lat = sorted(
    set(regions['south'].unique().tolist()).union(regions['north'].unique().tolist())
)

# Column names for files before 2015 (18 columns).
cols_18 = [
    'vendor', 'pickup_time', 'dropoff_time',
    'passenger_count', 'trip_distance',
    'start_lon', 'start_lat',
    'rate', 'store_and_forward',
    'end_lon', 'end_lat',
    'payment_type', 'fare_amount', 'surcharge', 'tax',
    'tip_amount', 'tolls_amount', 'total_amount',
]

# From 2015 on there is one more column, inserted just before the total.
cols_19 = cols_18[:-1] + ['improvement_surcharge', 'total_amount']

def read_data(file_path, nrows=None):
    """Read one trip CSV into a DataFrame with this notebook's column names.

    The column layout is chosen from the year found in the file name:
    ``cols_18`` for years before 2015, ``cols_19`` otherwise. The file's own
    header row is skipped and replaced by those names.

    Parameters
    ----------
    file_path : str
        Path to the CSV file; the file name must contain a 4-digit year (20xx).
    nrows : int or None, default None
        Passed to ``pandas.read_csv``; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The loaded trips, with ``pickup_time`` and ``dropoff_time`` parsed as dates.

    Raises
    ------
    ValueError
        If no 4-digit year starting with ``20`` is found in the file name.
    """
    import re  # local import: only needed for the year lookup below

    # Name used in log messages; spaces are backslash-escaped. The backslash is
    # written as '\\' because '\ ' is an invalid escape sequence.
    file = file_path.replace(' ', '\\ ').split('/')[-1]

    # Take the first "20xx" run of digits rather than slicing after the first "20",
    # so a missing year fails with a clear message.
    year_match = re.search(r'20\d{2}', file)
    if year_match is None:
        raise ValueError('Cannot find a 4-digit year (20xx) in file name {!r}'.format(file))
    y = int(year_match.group())
    columns = cols_18 if y < 2015 else cols_19

    logging.info('Reading {}'.format(file))
    # NOTE(review): dayfirst=True is kept as in the original; confirm it matches the
    # timestamp format of the source files.
    data = pd.read_csv(file_path, names=columns, skiprows=1, nrows=nrows,
                       parse_dates=['pickup_time', 'dropoff_time'], dayfirst=True)
    logging.info('{} shape = {}'.format(file, data.shape))
    return data

def map_region(data):
    """Attach the grid cell and region id of each trip's pickup point.

    Adds three columns to ``data`` in place and returns the same frame:
    ``start_lon_bin`` and ``start_lat_bin`` (the intervals from ``pd.cut`` over
    ``bins_lon`` / ``bins_lat``) and ``region_id`` (looked up in
    ``region_mapping`` by the left edges of those two intervals).

    Every pickup must fall inside the binned range; a point outside it has no
    interval and the lookup fails.
    """
    # presumably precision=25 keeps interval edges unrounded so they match the
    # region_mapping keys exactly — TODO confirm
    cut_options = dict(precision=25, include_lowest=True)
    data['start_lon_bin'] = pd.cut(data['start_lon'], bins=bins_lon, **cut_options)
    data['start_lat_bin'] = pd.cut(data['start_lat'], bins=bins_lat, **cut_options)

    data['region_id'] = [
        region_mapping[(lon_cell.left, lat_cell.left)]
        for lon_cell, lat_cell in zip(data['start_lon_bin'], data['start_lat_bin'])
    ]
    return data

def clear_data(data):
    """Return only the trips that pass the basic sanity filters.

    A trip is kept when its distance and passenger count are positive, it is
    dropped off strictly after it is picked up, and its pickup point lies inside
    the fixed longitude/latitude box (bounds inclusive).
    """
    has_distance = data['trip_distance'].gt(0)
    has_passengers = data['passenger_count'].gt(0)
    positive_duration = data['dropoff_time'].gt(data['pickup_time'])
    # Series.between is inclusive on both ends, matching >= / <=.
    lon_in_box = data['start_lon'].between(-74.25559, -73.70001)
    lat_in_box = data['start_lat'].between(40.49612, 40.91553)

    keep = has_distance & has_passengers & positive_duration & lon_in_box & lat_in_box
    return data[keep]

def aggregate_data(data, n_regions=2500):
    """Count trips per (region, date, hour) on a complete region x hour grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Trips with a datetime-typed ``pickup_time`` column and a ``region_id``
        column. The frame is not modified.
    n_regions : int, default 2500
        Region ids ``1..n_regions`` are included in the grid.

    Returns
    -------
    pandas.DataFrame
        Columns ``region_id``, ``date``, ``hour``, ``datetime``, ``count``: one
        row per region and per hour from the first to the last pickup date
        (both days complete), with 0 where no trips occurred, sorted by
        ``count`` in descending order. Empty input gives an empty frame with
        the same columns.
    """
    result_columns = ['region_id', 'date', 'hour', 'datetime', 'count']
    if data.empty:
        # No dates to build a time range from.
        return pd.DataFrame(columns=result_columns)

    # Build the grouping keys in a separate frame so the caller's frame does not
    # gain 'hour' / 'date' columns as a side effect.
    pickup = data['pickup_time']
    keys = pd.DataFrame({'region_id': data['region_id'],
                         'date': pickup.dt.date,
                         'hour': pickup.dt.hour})
    data_agg = keys.groupby(['region_id', 'date', 'hour']).size().rename('count')
    min_date = keys['date'].min()
    max_date = keys['date'].max()

    # Hourly stamps covering every hour of [min_date, max_date]; the trailing
    # midnight of the following day is cut off. A timedelta is used as the
    # frequency instead of the deprecated '1H' alias.
    one_day = datetime.timedelta(days=1)
    one_hour = datetime.timedelta(hours=1)
    data_range = pd.DataFrame(pd.date_range(min_date, max_date + one_day, freq=one_hour)[:-1],
                              columns=['datetime'])
    regions_df = pd.DataFrame(list(range(1, n_regions + 1)), columns=['region_id'])

    # Cross join through a constant helper key: every hour paired with every region.
    data_range['tmp'] = 1
    regions_df['tmp'] = 1
    timeseries = pd.merge(data_range, regions_df, how='outer').drop('tmp', axis=1)
    timeseries['date'] = timeseries['datetime'].dt.date
    timeseries['hour'] = timeseries['datetime'].dt.hour
    timeseries = timeseries.set_index(['region_id', 'date', 'hour'])

    # Left-join the observed counts onto the full grid; missing combinations become 0.
    data_full = timeseries.join(data_agg) \
                          .fillna(0) \
                          .sort_values(by='count', ascending=False) \
                          .reset_index()

    return data_full

def read_and_process(file_path, nrows=None):
    """Run the whole pipeline on one file: read, clean, map to regions, aggregate.

    ``nrows`` is forwarded to ``read_data``. Each stage is announced with an
    INFO log message, and the result of ``aggregate_data`` is returned.
    """
    logging.info('Processing {}'.format(file_path))
    frame = read_data(file_path, nrows=nrows)

    # (log message template, stage function) pairs, applied in order.
    stages = (
        ('Clearing {}', clear_data),
        ('Mapping regions {}', map_region),
        ('Aggregating {}', aggregate_data),
    )
    for message, stage in stages:
        logging.info(message.format(file_path))
        frame = stage(frame)

    logging.info('Finished {}'.format(file_path))
    return frame

In [3]:
# Run the full read -> clean -> map -> aggregate pipeline on the 2016-05 file.
# nrows=None reads the whole file; pass an integer to process only the first rows.
data = read_and_process('../data/yellow_tripdata_2016-05.csv', nrows=None)

14/07 17:57:38 [INFO] Processing ../data/yellow_tripdata_2016-05.csv
14/07 17:57:38 [INFO] Reading yellow_tripdata_2016-05.csv
14/07 17:58:30 [INFO] yellow_tripdata_2016-05.csv shape = (11836853, 19)
14/07 17:58:30 [INFO] Clearing ../data/yellow_tripdata_2016-05.csv
14/07 17:58:37 [INFO] Mapping regions ../data/yellow_tripdata_2016-05.csv
14/07 17:59:57 [INFO] Aggregating ../data/yellow_tripdata_2016-05.csv
14/07 18:00:23 [INFO] Finished ../data/yellow_tripdata_2016-05.csv


In [4]:
# Total number of trips per region over the loaded period.
orders_per_region = data.groupby('region_id')['count'].sum()

# Report how many regions had no trips at all.
empty_cells = (orders_per_region == 0).sum()
print('Ячеек без поездок: {}'.format(empty_cells))

Ячеек без поездок: 1283


In [5]:
# Empire State Building as (longitude, latitude); the map cells swap the order
# when passing it as a folium location.
esb_coords = (-73.985594, 40.748302)

In [6]:
# Non-interactive map centred on the Empire State Building, with a marker on it.
esb_lon, esb_lat = esb_coords

m = folium.Map(
    location=[esb_lat, esb_lon],
    zoom_start=10,
    zoom_control=False,
    scrollWheelZoom=False,
    dragging=False,
)

esb_marker = folium.CircleMarker(
    location=[esb_lat, esb_lon],
    popup=folium.Popup('Empire State Building', max_width=450),
    color='#00F',
    radius=5,
)
esb_marker.add_to(m)

m

In [7]:
# GeoJSON FeatureCollection with one rectangle per region, carrying its total order count.
geo_json_data = {
    "type": "FeatureCollection",
    "features": [
        {
            "properties": {"orders": orders_per_region[region_id]},
            "id": region_id,
            "type": "Feature",
            # box(minx, miny, maxx, maxy) built from columns 0, 2, 1, 3 of the region row
            "geometry": box(cell.values[0], cell.values[2],
                            cell.values[1], cell.values[3]).__geo_interface__,
        }
        for region_id, cell in regions.iterrows()
    ],
}

In [8]:
# Colour scale for total orders per region, with hand-picked breakpoints.
palette = ['#DFFF00', '#BFFF00', '#A7FC00', '#D1E231', '#A4C639', '#8DB600', '#808000', '#4B5320']
breakpoints = [0, 2, 4, 10, 22, 76, 1010, 200000]
colormap = cm.LinearColormap(
    colors=palette,
    index=breakpoints,
    vmin=orders_per_region.min(),
    vmax=orders_per_region.max(),
)

# Map locked at zoom level 10 with all interaction switched off.
esb_lon, esb_lat = esb_coords
m = folium.Map(
    location=[esb_lat, esb_lon],
    zoom_start=10,
    zoom_control=False,
    scrollWheelZoom=False,
    dragging=False,
    no_touch=True,
    max_zoom=10,
    min_zoom=10,
)

# Outline style shared by every cell; only the fill colour varies.
cell_outline = {
    'color': 'black',
    'weight': 1,
    'dashArray': '5, 5',
    'fillOpacity': 0.8,
}
region_layer = folium.GeoJson(
    geo_json_data,
    style_function=lambda feature: {
        'fillColor': colormap(feature['properties']['orders']),
        **cell_outline,
    },
)
region_layer.add_to(m)
m

In [9]:
# NOTE(review): presumably the Statue of Liberty as (longitude, latitude), by analogy
# with esb_coords. No cell visible here uses it; confirm before relying on it.
sol_coords = (-74.044502, 40.689247)

In [10]:
# Interactive map centred on the Empire State Building, with a marker on it.
esb_lon, esb_lat = esb_coords

m = folium.Map(location=[esb_lat, esb_lon], zoom_start=10)

esb_marker = folium.CircleMarker(
    location=[esb_lat, esb_lon],
    popup=folium.Popup('Empire State Building', max_width=450),
    color='#00F',
    radius=5,
)
esb_marker.add_to(m)

m

In [11]:
# Mean of the hourly trip count for each region.
mean_orders = data.groupby('region_id')['count'].mean()

In [12]:
# Quantile edges (up to 8 bins; duplicate edges are dropped) over regions with a non-zero mean.
has_trips = mean_orders > 0
bins = pd.qcut(mean_orders[has_trips], q=8, retbins=True, duplicates='drop')[1]

In [13]:
# Colour scale for the mean hourly orders, with breakpoints at the quantile edges in `bins`.
# NOTE(review): `bins` can hold up to 9 edges while 8 colours are listed; confirm that
# branca handles a colours/index length mismatch for this data.
palette = ['#DFFF00', '#BFFF00', '#A7FC00', '#D1E231', '#A4C639', '#8DB600', '#808000', '#4B5320']
colormap = cm.LinearColormap(
    colors=palette,
    index=bins,
    vmin=mean_orders.min(),
    vmax=mean_orders.max(),
)

esb_lon, esb_lat = esb_coords
m = folium.Map(location=[esb_lat, esb_lon], zoom_start=10)

# Outline style shared by every cell; only the fill colour varies.
cell_outline = {
    'color': 'black',
    'weight': 1,
    'dashArray': '5, 5',
    'fillOpacity': 0.8,
}
region_layer = folium.GeoJson(
    geo_json_data,
    style_function=lambda feature: {
        'fillColor': colormap(mean_orders[feature['id']]),
        **cell_outline,
    },
)
region_layer.add_to(m)
m

In [14]:
# Rebuild the GeoJSON with only the regions whose mean hourly order count is above 5,
# the same strict threshold (> 5) used by the count, the quantile bins and the export.
geo_json_data = {
    "type": "FeatureCollection",
    "features": []
}
for i, row in regions.iterrows():
    # Skip regions at or below the threshold. The previous `< 5` test kept a region
    # with a mean of exactly 5, which the `> 5` selections elsewhere exclude.
    if mean_orders[i] <= 5:
        continue
    # box(minx, miny, maxx, maxy) built from columns 0, 2, 1, 3 of the region row
    rectangle = box(row.values[0], row.values[2], row.values[1], row.values[3])
    feature = {
        "properties": {"orders": mean_orders[i]},
        "id": i,
        "type": "Feature",
        "geometry": rectangle.__geo_interface__
    }
    geo_json_data['features'].append(feature)

In [15]:
# Number of regions whose mean hourly order count is above 5.
busy_cells = (mean_orders > 5).sum()
print('Ячеек, где в среднем больше 5 поездок: {}'.format(busy_cells))

Ячеек, где в среднем больше 5 поездок: 102


In [16]:
# Quantile edges (up to 3 bins; duplicate edges are dropped) over regions with a mean above 5.
is_busy = mean_orders > 5
bins = pd.qcut(mean_orders[is_busy], q=3, retbins=True, duplicates='drop')[1]

In [17]:
# Colour scale for the mean hourly orders, with breakpoints at the quantile edges in `bins`.
palette = ['LightYellow', 'LightGreen', 'Green', 'DarkGreen']
colormap = cm.LinearColormap(
    colors=palette,
    index=bins,
    vmin=mean_orders.min(),
    vmax=mean_orders.max(),
)

esb_lon, esb_lat = esb_coords
m = folium.Map(location=[esb_lat, esb_lon], zoom_start=10)

# Outline style shared by every cell; only the fill colour varies.
cell_outline = {
    'color': 'black',
    'weight': 1,
    'dashArray': '5, 5',
    'fillOpacity': 0.8,
}
region_layer = folium.GeoJson(
    geo_json_data,
    style_function=lambda feature: {
        'fillColor': colormap(mean_orders[feature['id']]),
        **cell_outline,
    },
)
region_layer.add_to(m)
m

In [30]:
# Save the ids of regions whose mean hourly order count is above 5, comma-separated.
selected_regions = mean_orders[mean_orders > 5].index
# The `with` block closes the file, so the content is flushed to disk right away.
with open('../data/regions.txt', 'w') as regions_file:
    chars_written = regions_file.write(','.join(map(str, selected_regions)))
# Number of characters written, shown as the cell output.
chars_written

509