# Taxi Travel in New York City during the Christmas Holidays: An Analysis of Destinations, Trends, and Factors

In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

## 数据集来源及获取
本文主要用到了如下数据集：
1. 纽约市2017年-2021年每年12月的黄色出租车和绿色出租车数据 - [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
2. 对应数据区域的shapefile - [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
3. 对应时间的天气数据（API获取） - [OpenWeather](https://openweathermap.org/api/one-call-3)

### 天气数据获取

In [None]:
# TODO: insert the code that fetches the weather data

### 出租车行程数据处理
这一节旨在将出租车行程数据集计为小时车流数据，集计后的数据包括：
1. 日期和小时
2. 出发区域ID
3. 到达区域ID
4. 行程数量
5. 行程人数
6. 旅行费用
7. 载客系数（行程人数/行程数量）

In [68]:
# TODO: insert the code that aggregates the trip data

## Find the hot points
* calculate the in-and-out volume for each zone by hour
* rank the zones within each hour and convert each rank into a score (score = number of zones / rank)
* sum the scores over all hours to find the hottest zones

Get the score of each zone per hour.   
Because the time complexity of the function below is too high, I used AWS to calculate the traffic in each zone.

In [None]:
# get the score for each zone by day and hours
def get_in_out_volumn_score(df, day_list, hour_list, year):
    """
    Calculate the in-and-out volumn for each zone by day and hour, and give each zone a score based on the volumn.

    The volumns are computed with two grouped sums instead of one boolean-mask scan of `df`
    per (zone, day, hour) combination.

    Input:
        df: a dataframe containing the counted data; the columns 'PULocationID', 'DOLocationID',
            'day', 'hour' and 'trip_count' are read
        day_list: an iterable of days to include
        hour_list: an iterable of hours to include
        year: the year being processed (only used in the progress message)
    Output:
        df_in_out_volumn: a dataframe with one row per (location_id, day, hour), ordered by
            location_id, then day, then hour, with the columns 'day', 'hour', 'location_id',
            'in_volumn', 'out_volumn', 'total_volumn', 'total_volumn_rank', 'total_volumn_score'
    """
    # zone ids 1..263
    location_id_list = range(1, 264)
    # drop repeated days/hours (keeping their order) so that every combination appears once
    days = list(dict.fromkeys(day_list))
    hours = list(dict.fromkeys(hour_list))
    keys = ['location_id', 'day', 'hour']
    # every (zone, day, hour) combination, zone varying slowest and hour fastest
    full_index = pd.MultiIndex.from_product([location_id_list, days, hours], names=keys)

    def _volumn_by_zone(zone_column):
        # total trip_count per (zone, day, hour), aligned to the full grid
        volumn = df.groupby([zone_column, 'day', 'hour'])['trip_count'].sum().rename_axis(keys)
        # combinations without any trip get 0, like the sum over an empty selection
        return volumn.reindex(full_index, fill_value=0)

    # trips dropped off in a zone count as "in", trips picked up in a zone count as "out"
    df_in_out_volumn = pd.DataFrame({
        'day': full_index.get_level_values('day').to_numpy(),
        'hour': full_index.get_level_values('hour').to_numpy(),
        'location_id': full_index.get_level_values('location_id').to_numpy(),
        'in_volumn': _volumn_by_zone('DOLocationID').to_numpy(),
        'out_volumn': _volumn_by_zone('PULocationID').to_numpy(),
    })
    # add the total volumn
    df_in_out_volumn['total_volumn'] = df_in_out_volumn['in_volumn'] + df_in_out_volumn['out_volumn']
    print(f'Finish calculating in-and-out volumn for all zones of {year}.')

    # rank the zones within each (day, hour); the busiest zone gets rank 1, ties share the average rank
    df_in_out_volumn['total_volumn_rank'] = df_in_out_volumn.groupby(['day', 'hour'])['total_volumn'].rank(ascending=False)
    # give the zone a score = 1/rank * number of zones
    df_in_out_volumn['total_volumn_score'] = (1 / df_in_out_volumn['total_volumn_rank']) * len(location_id_list)

    return df_in_out_volumn

def plot_in_out_volumn_score(df_in_out_volumn, day_list, hour_list, year):
    """
    Save one bar chart of the zone scores for every (day, hour) combination.
    Input:
        df_in_out_volumn: a dataframe containing the in-and-out volumn for each zone by day and hours, and the score
        day_list: a list of days
        hour_list: a list of hours
        year: the year, used in the name of the saved figures
    Output:
        None
    """
    for day in day_list:
        for hour in hour_list:
            plt.figure(figsize=(20,10))
            # rows of this day and hour only (one bar per zone)
            is_day = df_in_out_volumn['day']==day
            is_hour = df_in_out_volumn['hour']==hour
            scores_at_time = df_in_out_volumn.loc[is_day & is_hour, :]
            sns.barplot(data=scores_at_time, x='location_id', y='total_volumn_score')
            plt.title('day: {}, hour: {}'.format(day, hour))
            # hide the x tick labels
            plt.xticks([])
            plt.savefig(f'../data/figures/in_out_volumn_score{year}_{day}_{hour}.png')
            # close the figure before drawing the next one
            plt.close()

def get_hot_score(year_list, day_list, hour_list):
    """
    Score the zones of every year, save the scores as CSV and plot them.
    Input:
        year_list: the years to process; the counted data of December of each year is read
        day_list: a list of days
        hour_list: a list of hours
    Output:
        None
    """
    for year in year_list:
        # load the counted trips of this year
        counted_trips = pd.read_csv(f'../data/processed_nyc_data/countdata_{year}-12.csv')
        # volumn, rank and score of every zone by day and hour
        zone_scores = get_in_out_volumn_score(counted_trips, day_list, hour_list, year)
        # keep the scores on disk
        zone_scores.to_csv(f'../data/processed_nyc_data/in_out_volumn_score_{year}-12.csv', index=False)
        print(f'Finish getting score for {year}-12.')
        # one figure per day and hour
        plot_in_out_volumn_score(zone_scores, day_list, hour_list, year)


In [None]:
# Run the hourly scoring for every December from 2017 to 2021.
# For each year this reads the counted CSV, writes the score CSV and saves
# one bar chart per (day, hour) combination.
year_list = range(2017, 2022)  # 2017-2021
day_list = range(1, 32)  # 1-31
hour_list = range(0, 24)  # 0-23
get_hot_score(year_list, day_list, hour_list)

Find the hottest zones for each year.

In [62]:
def get_16_hotest_zone(year):
    """
    Return the 16 zones with the highest summed score of one year.
    Input:
        year: selects the score file '../data/processed_nyc_data/{year}.csv'
    Output:
        a dataframe with the columns 'location_id', 'year_score', 'score_rank', 'year_volumn',
        'volumn_rank', sorted by 'year_score' in descending order
    """
    hourly_scores = pd.read_csv(f'../data/processed_nyc_data/{year}.csv')
    # per-zone sums, broadcast back to every row of the zone
    by_zone = hourly_scores.groupby('location_id')
    year_score = by_zone['total_volumn_score'].transform('sum')
    year_volumn = by_zone['total_volumn'].transform('sum')
    hourly_scores['year_score'] = year_score
    hourly_scores['year_volumn'] = year_volumn
    # one row per zone (the first row of each zone is kept, together with its index label)
    zone_totals = hourly_scores[['location_id', 'year_score','year_volumn']].drop_duplicates()
    # rank 1 = largest value
    zone_totals = zone_totals.assign(
        volumn_rank=zone_totals['year_volumn'].rank(ascending=False),
        score_rank=zone_totals['year_score'].rank(ascending=False),
    )
    top_zones = zone_totals.sort_values(by='year_score', ascending=False).head(16)

    return top_zones[['location_id', 'year_score', 'score_rank', 'year_volumn', 'volumn_rank']]

def plot_hot_zone(df_hot, df_geo, year):
    """
    Draw the hot zones of one year on the zone map, coloured by 'year_score', and save the figure.
    Input:
        df_hot: a dataframe of hot zones; the columns 'location_id' and 'year_score' are used
        df_geo: the geodataframe of the zones, with a 'LocationID' column
        year: the year, used in the name of the saved figure
    Output:
        None
    """
    # merge the year score to the geodata (right join: one row per hot zone)
    df_hot_geo = df_geo.merge(df_hot, left_on='LocationID', right_on='location_id', how='right')
    # Create the axes ourselves and hand them to the plot call: without `ax`, the geodataframe
    # plot opens its own figure, so a separately created 20x10 figure stays empty and open.
    fig, ax = plt.subplots(figsize=(20,10))
    df_hot_geo.plot(column='year_score', cmap='YlOrRd', edgecolor='black', legend=True, ax=ax)
    fig.savefig(f'../data/figures/hot_zone_{year}.png')
    # close exactly the figure that was created here
    plt.close(fig)
    return None

def get_hot_zone(year_list):
    """
    Find and plot the hot zones of every year.
    Input:
        year_list: the years to process
    Output:
        a dictionary mapping each year to its hot-zone dataframe
    """
    # zone geometries, read once and shared by all years
    zones_geo = gpd.read_file('../data/taxi_zones/taxi_zones.shp')
    hot_by_year = {}
    for year in year_list:
        # the 16 hottest zones of this year
        year_hot = get_16_hotest_zone(year)
        # save the map of these zones
        plot_hot_zone(year_hot, zones_geo, year)
        hot_by_year[year] = year_hot

    return hot_by_year

In [63]:
# Load the taxi zone shapefile into df_geo.
df_geo = gpd.read_file('../data/taxi_zones/taxi_zones.shp')
# Hot zones of 2017-2021, keyed by year; this call also saves one map per year.
hot_zones = get_hot_zone(range(2017, 2022))

  if geom is not None and geom.type.startswith(prefix) and not geom.is_empty:
  if geom is not None and geom.type.startswith(prefix) and not geom.is_empty:
  if geom is not None and geom.type.startswith(prefix) and not geom.is_empty:
  if geom is not None and geom.type.startswith(prefix) and not geom.is_empty:
  if geom is not None and geom.type.startswith(prefix) and not geom.is_empty:


<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

In [None]:
# get year total_volumn for each zone
def get_year_total_volumn(df, year):
    """
    Build a table with the year and the total volumn (in + out) of every row of `df`.
    Input:
        df: a dataframe with the columns 'location_id', 'in_volumn' and 'out_volumn'
        year: the value written to the 'year' column
    Output:
        df_year: a new dataframe with the columns 'year', 'location_id', 'total_volumn';
            `df` itself is left unchanged
    """
    # assign() works on a copy, so the caller's dataframe does not gain extra columns
    df_year = df.assign(year=year, total_volumn=df['in_volumn'] + df['out_volumn'])
    return df_year[['year', 'location_id', 'total_volumn']]

In [67]:
# Display the hot-zone table of 2021.
hot_zones[2021]

Unnamed: 0,location_id,year_score,score_rank,year_volumn,volumn_rank
175584,237,82742.029103,1.0,293589.0,1.0
174840,236,76948.211696,2.0,288884.0,2.0
34968,48,61368.859444,3.0,194108.0,6.0
97464,132,50773.052077,4.0,162677.0,13.0
119040,161,47698.763971,5.0,241028.0,3.0
170376,230,41913.865828,6.0,196188.0,5.0
58032,79,35204.664157,7.0,137864.0,18.0
104904,142,34668.904505,8.0,204139.0,4.0
137640,186,29991.909295,9.0,171369.0,12.0
119784,162,24118.809119,10.0,190958.0,7.0


已经做了的：
1. 给所有区域以年为单位进行流量和热度排名,其中：
   1. 流量排名：以31天24小时流量和进行排名，这反映了其在这一个月总的流量
   2. 热度排名：以31天24小时热度分数和进行排名，这反映了其在一个月中成为热点区域的频率
2. 找到每年热点区域（用于后续时间序列预测）
3. 流量排名和热度排名的差距显示了一些地区车流量的波动性显著强于另一些地区

要做的：
1. 比较各热点区域历年排名变化（可视化）
2. 比较一天内热点变化（工作日，一般周末，圣诞假期）（可视化）
3. auto_arima

In [65]:
# Reproject the zones to EPSG:4326 and show the centroid of zone 138.
# NOTE(review): the centroid is taken after reprojecting to EPSG:4326, a geographic CRS;
# geopandas warns that centroids computed in a geographic CRS may be inaccurate.
# Consider computing the centroid before reprojecting and converting the point afterwards — TODO confirm.
df_geo = df_geo.to_crs(epsg=4326)
df_geo[df_geo['LocationID']==138].centroid


  


137    POINT (-73.87363 40.77438)
dtype: geometry

## 时间定性分析

概览：
1. 比较历年热点区域变化情况
2. 比较一天内热点变化趋势（工作日、周末、圣诞节）
3. 比较同一区域的流量排名和热度排名，分类如下：
   1. 流量排名/热度排名 > 1 ：时间波动性较强
   2. 流量排名/热度排名 < 1 : 时间波动性较弱
4. 分别可视化波动性较强和较弱的区域，观察其分布


## ARIMA

概览：
1. 确定用于ARIMA分析的10个区域
2. 生成这些区域间交通流量的时间序列（5\*10\*10）（包括天气信息）
3. 对500条时间序列进行ARIMA分解，获取其总体趋势和季节性趋势
4. 总结结果