In [2]:
'''
This code is inspired by https://github.com/GiuseppeMoscarelli/Tesi_Associative_Classifier
'''
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import datetime as dt
import dateutil
import seaborn as sns


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/bigdatalab_cpu_202101/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
filtered_status_dir = "../../Data/filtered_status.csv"
stations_path = "../../Data/station.csv"
trips_path = "../../Data/trip_graph.csv"

stations_df = pd.read_csv(stations_path)
trips_df = pd.read_csv(trips_path)
status_df = pd.read_csv(filtered_status_dir, parse_dates=['time'])

In [4]:
#divide the dataset in time slots

slots = dict()
slots["21-5"] = status_df.set_index('time').between_time("21:00", "5:00", include_end=False).reset_index()
slots["5-10"] = status_df.set_index('time').between_time("5:00", "10:00", include_end=False).reset_index()
slots["10-15"] = status_df.set_index('time').between_time("10:00", "15:00", include_end=False).reset_index()
slots["15-21"] = status_df.set_index('time').between_time("15:00", "22:00", include_end=False).reset_index()

In [4]:
print(len(slots["21-5"]))
print(len(slots["5-10"]))
print(len(slots["10-15"]))
print(len(slots["15-21"]))

23996440
14988752
14992265
21004423


In [8]:
slots["21-5"]

Unnamed: 0,time,station_id,bikes_available,docks_available
0,2013-08-29 21:01:02,2,1,26
1,2013-08-29 21:02:02,2,1,26
2,2013-08-29 21:03:01,2,1,26
3,2013-08-29 21:04:01,2,1,26
4,2013-08-29 21:05:01,2,1,26
...,...,...,...,...
23996435,2015-08-31 23:55:02,84,8,7
23996436,2015-08-31 23:56:01,84,8,7
23996437,2015-08-31 23:57:02,84,8,7
23996438,2015-08-31 23:58:02,84,8,7


In [9]:
# Plot correlations between the number of bikes in neighbour stations for each station and time slot

# get all station IDs
station_ids = status_df['station_id'].unique()

interval = 5 # time interval in minutes
window_width = 5 # window size

# for each station...
for ID in station_ids:
    neighbor_ids = trips_df[trips_df['end_id']==ID]['start_id'].unique()#NOTE: all stations have self loops
    # for each time slot...
    for slot, current_df in slots.items():
        merged_df = pd.DataFrame()
        #for each neighbor station...
        for neighbor in neighbor_ids:
            new_df = pd.DataFrame()
            single_station_df = current_df[current_df['station_id']==neighbor][['time', 'bikes_available']]
            windowing_df =  single_station_df.resample(f"{interval}T", on = 'time').mean()
            windowing_df  = windowing_df.reset_index() #reset index
            new_df['time'] = windowing_df['time']
            for i in reversed(range(window_width)):
                new_df[f'{neighbor}_T{i}'] =  windowing_df['bikes_available'].shift(periods=i)
            if len(merged_df) == 0:
                merged_df = new_df.copy(deep=True)
                continue
            merged_df = pd.merge(merged_df, new_df, how='outer', on='time')
        
        # save the dataframe with info about the single station neighborhood
        if not os.path.exists(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}"):
            os.makedirs(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}")
        merged_df.to_csv(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}/status.csv", index=False)

        station_correlation = merged_df.corr()
        # save the dataframe with info about the single station correlations
        station_correlation.to_csv(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}/correlations.csv",
                                index=False)

        plt.figure()
        sns.set(rc = {'figure.figsize':(19,15)})
        sns.heatmap(station_correlation, vmin=-1, vmax=1, cmap='coolwarm')
        plt.savefig(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}/correlations_plot.jpg")
        plt.close()
    print("Done ", ID)# just to check computation status

Done  2
Done  3
Done  4
Done  5
Done  6
Done  7
Done  8
Done  9
Done  10
Done  11
Done  12
Done  13
Done  14
Done  16
Done  21
Done  22
Done  23
Done  24
Done  25
Done  26
Done  27
Done  28
Done  29
Done  30
Done  31
Done  32
Done  33
Done  34
Done  35
Done  36
Done  37
Done  38
Done  41
Done  42
Done  45
Done  46
Done  47
Done  48
Done  49
Done  50
Done  51
Done  39
Done  54
Done  55
Done  56
Done  57
Done  58
Done  59
Done  60
Done  61
Done  62
Done  63
Done  64
Done  65
Done  66
Done  67
Done  68
Done  69
Done  70
Done  71
Done  72
Done  73
Done  74
Done  75
Done  76
Done  77
Done  80
Done  82
Done  83
Done  84


In [7]:
# Plot correlations between the number of bikes in neighbour stations for each station and time slot:
# here links are considered undirected

# get all station IDs
station_ids = [70, 34, 72, 69, 73] #arbitrary set of stations

interval = 20 # time interval in minutes
window_width = 5 # window size

# for each station...
for ID in station_ids:
    neighbor_ids = set(trips_df[trips_df['end_id']==ID]['start_id'].unique())#NOTE: all stations have self loops
    for v in trips_df[trips_df['start_id']==ID]['end_id'].unique():
        neighbor_ids.add(v)
    # for each time slot...
    for slot, current_df in slots.items():
        merged_df = pd.DataFrame()
        #for each neighbor station...
        for neighbor in neighbor_ids:
            new_df = pd.DataFrame()
            single_station_df = current_df[current_df['station_id']==neighbor][['time', 'bikes_available']]
            windowing_df =  single_station_df.resample(f"{interval}T", on = 'time').mean()
            windowing_df  = windowing_df.reset_index() #reset index
            new_df['time'] = windowing_df['time']
            for i in reversed(range(window_width)):
                new_df[f'{neighbor}_T{i}'] =  windowing_df['bikes_available'].shift(periods=i)
            if len(merged_df) == 0:
                merged_df = new_df.copy(deep=True)
                continue
            merged_df = pd.merge(merged_df, new_df, how='outer', on='time')
        
        if not os.path.exists(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}"):
            os.makedirs(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}")

        station_correlation = merged_df.corr()
        
        plt.figure()
        sns.set(rc = {'figure.figsize':(19,15)})
        sns.heatmap(station_correlation, vmin=-1, vmax=1, cmap='coolwarm')
        plt.savefig(f"../../Results/Correlations/Station_{ID}/Slot_{slot}_Interval_{interval}/correlations_plot_undirected.jpg")
        plt.close()
    print("Done ", ID)# just to check computation status

Done  70
Done  34
Done  72
Done  69
Done  73
