In [1]:
'''
This code is inspired by https://github.com/GiuseppeMoscarelli/Tesi_Associative_Classifier
'''
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import datetime as dt
import dateutil
import seaborn as sns


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/bigdatalab_cpu_202101/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.2/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
# Input locations, relative to this notebook.
# NOTE: despite its name, `filtered_status_dir` is the path of a CSV *file* used as a
# cache for the filtered status readings computed in the `else` branch below.
filtered_status_dir = "../../Data/filtered_status.csv"
status_path = "../../Data/status.csv"
stations_path = "../../Data/station.csv"
trips_path = "../../Data/trip_graph.csv"

stations_df = pd.read_csv(stations_path)
trips_df = pd.read_csv(trips_path)

if os.path.exists(filtered_status_dir):
    # Reuse the cached result of the filtering performed in the `else` branch.
    status_df = pd.read_csv(filtered_status_dir, parse_dates=['time'])
else:
    # EXECUTED ONCE TO GET FILTERED DATA
    # Filter out readings whose bikes_available + docks_available differs from the
    # station's dock_count by more than 5 (in absolute value).
    status_df = pd.read_csv(status_path, parse_dates=['time'])

    status_df['current_total'] = status_df['bikes_available'] + status_df['docks_available']
    dropped_stations_df = stations_df.drop(["name", "lat", "long", "city", "installation_date"], axis=1)
    dropped_stations_df = dropped_stations_df.rename(columns={'id': 'station_id'})

    # Left join: keep exactly the status readings. An outer join would add one all-NaN
    # pseudo-reading for every station that has no status rows and, by introducing NaN,
    # turn the integer columns into floats.
    complete_status_df = pd.merge(status_df, dropped_stations_df, on='station_id', how='left')
    complete_status_df['oscillation'] = complete_status_df['current_total'] - complete_status_df['dock_count']
    # Readings with an unknown dock_count give a NaN oscillation; NaN > 5 is False, so they are kept.
    oscillation_mask = np.abs(complete_status_df['oscillation']) > 5

    filtered_complete_status_df = complete_status_df[~oscillation_mask]
    filtered_dropped_status = filtered_complete_status_df.drop(["current_total", "dock_count",
                                                                "oscillation"], axis=1)

    # Write to the same path that the existence check above reads from.
    filtered_dropped_status.to_csv(filtered_status_dir, index=False)
    status_df = filtered_dropped_status

In [7]:
# Display the trip graph table read from trip_graph.csv.
trips_df

Unnamed: 0,start_id,end_id,weight
0,2,2,191
1,2,3,448
2,2,4,2200
3,2,5,627
4,2,6,1274
...,...,...,...
1746,84,13,15
1747,84,14,32
1748,84,16,5
1749,84,80,427


In [3]:
# Display the station status readings (read from the filtered cache file when it exists,
# otherwise filtered from status.csv).
status_df

Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013-08-29 12:06:01
1,2,2,25,2013-08-29 12:07:01
2,2,2,25,2013-08-29 12:08:01
3,2,2,25,2013-08-29 12:09:01
4,2,2,25,2013-08-29 12:10:01
...,...,...,...,...
71977905,84,8,7,2015-08-31 23:55:02
71977906,84,8,7,2015-08-31 23:56:01
71977907,84,8,7,2015-08-31 23:57:02
71977908,84,8,7,2015-08-31 23:58:02


In [6]:
# Plot correlations between the number of bikes in neighbour stations for each station.
# Here the neighbours of a station are the start stations of the trip-graph rows whose
# end_id is that station. For every neighbour, bikes_available is averaged over
# fixed-width time bins and `window_width` lagged copies of that series are built;
# the correlation matrix of all those columns is saved and plotted per station.

# get all station IDs
station_ids = status_df['station_id'].unique()

interval = 5 # time interval in minutes
window_width = 5 # window size: number of lagged columns per neighbour (T0 = no lag)

# Apply the seaborn theme / figure size once, BEFORE any figure is created: figure.figsize
# is read when a figure is created, so setting it after plt.figure() left the first plot
# at the previous default size.
sns.set(rc = {'figure.figsize':(19,15)})

# The lagged frame of a station depends only on that station (and on interval/window_width),
# so it is built once and reused for every station that has it as a neighbour, instead of
# re-filtering the whole status table for each (station, neighbour) pair.
lagged_frames = {}

# for each station...
for ID in station_ids:
    merged_df = pd.DataFrame()
    neighbor_ids = trips_df[trips_df['end_id']==ID]['start_id'].unique()#NOTE: all stations have self loops
    #for each neighbor station...
    for neighbor in neighbor_ids:
        if neighbor not in lagged_frames:
            single_station_df = status_df[status_df['station_id']==neighbor][['time', 'bikes_available']]
            # "min" is the minute offset alias (equivalent to the deprecated "T")
            windowing_df = single_station_df.resample(f"{interval}min", on='time').mean()
            windowing_df = windowing_df.reset_index() #reset index
            lagged_df = pd.DataFrame()
            lagged_df['time'] = windowing_df['time']
            # columns are created from the largest lag down to T0 (no lag)
            for i in reversed(range(window_width)):
                lagged_df[f'{neighbor}_T{i}'] = windowing_df['bikes_available'].shift(periods=i)
            lagged_frames[neighbor] = lagged_df
        new_df = lagged_frames[neighbor]
        if len(merged_df) == 0:
            merged_df = new_df.copy(deep=True)
            continue
        merged_df = pd.merge(merged_df, new_df, how='outer', on='time')

    # save the dataframe with info about the single station neighborhood
    out_dir = f"../../Results/Correlations/Station_{ID}/Interval_{interval}"
    os.makedirs(out_dir, exist_ok=True)
    merged_df.to_csv(f"{out_dir}/status.csv", index=False)

    # Correlate only the bike-count columns; 'time' is dropped explicitly rather than
    # relying on corr() silently discarding non-numeric columns.
    station_correlation = merged_df.drop(columns='time', errors='ignore').corr()
    # save the dataframe with info about the single station correlations
    # (written without the index: the rows follow the same order as the columns)
    station_correlation.to_csv(f"{out_dir}/correlations.csv", index=False)

    fig, ax = plt.subplots()
    sns.heatmap(station_correlation, vmin=-1, vmax=1, cmap='coolwarm', ax=ax)
    fig.savefig(f"{out_dir}/correlations_plot.jpg")
    plt.close(fig) # release the figure: one is created per station
    print("Done ", ID)# just to check computation status

Done  2
Done  3
Done  4
Done  5
Done  6
Done  7
Done  8
Done  9
Done  10
Done  11
Done  12
Done  13
Done  14
Done  16
Done  21
Done  22
Done  23
Done  24
Done  25
Done  26
Done  27
Done  28
Done  29
Done  30
Done  31
Done  32
Done  33
Done  34
Done  35
Done  36
Done  37
Done  38
Done  41
Done  42
Done  45
Done  46
Done  47
Done  48
Done  49
Done  50
Done  51
Done  39
Done  54
Done  55
Done  56
Done  57
Done  58
Done  59
Done  60
Done  61
Done  62
Done  63
Done  64
Done  65
Done  66
Done  67
Done  68
Done  69
Done  70
Done  71
Done  72
Done  73
Done  74
Done  75
Done  76
Done  77
Done  80
Done  82
Done  83
Done  84


In [15]:
# Plot correlations between the number of bikes in neighbour stations for each station:
# here links are considered undirected, i.e. the neighbours of a station are the stations
# that appear with it in a trip-graph row, whichever of the two is start_id or end_id.

# get all station IDs
station_ids = [70, 76, 34, 72, 75, 69, 50, 60, 73, 82] #arbitrary set of stations

interval = 20 # time interval in minutes
window_width = 5 # window size: number of lagged columns per neighbour (T0 = no lag)

# Apply the seaborn theme / figure size once, BEFORE any figure is created: figure.figsize
# is read when a figure is created, so setting it after plt.figure() left the first plot
# at the previous default size.
sns.set(rc = {'figure.figsize':(19,15)})

# The lagged frame of a station depends only on that station (and on interval/window_width),
# so it is built once and reused for every station that has it as a neighbour.
lagged_frames = {}

# for each station...
for ID in station_ids:
    merged_df = pd.DataFrame()
    incoming = trips_df[trips_df['end_id']==ID]['start_id'].unique()#NOTE: all stations have self loops
    outgoing = trips_df[trips_df['start_id']==ID]['end_id'].unique()
    # sorted so that the column order of the heatmap does not depend on set iteration order
    neighbor_ids = sorted(set(incoming) | set(outgoing))
    #for each neighbor station...
    for neighbor in neighbor_ids:
        if neighbor not in lagged_frames:
            single_station_df = status_df[status_df['station_id']==neighbor][['time', 'bikes_available']]
            # "min" is the minute offset alias (equivalent to the deprecated "T")
            windowing_df = single_station_df.resample(f"{interval}min", on='time').mean()
            windowing_df = windowing_df.reset_index() #reset index
            lagged_df = pd.DataFrame()
            lagged_df['time'] = windowing_df['time']
            # columns are created from the largest lag down to T0 (no lag)
            for i in reversed(range(window_width)):
                lagged_df[f'{neighbor}_T{i}'] = windowing_df['bikes_available'].shift(periods=i)
            lagged_frames[neighbor] = lagged_df
        new_df = lagged_frames[neighbor]
        if len(merged_df) == 0:
            merged_df = new_df.copy(deep=True)
            continue
        merged_df = pd.merge(merged_df, new_df, how='outer', on='time')

    out_dir = f"../../Results/Correlations/Station_{ID}/Interval_{interval}"
    os.makedirs(out_dir, exist_ok=True)

    # Correlate only the bike-count columns; 'time' is dropped explicitly rather than
    # relying on corr() silently discarding non-numeric columns.
    # Only the plot is written for the undirected case (no CSV output).
    station_correlation = merged_df.drop(columns='time', errors='ignore').corr()

    fig, ax = plt.subplots()
    sns.heatmap(station_correlation, vmin=-1, vmax=1, cmap='coolwarm', ax=ax)
    fig.savefig(f"{out_dir}/correlations_plot_undirected.jpg")
    plt.close(fig) # release the figure: one is created per station
    print("Done ", ID)# just to check computation status

Done  70
Done  76
Done  34
Done  72
Done  75
Done  69
Done  50
Done  60
Done  73
Done  82
