In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import mean_squared_error
from math import sin, cos, sqrt, atan2, radians 

import dateutil

from pathlib import Path

In [2]:
# Input CSV locations, expressed relative to the notebook's working directory.
stations_path = Path('..', 'station.csv')
status_path = Path('..', 'filtered_status.csv')

In [3]:
# Load both input tables; the status file's 'time' column is parsed into
# datetimes at read time so it can be compared against dates later on.
stations_df = pd.read_csv(stations_path)
status_df = pd.read_csv(status_path, parse_dates=['time'])

In [4]:
# Preview the loaded status table (pandas shows a truncated view of large frames).
status_df

Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013-08-29 12:06:01
1,2,2,25,2013-08-29 12:07:01
2,2,2,25,2013-08-29 12:08:01
3,2,2,25,2013-08-29 12:09:01
4,2,2,25,2013-08-29 12:10:01
...,...,...,...,...
71977905,84,8,7,2015-08-31 23:55:02
71977906,84,8,7,2015-08-31 23:56:01
71977907,84,8,7,2015-08-31 23:57:02
71977908,84,8,7,2015-08-31 23:58:02


In [5]:
# Preview the station metadata table.
stations_df

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.893200,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013
...,...,...,...,...,...,...,...
65,77,Market at Sansome,37.789625,-122.400811,27,San Francisco,8/25/2013
66,80,Santa Clara County Civic Center,37.352601,-121.905733,15,San Jose,12/31/2013
67,82,Broadway St at Battery St,37.798541,-122.400862,15,San Francisco,1/22/2014
68,83,Mezes Park,37.491269,-122.236234,15,Redwood City,2/20/2014


### Salvo lo status soltanto delle stazioni appartenenti alla città di San Francisco

In [5]:
# The list of San Francisco station ids below was obtained in earlier analyses.
SFancisco_stations = [
    41, 42, 45, 46, 47, 48, 49, 50, 51, 39, 54, 55, 56, 57, 58, 59, 60,
    61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 82,
]

# Keep only the status rows whose station_id appears in that list.
SanFrancisco_status = status_df.loc[status_df['station_id'].isin(SFancisco_stations)]

In [None]:
# Persist the San Francisco subset. DataFrame.to_csv does not create missing
# parent directories, so make sure the output folder exists first (otherwise
# this cell fails on a fresh checkout).
Path("./Datasets").mkdir(parents=True, exist_ok=True)
SanFrancisco_status.to_csv("./Datasets/SanFrancisco_status.csv", index=False)

### Splitto il file SanFrancisco_status in training e test set

In [14]:
# The split date below is the first entry of the test_df dataset obtained in
# the classification work, so the train/test proportions match the split used
# to train the classical models.
split_date = "2015-03-09 09:30:00"

# Rows strictly before the split date form the training set; rows at or after
# it form the test set.
train_df = SanFrancisco_status[SanFrancisco_status['time'] < split_date]
test_df = SanFrancisco_status[SanFrancisco_status['time'] >= split_date]

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
Path("./Datasets").mkdir(parents=True, exist_ok=True)
train_df.to_csv("./Datasets/SanFrancisco_status_train.csv", index=False)
test_df.to_csv("./Datasets/SanFrancisco_status_test.csv", index=False)

### Calcolo la distanza tra le diverse stazioni della città di San Francisco

In [16]:
def getDistance(lat_a, long_a, lat_b, long_b):
    """Return the great-circle (haversine) distance between two points, in km.

    Parameters
    ----------
    lat_a, long_a : float
        Latitude and longitude of the first point, in decimal degrees.
    lat_b, long_b : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along a sphere of radius 6373 km.
    """
    # Approximate radius of the Earth in km. Kept at 6373.0 so that distances
    # stay identical to the ones computed previously with this function.
    R = 6373.0

    lat1 = radians(lat_a)
    lat2 = radians(lat_b)
    lon1 = radians(long_a)
    lon2 = radians(long_b)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Clamping leaves every in-range value untouched.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [22]:
# Restrict the station table to the San Francisco ids and keep only the
# columns needed for the distance computation.
coordinates_stations_df = stations_df.loc[
    stations_df["id"].isin(SFancisco_stations), ["id", "lat", "long"]
]
records = coordinates_stations_df.to_records(index=False)
list_coo = list(records)

# Distance for every unordered pair of stations, keyed by "<id_a> <id_b>"
# where station a comes before station b in list_coo.
voc_distances = {}
for pos_a, rec_a in enumerate(list_coo):
    for rec_b in list_coo[pos_a + 1:]:
        pair_key = ' '.join((str(rec_a[0]), str(rec_b[0])))
        voc_distances[pair_key] = getDistance(
            float(rec_a[1]), float(rec_a[2]),
            float(rec_b[1]), float(rec_b[2]),
        )


In [None]:
# Inspect the pairwise mapping ("<id_a> <id_b>" -> value returned by getDistance).
voc_distances

In [None]:
# For every San Francisco station, bucket the other stations by distance.
# distance_dictionary[station_id] is a list of three lists of station ids
# (as strings, exactly as they appear in the voc_distances keys):
#   [0] distance <= 1, [1] 1 < distance <= 2, [2] 2 < distance <= 3
# Distances are in the unit returned by getDistance; farther stations are ignored.
distance_dictionary = {}

for station_id in SFancisco_stations:
    station_key = str(station_id)

    # Fresh buckets for each station: sharing them across iterations would
    # leak the neighbours of previously processed stations into this one.
    delta0_stations = []
    delta1_stations = []
    delta2_stations = []

    for key, distance in voc_distances.items():
        first, second = key.split(" ")
        # Match the id exactly against either side of the pair instead of
        # using a substring test on the whole key.
        if first == station_key:
            other = second
        elif second == station_key:
            other = first
        else:
            continue

        if distance <= 1:
            delta0_stations.append(other)
        elif distance <= 2:
            delta1_stations.append(other)
        elif distance <= 3:
            delta2_stations.append(other)

    # Always create the entry, even when the station has no pair in
    # voc_distances, so that lookups never raise KeyError.
    distance_dictionary[station_id] = [
        delta0_stations,
        delta1_stations,
        delta2_stations,
    ]
# distance_dictionary