In [1]:
import pandas as pd 
import numpy as np
import geopandas as gpd
import fiona
import os
import zipfile
import shapely
import matplotlib
from ipywidgets import interact
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.tsa 
from datetime import datetime
import missingno as msno
import re

In [2]:
gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
class Databases:
    def __init__(self, csv, anva, smm, metro):
        self.csv = csv
        self.anva = anva 
        self.smm = smm
        self.metro = metro

    def extract_csv(self):
        file = pd.read_csv(f'data/bike_loans/{self.csv}', sep=';')
        file['Fecha_Prestamo'] = pd.to_datetime(file['Fecha_Prestamo'])
        file['Fecha_Devolucion'] = pd.to_datetime(file['Fecha_Devolucion'])
        return file
        
    
    def extract_anva(self):
        bus_routes_AMVA = gpd.GeoDataFrame()
        for root, dirs, files in os.walk(f'data/bus_routes/medellin_bus_routes/{self.anva}'):
            for file in files:
                path = os.path.join(root, file)
                df = gpd.read_file(path, driver='KML')
                bus_routes_AMVA = gpd.GeoDataFrame(pd.concat([bus_routes_AMVA, df], ignore_index=True))
        return bus_routes_AMVA
    
    def extract_smm(self):
        smm_bus_routes = gpd.GeoDataFrame()
        for root, dirs, files in os.walk(f'data/bus_routes/medellin_bus_routes/{self.smm}'):
            for file in files:
                path = os.path.join(root, file)
                df = gpd.read_file(path, driver='KML')
                smm_bus_routes = gpd.GeoDataFrame(pd.concat([smm_bus_routes, df], ignore_index=True))
        return smm_bus_routes
    
    def extract_metro(self):
        metro_routes = gpd.GeoDataFrame()
        for root, dirs, files in os.walk(f'data/medellin_metro/{self.metro}'):
            for file in files:
                path = os.path.join(root, file)
                print(path)
                if path.endswith('.kmz'):
                    kmz = zipfile.ZipFile(path, 'r')
                    kml = kmz.open('doc.kml', 'r')
                    df = gpd.read_file(kml, driver='KML')
                    metro_routes = gpd.GeoDataFrame(pd.concat([metro_routes, df], ignore_index=True))
        return metro_routes

In [3]:
pd.set_option('display.max_rows', 100)

In [4]:
databases = Databases('bike_loans.csv','Kml Ruta AMVA','kml Ruta SMM','medellin_metro_stations')

In [5]:
bikes = databases.extract_csv()

In [6]:
bikes.dtypes #checking feature types

Id_Historico_Prestamo               int64
Id_Usuario                          int64
Id_Tag_Bicicleta                   object
Fecha_Prestamo             datetime64[ns]
Fecha_Devolucion           datetime64[ns]
Id_Aparcamiento_Origen              int64
Posicion_Origen                     int64
Id_Aparcamiento_Destino           float64
Posicion_Destino                  float64
Importe                           float64
DescripcionImporte                float64
operario                           object
Num_Bici_Hist                     float64
dtype: object

In [7]:
bikes_2019 = bikes[(bikes['Fecha_Prestamo'].dt.year == 2019) & (bikes['Fecha_Devolucion'].dt.year == 2019)]
bikes_2019

Unnamed: 0,Id_Historico_Prestamo,Id_Usuario,Id_Tag_Bicicleta,Fecha_Prestamo,Fecha_Devolucion,Id_Aparcamiento_Origen,Posicion_Origen,Id_Aparcamiento_Destino,Posicion_Destino,Importe,DescripcionImporte,operario,Num_Bici_Hist
440864,10948569,20272,00FB73222B5E,2019-02-20 14:08:04,2019-02-20 14:26:08,9,1,43.0,1.0,0.0,,anderson.rojas,1366.0
586995,12896369,135209,002772222B5E,2019-08-22 09:19:51,2019-08-22 09:20:31,44,15,44.0,15.0,0.0,,,976.0
726893,12896264,26536,00094A72BF5B,2019-08-22 09:11:22,2019-08-22 09:20:53,9,1,70.0,1.0,0.0,,seguro.sorelly,2699.0
1028721,10948678,124512,00233C222B5E,2019-02-20 14:15:54,2019-02-20 14:26:28,89,2,56.0,1.0,0.0,,,638.0
2111125,10516625,40806,009F53222B5E,2019-01-02 12:57:01,2019-01-02 13:51:38,12,1,22.0,,0.0,,monitor.jannetv,558.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11767100,14454800,27944,0000F69A5A88,2019-12-31 13:58:55,2019-12-31 14:25:38,40,1,50.0,1.0,0.0,,claudia.marin,541.0
11767101,14454778,108800,00029A222B5E,2019-12-31 13:52:48,2019-12-31 14:40:52,28,1,50.0,1.0,0.0,,claudia.marin,715.0
11767102,14454793,98449,00001619CE0A,2019-12-31 13:55:47,2019-12-31 14:41:39,113,1,28.0,1.0,0.0,,edison.graciano,3497.0
11767103,14454774,129665,000036885A88,2019-12-31 13:51:28,2019-12-31 16:27:12,39,15,41.0,15.0,0.0,,,534.0


In [74]:
bikes_2019['loan_date_hour'] = bikes_2019['Fecha_Prestamo'].dt.date.astype('string') + " " +\
        bikes_2019['Fecha_Prestamo'].dt.hour.astype('string')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bikes_2019['loan_date_hour'] = bikes_2019['Fecha_Prestamo'].dt.date.astype('string') + " " +\


In [75]:
bikes_2019['deposit_date_hour'] = bikes_2019['Fecha_Devolucion'].dt.date.astype('string') + " " + \
        bikes_2019['Fecha_Devolucion'].dt.hour.astype('string')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bikes_2019['deposit_date_hour'] = bikes_2019['Fecha_Devolucion'].dt.date.astype('string') + " " + \


In [76]:
def convert_to_date(x):
    return datetime.strptime(x, '%Y-%m-%d %H')

In [77]:
bikes_2019['loan_date_hour'] = df1_clean_drop['year_month_day_hour'].apply(convert_to_date)
bikes_2019['deposit_date_hour'] = df2_clean_drop['year_month_day_hour'].apply(convert_to_date)

NameError: name 'df1_clean_drop' is not defined

hours = ['2019-01-01 00h00m00', ......, '2019-12-31 23h00m00']
bikes = ['000017B00743', ...]
dataframe = all combinations of bike hours
for bike
     for hour
        row = data(max Fecha_Devolucion < hour)
        location = row[Id_Aparcamiento_Destino]
        
end = date_hour/bike/location
groupby hour location - count() -> #bikes per station per hour
hours ->left_join ->nan is 0 bikes
```

In [83]:
hours = pd.DataFrame({'hour': pd.date_range(start='2019-01-01', end='2019-12-31', freq='H')})

In [84]:
hours

Unnamed: 0,hour
0,2019-01-01 00:00:00
1,2019-01-01 01:00:00
2,2019-01-01 02:00:00
3,2019-01-01 03:00:00
4,2019-01-01 04:00:00
...,...
8732,2019-12-30 20:00:00
8733,2019-12-30 21:00:00
8734,2019-12-30 22:00:00
8735,2019-12-30 23:00:00


In [97]:
bikes = pd.DataFrame({'bike' :bikes_2019['Id_Tag_Bicicleta'].unique()})

In [98]:
bikes

Unnamed: 0,bike
0,00FB73222B5E
1,002772222B5E
2,00094A72BF5B
3,00233C222B5E
4,009F53222B5E
...,...
2555,00000616CE0A
2556,0000F615CE0A
2557,0000C680CE0A
2558,0000561ACE0A


In [99]:
hours['key'] = 1
bikes['key'] = 1
location_bikes = hours.merge(bikes, on = 'key').drop('key', axis = 1)  

In [100]:
#location_bikes.index
location_bikes.head()

Unnamed: 0,hour,bike
0,2019-01-01,00FB73222B5E
1,2019-01-01,002772222B5E
2,2019-01-01,00094A72BF5B
3,2019-01-01,00233C222B5E
4,2019-01-01,009F53222B5E


In [101]:
print(location_bikes.shape)
print(location_bikes.hour[0])
location_bikes['location'] = ''
location_bikes.set_index('hour', inplace = True)
location_bikes.head()

(22366720, 2)
2019-01-01 00:00:00


Unnamed: 0_level_0,bike,location
hour,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,00FB73222B5E,
2019-01-01,002772222B5E,
2019-01-01,00094A72BF5B,
2019-01-01,00233C222B5E,
2019-01-01,009F53222B5E,


In [102]:
hour = hours.hour[0]
hour

Timestamp('2019-01-01 00:00:00')

In [104]:
bike = location_bikes.bike[0]
bike

'00FB73222B5E'

In [107]:
bike_logs = bikes_2019[bikes_2019.Id_Tag_Bicicleta == bike]
bike_logs

Unnamed: 0,Id_Historico_Prestamo,Id_Usuario,Id_Tag_Bicicleta,Fecha_Prestamo,Fecha_Devolucion,Id_Aparcamiento_Origen,Posicion_Origen,Id_Aparcamiento_Destino,Posicion_Destino,Importe,DescripcionImporte,operario,Num_Bici_Hist,deposit_station_date_time,loan_station_date_time,loan_date_hour,deposit_date_hour
440864,10948569,20272,00FB73222B5E,2019-02-20 14:08:04,2019-02-20 14:26:08,9,1,43.0,1.0,0.0,,anderson.rojas,1366.0,43.0 2019-02-20 14,9 2019-02-20 14,2019-02-20 14,2019-02-20 14
6280460,10948859,122320,00FB73222B5E,2019-02-20 14:27:58,2019-02-20 14:29:32,43,1,43.0,1.0,0.0,,anderson.rojas,1366.0,43.0 2019-02-20 14,43 2019-02-20 14,2019-02-20 14,2019-02-20 14
7949600,10567470,95167,00FB73222B5E,2019-01-11 15:59:42,2019-01-11 16:16:24,22,9,36.0,4.0,0.0,,,1366.0,36.0 2019-01-11 16,22 2019-01-11 15,2019-01-11 15,2019-01-11 16
7949843,10567695,39125,00FB73222B5E,2019-01-11 16:18:27,2019-01-11 16:36:53,36,1,1.0,1.0,0.0,,carolina.monsalve,1366.0,1.0 2019-01-11 16,36 2019-01-11 16,2019-01-11 16,2019-01-11 16
7962277,10514886,120750,00FB73222B5E,2019-01-02 07:47:45,2019-01-02 08:20:34,1,1,44.0,27.0,0.0,,,1366.0,44.0 2019-01-02 8,1 2019-01-02 7,2019-01-02 7,2019-01-02 8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11341323,13962649,136169,00FB73222B5E,2019-11-15 08:02:54,2019-11-15 08:08:43,49,18,100.0,2.0,0.0,,,1366.0,100.0 2019-11-15 8,49 2019-11-15 8,2019-11-15 8,2019-11-15 8
11341744,13963079,38300,00FB73222B5E,2019-11-15 08:28:57,2019-11-15 08:37:17,21,1,41.0,24.0,0.0,,,1366.0,41.0 2019-11-15 8,21 2019-11-15 8,2019-11-15 8,2019-11-15 8
11346947,13959269,40675,00FB73222B5E,2019-11-14 19:51:32,2019-11-14 20:15:06,96,16,48.0,3.0,0.0,,,1366.0,48.0 2019-11-14 20,96 2019-11-14 19,2019-11-14 19,2019-11-14 20
11348181,13960716,134506,00FB73222B5E,2019-11-15 06:31:01,2019-11-15 06:49:01,48,3,49.0,18.0,0.0,,,1366.0,49.0 2019-11-15 6,48 2019-11-15 6,2019-11-15 6,2019-11-15 6


In [112]:
last_log = bike_logs[bike_logs.Fecha_Devolucion <= index].sort_values(by = 'Fecha_Devolucion', ascending = False).head(1)

In [113]:
last_log

Unnamed: 0,Id_Historico_Prestamo,Id_Usuario,Id_Tag_Bicicleta,Fecha_Prestamo,Fecha_Devolucion,Id_Aparcamiento_Origen,Posicion_Origen,Id_Aparcamiento_Destino,Posicion_Destino,Importe,DescripcionImporte,operario,Num_Bici_Hist,deposit_station_date_time,loan_station_date_time,loan_date_hour,deposit_date_hour


In [111]:
location_bikes

Unnamed: 0_level_0,bike,location
hour,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,00FB73222B5E,
2019-01-01,002772222B5E,
2019-01-01,00094A72BF5B,
2019-01-01,00233C222B5E,
2019-01-01,009F53222B5E,
...,...,...
2019-12-31,00000616CE0A,
2019-12-31,0000F615CE0A,
2019-12-31,0000C680CE0A,
2019-12-31,0000561ACE0A,


In [117]:
bikes

Unnamed: 0,bike,key
0,00FB73222B5E,1
1,002772222B5E,1
2,00094A72BF5B,1
3,00233C222B5E,1
4,009F53222B5E,1
...,...,...
2555,00000616CE0A,1
2556,0000F615CE0A,1
2557,0000C680CE0A,1
2558,0000561ACE0A,1


In [None]:
# problem: indexes are called the same (might be a problem)
for bike in bikes.bike:
#     print(f'handling bike {bike}')
    bike_logs = bikes_2019[bikes_2019.Id_Tag_Bicicleta == bike]
    # to decrease computational time: filter the location bikes only by the apparent hrs 
    # which appear in the filtered bike logs
    for index, row  in location_bikes.iterrows():
#         print(f'checking hour: {index}')
        if row['bike'] == bike:
            print(f'detected the bike...')
            last_log = bike_logs[bike_logs.Fecha_Devolucion <= index].sort_values(by = 'Fecha_Devolucion', ascending = False).head(1)
            location = last_log['Id_Aparcamiento_Destino']
            location_bikes.loc[index,'location'] = location
