In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
r'''
data: raw data from the original csv from Luis Berdun
    path: r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_Parking.csv'

converted_data: *data* processed to be the same shape as the new data obtained by the scraper.
    path: r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_parking_direccion.csv'

trusted_data: *converted_data* processed to keep only the trusted values.
    path: r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_parking_trusted_data.csv'
    
trusted_data_w_coordinates: *trusted_data* with latitud and longitud.
    path: r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_parking_trusted_data_coordinates.csv'
    
trusted_data_w_ocupacion: *trusted_data* with actual ocupacion included.
    path: r'C:\Users\ing_l\Tesis grado\Data\BD_parking_trusted_data_ocupacion.csv'
    
sumo_max_ocupacion: max ocupacion of the sumo data.
    path: r'C:\Users\ing_l\Tesis grado\Data\Extra_data\SUMO_max_ocupacion.csv'
'''

data_path = r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_Parking.csv'
converted_data_path = r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_parking_direccion.csv'
trusted_data_path = r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_parking_trusted_data.csv'
trusted_data_path_w_coordinates = r'C:\Users\ing_l\Tesis grado\Data\old_BD_Parking_data\BD_parking_trusted_data_coordinates.csv'
trusted_data_path_w_ocupacion = r'C:\Users\ing_l\Tesis grado\Data\BD_parking_trusted_data_ocupacion.csv'
sumo_max_ocupacion_path = r'C:\Users\ing_l\Tesis grado\Data\Extra_data\SUMO_max_ocupacion.csv'

In [3]:
#Read the original csv data (BD_Parking.csv) and parse the fecha columns into date values
data = pd.read_csv(data_path, delimiter=',', parse_dates=[['fecha', 'hora']])

In [4]:
#Keep the day in new dia column only for comparison
data['fecha'] = data['fecha_hora'].map(lambda x: x.date())

In [5]:
#Useless
data

Unnamed: 0,fecha_hora,id_cuadra,operacion,patente,tarjeta,fecha
0,2018-01-01 07:11:00,82,Entrada,BVQ775,10037637,2018-01-01
1,2018-01-01 11:26:00,40,Entrada,DCN224,10059452,2018-01-01
2,2018-01-01 13:09:00,61,Entrada,ATE568,10032179,2018-01-01
3,2018-01-01 16:53:00,34,Entrada,LPV400,10026609,2018-01-01
4,2018-01-01 17:25:00,66,Entrada,FMR105,10073847,2018-01-01
...,...,...,...,...,...,...
704746,2018-06-14 10:29:00,16,Salida,IXD347,10076465,2018-06-14
704747,2018-06-14 10:31:00,58,Salida,NJZ622,10168488,2018-06-14
704748,2018-06-14 10:33:00,19,Salida,MAD812,10003720,2018-06-14
704749,2018-06-14 10:30:00,21,Salida,FAI128,10176408,2018-06-14


In [6]:
def add_direccion():
    '''
    Process the same original data to have the direccion associated to each id_cuadra
    and save it to a file if it doesnt exists yet.
    
    Return:
        returns a dataframe containing the same values as *data* with a new column
        called direccion.
    '''

    #if we dont have the file, we must create it
    if not os.path.isfile(converted_data_path):

        converted_data = data.copy()
        #creating the new row called direccion where we will put the direccion value
        #of each id_cuadra
        converted_data['direccion'] = 0

        #open the file in which we have the dict id_cudra --> direccion
        idcuadra_to_direccion = pd.read_csv(r'C:\Users\ing_l\Tesis grado\Data\Extra_data\idcuadra_to_direccion.csv')

        #iterate over rows to save the data
        for i, dict_id_dir in idcuadra_to_direccion.iterrows():
            converted_data.loc[data['id_cuadra'] == dict_id_dir['id_cuadra'], 'direccion'] = dict_id_dir['direccion']

        #sorting the data by id_cuadra to be more readable
        converted_data = converted_data.sort_values(by=['patente', 'fecha_hora', 'operacion'])

        #delete the data where we dont have any idea the direccion
        converted_data = converted_data.loc[converted_data['direccion'] != 0]
        
        converted_data = converted_data.reset_index(drop=True)

        #Save the data with direccion so we dont need to process it every time
        print('Creating and saving the data in BD_parking_w_direccion.csv')
        converted_data.to_csv(converted_data_path, index=False)
        
        return converted_data
    #if we have already processed this data, we only read it from the csv
    else:
        print('Reading the data from BD_parking_w_direccion.csv')
        return pd.read_csv(converted_data_path, parse_dates=['fecha_hora'])

In [7]:
converted_data = add_direccion()

Reading the data from BD_parking_w_direccion.csv


In [8]:
#Useless
converted_data.loc[converted_data['id_cuadra'] == 2]

Unnamed: 0,fecha_hora,id_cuadra,operacion,patente,tarjeta,fecha,direccion
0,2018-06-02 11:12:00,2,Entrada,AA000BQ,10150463,2018-06-02,General Pinto 545
1,2018-06-02 11:23:00,2,Salida,AA000BQ,10150463,2018-06-02,General Pinto 545
250,2018-06-13 17:56:00,2,Entrada,AA002RL,10163699,2018-06-13,General Pinto 545
251,2018-06-13 18:49:00,2,Salida,AA002RL,10163699,2018-06-13,General Pinto 545
444,2018-04-24 17:49:00,2,Entrada,AA007XB,10144686,2018-04-24,General Pinto 545
...,...,...,...,...,...,...,...
704441,2018-05-28 10:47:00,2,Salida,XLW772,10167048,2018-05-28,General Pinto 545
704533,2018-05-11 10:35:00,2,Entrada,XNI737,10060315,2018-05-11,General Pinto 545
704534,2018-05-11 11:02:00,2,Salida,XNI737,10060315,2018-05-11,General Pinto 545
704729,2018-03-28 19:46:00,2,Entrada,YBP244,10077070,2018-03-28,General Pinto 545


In [9]:
#Useless
converted_data

Unnamed: 0,fecha_hora,id_cuadra,operacion,patente,tarjeta,fecha,direccion
0,2018-06-02 11:12:00,2,Entrada,AA000BQ,10150463,2018-06-02,General Pinto 545
1,2018-06-02 11:23:00,2,Salida,AA000BQ,10150463,2018-06-02,General Pinto 545
2,2018-03-02 13:04:00,80,Entrada,AA000UC,10166592,2018-03-02,General Paz 447
3,2018-03-02 15:24:00,80,Salida,AA000UC,10166592,2018-03-02,General Paz 447
4,2018-03-12 17:15:00,80,Entrada,AA000UC,10166592,2018-03-12,General Paz 447
...,...,...,...,...,...,...,...
704742,2018-02-22 16:34:00,67,Salida,YD256NN,10010345,2018-02-22,14 de Julio 347
704743,2018-01-03 19:23:00,14,Entrada,YUW066,10124581,2018-01-03,Rodriguez 464
704744,2018-01-03 19:23:00,14,Salida,YUW066,10124581,2018-01-03,Rodriguez 464
704745,2018-03-03 17:23:00,6,Entrada,YWJ374,10058727,2018-03-03,General Pinto 759


In [10]:
#Useless
def get_trusted_data_old():
    '''
    Process the converted_data to get only the trusted ones. Those are that have
    entrada/salida, same patente and same fecha. Save them to a file if they are
    not saved yet, otherwise read the csv that contain them.
    
    Return:
        returns a dataframe containing the trusted values of the converted_data.
    '''
    
    #if we dont have the file, we must create it
    if not os.path.isfile(trusted_data_path):

        
        #Initialize out_data (all the trusted rows) and last_row (last_row we visited)
        out_data = pd.DataFrame(columns=converted_data.columns.values)

        last_row = pd.DataFrame(columns=converted_data.columns.values)
        
        it = 0
        
        #Getting the trusted values...
        for i, row in converted_data.iterrows():
            
            #Log the iteration id
            print('Iteration: ', it)
            it += 1
            #Saving only the values who are Entrada-Salida of the same patente and fecha
            if not last_row.empty:  
                if last_row['patente'] == row['patente']:
                    if last_row['operacion'] == 'Entrada' and row['operacion'] == 'Salida':
                        if last_row['fecha'] == row['fecha']:
                            out_data = out_data.append(last_row, ignore_index=True)
                            out_data = out_data.append(row, ignore_index=True)
            last_row = row
            
        #Save the trusted data so we dont need to run it every time
        print('Creating and saving the data in BD_parking_trusted_data.csv')
        out_data.to_csv(trusted_data_path, index=False)
        return out_data
    
    else:
        #if we have already processed this data, we only read it from the csv
        print('Reading the data in BD_parking_trusted_data.csv')
        return pd.read_csv(trusted_data_path, parse_dates=['fecha_hora'])

In [11]:
#Useless
#Numpy arrays to compare values faster!!!!!!!!!!!
x= converted_data.copy()
print(x.values[1:,x.columns.get_loc('operacion')].shape)
print(x.values[:-1,x.columns.get_loc('operacion')].shape)
#print(type(x.values[1:,0] == x.values[:-1,0]))
fecha_values = (x.values[:-1, x.columns.get_loc('operacion')] != x.values[1:, x.columns.get_loc('operacion')])
fecha_values = (np.append(fecha_values,fecha_values[-1]))
x[fecha_values]

(704746,)
(704746,)


Unnamed: 0,fecha_hora,id_cuadra,operacion,patente,tarjeta,fecha,direccion
0,2018-06-02 11:12:00,2,Entrada,AA000BQ,10150463,2018-06-02,General Pinto 545
1,2018-06-02 11:23:00,2,Salida,AA000BQ,10150463,2018-06-02,General Pinto 545
2,2018-03-02 13:04:00,80,Entrada,AA000UC,10166592,2018-03-02,General Paz 447
3,2018-03-02 15:24:00,80,Salida,AA000UC,10166592,2018-03-02,General Paz 447
4,2018-03-12 17:15:00,80,Entrada,AA000UC,10166592,2018-03-12,General Paz 447
...,...,...,...,...,...,...,...
704742,2018-02-22 16:34:00,67,Salida,YD256NN,10010345,2018-02-22,14 de Julio 347
704743,2018-01-03 19:23:00,14,Entrada,YUW066,10124581,2018-01-03,Rodriguez 464
704744,2018-01-03 19:23:00,14,Salida,YUW066,10124581,2018-01-03,Rodriguez 464
704745,2018-03-03 17:23:00,6,Entrada,YWJ374,10058727,2018-03-03,General Pinto 759


In [12]:
#Useless
#Saving only the values who are Entrada-Salida of the same patente and fecha
x= converted_data.copy()

#Same fecha
column_idx = x.columns.get_loc('fecha')
first_comp = x.values[:-1 , column_idx]
second_comp = x.values[1: , column_idx]
first_result = first_comp == second_comp
first_result = np.append(first_result, first_result[-1])

#Same patente
column_idx = x.columns.get_loc('patente')
first_comp = x.values[:-1 , column_idx]
second_comp = x.values[1: , column_idx]
second_result = first_comp == second_comp
second_result = np.append(second_result, second_result[-1])

#Entrada-Salida
column_idx = x.columns.get_loc('operacion')
first_comp = x.values[:-1 , column_idx]
second_comp = x.values[1: , column_idx]
third_result = (first_comp == 'Entrada') * (second_comp == 'Salida')
third_result = np.append(third_result, third_result[-1])
'''to_include_salidas = np.insert(third_result, 0, True)
third_result = np.append(third_result, third_result[-1])
third_result = np.logical_or(third_result, to_include_salidas)
'''
final_result = first_result & second_result & third_result

to_include_salidas = np.delete(final_result, -1)
to_include_salidas = np.insert(to_include_salidas, 0, True)
final_result = np.logical_or(final_result, to_include_salidas)

print(to_include_salidas)
print(final_result)

x[final_result]

[ True  True False ...  True False  True]
[ True  True  True ...  True  True  True]


Unnamed: 0,fecha_hora,id_cuadra,operacion,patente,tarjeta,fecha,direccion
0,2018-06-02 11:12:00,2,Entrada,AA000BQ,10150463,2018-06-02,General Pinto 545
1,2018-06-02 11:23:00,2,Salida,AA000BQ,10150463,2018-06-02,General Pinto 545
2,2018-03-02 13:04:00,80,Entrada,AA000UC,10166592,2018-03-02,General Paz 447
3,2018-03-02 15:24:00,80,Salida,AA000UC,10166592,2018-03-02,General Paz 447
4,2018-03-12 17:15:00,80,Entrada,AA000UC,10166592,2018-03-12,General Paz 447
...,...,...,...,...,...,...,...
704742,2018-02-22 16:34:00,67,Salida,YD256NN,10010345,2018-02-22,14 de Julio 347
704743,2018-01-03 19:23:00,14,Entrada,YUW066,10124581,2018-01-03,Rodriguez 464
704744,2018-01-03 19:23:00,14,Salida,YUW066,10124581,2018-01-03,Rodriguez 464
704745,2018-03-03 17:23:00,6,Entrada,YWJ374,10058727,2018-03-03,General Pinto 759


In [13]:
#Useless
x= converted_data.copy()

#Entrada-Salida
column_idx = x.columns.get_loc('operacion')
first_comp = x.values[:-1 , column_idx]
second_comp = x.values[1: , column_idx]
third_result = (first_comp == 'Entrada') * (second_comp == 'Salida')
to_include_salidas = np.insert(third_result, 0, True)
third_result = np.append(third_result, third_result[-1])
third_result = np.logical_or(third_result, to_include_salidas)


third_result

array([ True,  True,  True, ...,  True,  True,  True])

In [14]:
def get_trusted_data(d, inplace=False):
    '''
    Process the d to get only the trusted ones. Those are that have
    entrada/salida, same patente and same fecha. Save them to a file if they are
    not saved yet, otherwise read the csv that contain them.
    
    Parameters:
        d: (DataFrame) dataframe model to filter the values
        inplace: (Boolean) if use a reference of d or make a copy instead.
        
    Return:
        returns a dataframe containing the trusted values of the d.
    '''
        
    #if we dont have the file, we must create it
    if not os.path.isfile(trusted_data_path):

        #To use a reference or a copy of the input
        if inplace:
            out = d
        else:
            out = d.copy()
            
        #Initialize out (all the trusted rows) and last_row (last_row we visited)
        
        it = 0
        
        #Getting the trusted values...
        #Saving only the values who are Entrada-Salida of the same patente and fecha
        
        #We must get only the values (in numpy array form) we need to compare.
        #We grab "even" and "odds" (not exactly) and we compare them to get the
        #trusted values...
        
        #Same fecha
        #This is because we want to know if they were parked and unparked 
        #in the same day, otherwise we can asume they forgotten to mark
        #the unparking, so we dont need that data.
        column_idx = out.columns.get_loc('fecha')
        first_comp = out.values[:-1 , column_idx]
        second_comp = out.values[1: , column_idx]
        first_result = first_comp == second_comp
        first_result = np.append(first_result, first_result[-1])

        #Same patente
        column_idx = out.columns.get_loc('patente')
        first_comp = out.values[:-1 , column_idx]
        second_comp = out.values[1: , column_idx]
        second_result = first_comp == second_comp
        second_result = np.append(second_result, second_result[-1])

        #Entrada-Salida
        column_idx = out.columns.get_loc('operacion')
        first_comp = out.values[:-1 , column_idx]
        second_comp = out.values[1: , column_idx]
        third_result = (first_comp == 'Entrada') * (second_comp == 'Salida')
        third_result = np.append(third_result, third_result[-1])

        final_result = first_result & second_result & third_result

        #To include salidas we must put true in the value next to each true idx
        to_include_salidas = np.delete(final_result, -1)
        to_include_salidas = np.insert(to_include_salidas, 0, True)
        
        final_result = np.logical_or(final_result, to_include_salidas)

        out = out[final_result]
        
        #Save the trusted data so we dont need to run it every time
        print('Creating and saving the data in BD_parking_trusted_data.csv')
        out.to_csv(trusted_data_path, index=False)
        
        if not inplace:
            return out
        
    else:
        #if we have already processed this data, we only read it from the csv
        print('Reading the data in BD_parking_trusted_data.csv')
        if inplace:
            out = pd.read_csv(trusted_data_path, parse_dates=['fecha_hora'])
        else:
            return pd.read_csv(trusted_data_path, parse_dates=['fecha_hora'])

In [15]:
trusted_data = get_trusted_data(converted_data)

Reading the data in BD_parking_trusted_data.csv


In [16]:
trusted_data = pd.read_csv(trusted_data_path, parse_dates=['fecha_hora'])

In [17]:
#To have only the necesary data and orderer by time, so we can know the ocupacion in any moment
trusted_data_reduced = trusted_data.loc[:, ['id_cuadra', 'operacion', 'patente', 'fecha_hora', 'fecha', 'direccion']]
trusted_data_reduced = trusted_data_reduced.sort_values(by=['fecha_hora','operacion'])

In [18]:
trusted_data_reduced.to_csv(trusted_data_path, index=False)

In [19]:
#Useless
trusted_data_reduced = pd.read_csv(trusted_data_path, parse_dates=['fecha_hora'])
trusted_data_reduced

Unnamed: 0,id_cuadra,operacion,patente,fecha_hora,fecha,direccion
0,82,Entrada,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647
1,82,Salida,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647
2,40,Entrada,DCN224,2018-01-01 11:26:00,2018-01-01,Mitre 348
3,40,Salida,DCN224,2018-01-01 11:28:00,2018-01-01,Mitre 348
4,61,Entrada,ATE568,2018-01-01 13:09:00,2018-01-01,Chacabuco 357
...,...,...,...,...,...,...
690665,15,Salida,AA637NP,2018-06-14 10:34:00,2018-06-14,General Pinto 631
690666,74,Salida,AA908DT,2018-06-14 10:34:00,2018-06-14,Maipu 653
690667,62,Salida,IUV114,2018-06-14 10:34:00,2018-06-14,9 de Julio 753
690668,9,Salida,KOZ001,2018-06-14 10:34:00,2018-06-14,San Martín 560


In [20]:
#Useless
trusted_data_reduced.loc[trusted_data_reduced.id_cuadra == 2]

Unnamed: 0,id_cuadra,operacion,patente,fecha_hora,fecha,direccion
25,2,Entrada,JYO177,2018-01-02 07:49:00,2018-01-02,General Pinto 545
29,2,Entrada,NBW636,2018-01-02 08:18:00,2018-01-02,General Pinto 545
32,2,Salida,NBW636,2018-01-02 08:24:00,2018-01-02,General Pinto 545
33,2,Entrada,ASU197,2018-01-02 08:27:00,2018-01-02,General Pinto 545
35,2,Entrada,DIN108,2018-01-02 08:31:00,2018-01-02,General Pinto 545
...,...,...,...,...,...,...
690233,2,Salida,JLF318,2018-06-14 09:17:00,2018-06-14,General Pinto 545
690256,2,Entrada,GQJ401,2018-06-14 09:28:00,2018-06-14,General Pinto 545
690316,2,Salida,GQJ401,2018-06-14 09:44:00,2018-06-14,General Pinto 545
690523,2,Entrada,NAF239,2018-06-14 10:17:00,2018-06-14,General Pinto 545


In [21]:
def add_coordinates(d, inplace=False):
    if not os.path.isfile(trusted_data_path_w_coordinates):
        
        #To use a reference or a copy of the input
        if inplace:
            out = d
        else:
            out = d.copy()
            
        idcuadra_to_coordinates = pd.read_csv(r'C:\Users\ing_l\Tesis grado\Data\Extra_data\idcuadra_to_coordenadas.csv')
        out = out.merge(idcuadra_to_coordinates, on='id_cuadra')
        
        out.to_csv(trusted_data_path_w_coordinates, index=False) 
        
        if not inplace:
            return out
    else:
        print('Reading the data in BD_parking_trusted_data_w_coordinates.csv')
        if not inplace:
            return pd.read_csv(trusted_data_path_w_coordinates, parse_dates=['fecha_hora'])
        else:
            out = pd.read_csv(trusted_data_path_w_coordinates, parse_dates=['fecha_hora'])

In [22]:
trusted_data_reduced_w_coordinates = add_coordinates(trusted_data_reduced, inplace=False)

Reading the data in BD_parking_trusted_data_w_coordinates.csv


In [23]:
#Useless
trusted_data_reduced_w_coordinates

Unnamed: 0,id_cuadra,operacion,patente,fecha_hora,fecha,direccion,latitud,longitud
0,82,Entrada,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647,-37.324173,-59.132848
1,82,Salida,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647,-37.324173,-59.132848
2,82,Entrada,KZD509,2018-01-02 09:06:00,2018-01-02,General Paz 647,-37.324173,-59.132848
3,82,Entrada,LSD853,2018-01-02 09:42:00,2018-01-02,General Paz 647,-37.324173,-59.132848
4,82,Salida,KZD509,2018-01-02 09:45:00,2018-01-02,General Paz 647,-37.324173,-59.132848
...,...,...,...,...,...,...,...,...
690665,92,Salida,PGW873,2018-06-11 10:23:00,2018-06-11,Tribunal de Faltas,-37.321185,-59.118682
690666,92,Entrada,DQF352,2018-06-12 12:58:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682
690667,92,Salida,DQF352,2018-06-12 12:59:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682
690668,92,Entrada,CSS238,2018-06-12 13:26:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682


In [24]:
trusted_data_path_w_coordinates = pd.read_csv(trusted_data_path_w_coordinates, parse_dates=['fecha_hora'])

In [25]:
#Useless
def add_ocupacion_old(d, inplace=False):
    '''
    Includes a new column called ocupacion and adds one to a row
        if its an "Entrada" and substract one if its "Salida".
    
    Parameters:
        d: (DataFrame) dataframe model to include the new column
        inplace: (Boolean) if use a reference of d or make a copy instead.
        
    Return:
        out: the d reference or copy that contains the new column ocupacion
            with the correct values.
    
    '''
    
    #If we have already done this work, read the file instead of
    #repeating all the process
    if not os.path.isfile(trusted_data_path_w_ocupacion):
        
        #To use a reference or a copy of the input
        if inplace:
            out = d
        else:
            out = d.copy()

        #Create the new column
        out['ocupacion'] = 0
        
        #For every street ...
        for street in range(2, out['id_cuadra'].max(axis=0) + 1):
            
            print('Processing street nº: ', street)
            
            #if there is no data of that street, we can pass to the next one.
            if not out[out['id_cuadra'] == street].empty:
                
                #Get all the rows of the actual street
                actual_cuadra = out.loc[out['id_cuadra'] == street]

                #Get the last timestamp we have used
                last_fecha_hora = actual_cuadra['fecha_hora'].iloc[0].day

                last_ocupacion = 0

                #For all rows of that street
                for i in range(actual_cuadra.shape[0]):

                    actual_fecha_hora = actual_cuadra['fecha_hora'].iloc[i].day
                    #We want to know the actual row index to write out, otherwise 
                    #we must be using a copy instead of a reference, so we couldnt be
                    #able to replace the values
                    row_index = int(actual_cuadra.iloc[i].to_frame().columns.values[0])

                    #If the day has changed put last_ocupacion in 0, so we can restart
                    #every day (heuristic).
                    if last_fecha_hora != actual_fecha_hora:
                        last_fecha_hora = actual_cuadra['fecha_hora'].iloc[i]
                        last_ocupacion = 0

                    #If its an Entrada then sum 1 to ocupacion
                    if actual_cuadra['operacion'].iloc[i] == 'Entrada':
                        last_ocupacion += 1

                    #If its a Salida then subs 1 to ocupacion
                    elif actual_cuadra['operacion'].iloc[i] == 'Salida' and last_ocupacion > 0:
                        last_ocupacion -= 1

                    #Get the column ocupacion of the actual row so we can 
                    #put there the last_ocupacion
                    out.loc[out.index == row_index, 'ocupacion'] = last_ocupacion

        #save the file, this is time consuming algorithm, we dont want to do it
        #everytime
        print('Creating and saving the data in BD_parking_w_direccion_w_ocupacion.csv')
        out.to_csv(trusted_data_path_w_ocupacion, index=False)     
        if not inplace:
            return out
    else:
        if not inplace:
            #if we have already processed this data, we only read it from the csv
            print('Reading the data in BD_parking_trusted_data_w_ocupacion.csv')
            return pd.read_csv(trusted_data_path_w_ocupacion, parse_dates=['fecha_hora'])
        else:
            out = pd.read_csv(trusted_data_path_w_ocupacion, parse_dates=['fecha_hora'])

In [26]:
#Useless
#VER SI ESTO ES LO MISMO O ES MAS OPTIMO
actual_val = 0

def sum_ocup(op):
    
    global actual_val
    
    if (op == 'Entrada'):
        actual_val += 1
    else:
        if actual_val > 0:
            actual_val -= 1
    return actual_val
   
#Add ocupacion
x = trusted_data_reduced_w_coordinates.copy().reset_index()
x['ocupacion'] = x['operacion'].apply(sum_ocup)

x

Unnamed: 0,index,id_cuadra,operacion,patente,fecha_hora,fecha,direccion,latitud,longitud,ocupacion
0,0,82,Entrada,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647,-37.324173,-59.132848,1
1,1,82,Salida,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647,-37.324173,-59.132848,0
2,2,82,Entrada,KZD509,2018-01-02 09:06:00,2018-01-02,General Paz 647,-37.324173,-59.132848,1
3,3,82,Entrada,LSD853,2018-01-02 09:42:00,2018-01-02,General Paz 647,-37.324173,-59.132848,2
4,4,82,Salida,KZD509,2018-01-02 09:45:00,2018-01-02,General Paz 647,-37.324173,-59.132848,1
...,...,...,...,...,...,...,...,...,...,...
690665,690665,92,Salida,PGW873,2018-06-11 10:23:00,2018-06-11,Tribunal de Faltas,-37.321185,-59.118682,83
690666,690666,92,Entrada,DQF352,2018-06-12 12:58:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682,84
690667,690667,92,Salida,DQF352,2018-06-12 12:59:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682,83
690668,690668,92,Entrada,CSS238,2018-06-12 13:26:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682,84


In [27]:
def add_ocupacion(d, inplace=False):
    '''
    Includes a new column called ocupacion and adds one to a row
        if its an "Entrada" and substract one if its "Salida".
    
    Parameters:
        d: (DataFrame) dataframe model to include the new column
        inplace: (Boolean) if use a reference of d or make a copy instead.
        
    Return:
        out: the d reference or copy that contains the new column ocupacion
            with the correct values.
    
    '''
    
    #If we have already done this work, read the file instead of
    #repeating all the process
    if not os.path.isfile(trusted_data_path_w_ocupacion):
        
        def sum_ocup(row):
            '''
            Just for the apply func. If the row value is Entrada we sum 1,
            otherwise we subs 1.
            On each day we restart the actual_ocup.
            '''
            nonlocal actual_ocup, last_day
            
            if pd.to_datetime(row['fecha']).day != last_day:
                actual_ocup = 0
                last_day = pd.to_datetime(row['fecha']).day
                
            if row['operacion'] == 'Entrada':
                actual_ocup += 1
            else:
                if row['operacion'] == 'Salida':
                    if actual_ocup > 0:
                        actual_ocup -= 1
            return actual_ocup
        
        #To use a reference or a copy of the input
        if inplace:
            out = d
        else:
            out = d.copy()
        
        #For every street ...
        for street in range(2, out['id_cuadra'].max(axis=0) + 1):
            
            print('Processing street nº: ', street)
            
            actual_ocup = 0
            last_day = pd.to_datetime(out['fecha'].min()).day
            out.loc[out['id_cuadra'] == street, 'ocupacion'] = out.loc[out['id_cuadra'] == street, ['operacion', 'fecha']].apply(sum_ocup, axis=1)
            
        print('Creating and saving the data in BD_parking_w_direccion_w_ocupacion.csv')
        out.to_csv(trusted_data_path_w_ocupacion, index=False)     
        if not inplace:
            return out
    else:
        print('Reading the data in BD_parking_trusted_data_w_ocupacion.csv')
        if not inplace:
            #if we have already processed this data, we only read it from the csv
            return pd.read_csv(trusted_data_path_w_ocupacion, parse_dates=['fecha_hora'])
        else:
            out = pd.read_csv(trusted_data_path_w_ocupacion, parse_dates=['fecha_hora'])

In [28]:
trusted_data_reduced_w_ocupacion = add_ocupacion(trusted_data_reduced_w_coordinates, inplace=False)

Reading the data in BD_parking_trusted_data_w_ocupacion.csv


In [55]:
trusted_data_reduced_w_ocupacion = pd.read_csv(trusted_data_path_w_ocupacion)
trusted_data_reduced_w_ocupacion

Unnamed: 0,id_cuadra,operacion,patente,fecha_hora,fecha,direccion,latitud,longitud,ocupacion
0,82,Entrada,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647,-37.324173,-59.132848,1.0
1,82,Salida,BVQ775,2018-01-01 07:11:00,2018-01-01,General Paz 647,-37.324173,-59.132848,0.0
2,82,Entrada,KZD509,2018-01-02 09:06:00,2018-01-02,General Paz 647,-37.324173,-59.132848,1.0
3,82,Entrada,LSD853,2018-01-02 09:42:00,2018-01-02,General Paz 647,-37.324173,-59.132848,2.0
4,82,Salida,KZD509,2018-01-02 09:45:00,2018-01-02,General Paz 647,-37.324173,-59.132848,1.0
...,...,...,...,...,...,...,...,...,...
690665,92,Salida,PGW873,2018-06-11 10:23:00,2018-06-11,Tribunal de Faltas,-37.321185,-59.118682,0.0
690666,92,Entrada,DQF352,2018-06-12 12:58:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682,1.0
690667,92,Salida,DQF352,2018-06-12 12:59:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682,0.0
690668,92,Entrada,CSS238,2018-06-12 13:26:00,2018-06-12,Tribunal de Faltas,-37.321185,-59.118682,1.0
