## Before doing all of this, be sure to copy the data from the server, then delete it from there!

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Locations of the files this notebook reads and writes.
#
# data_path: raw CSV obtained by the scraper (without any possible
#     duplicated data). Read into the `data` DataFrame.
# data_w_operacion_path: CSV holding the rows with their corresponding
#     operacion (Entrada/Salida). All runs are saved into this same file.
# old_data_idx_path: text file holding the last index used to number the
#     old SUMO_data files (an integer counter, not a DataFrame).

# Single place to change when the data folder moves.
DATA_DIR = r'C:\Users\ing_l\Tesis grado\Data'

data_path = DATA_DIR + r'\SUMO_data.csv'

#Because we want to save all the data in the same file
data_w_operacion_path = DATA_DIR + r'\SUMO_data_w_operacion.csv'

old_data_idx_path = DATA_DIR + r'\old_SUMO_data\SUMO_data_index.txt'

In [3]:
# Load the raw scraper output (comma-separated) into a DataFrame
data = pd.read_csv(data_path, sep=',')

# Column names of the raw data plus the extra 'operacion' column
data_columns = np.append(np.asarray(data.columns), 'operacion')

In [4]:
# Join the text columns 'fecha' and 'tiempo' and parse them into one
# datetime column. The dates are day-first (e.g. 27-11-19 -> 2019-11-27).
# The whole column is parsed in a single vectorised call instead of calling
# pd.to_datetime once per row.
data['fecha_tiempo'] = pd.to_datetime(data['fecha'] + ' ' + data['tiempo'], dayfirst=True)
data

Unnamed: 0,id_cuadra,direccion,fecha,tiempo,ocupacion,lugares_cuadra,ocupacion_max,dispon_parq,altas_bajas(dia),fecha_tiempo
0,2,General Pinto 545,27-11-19,13:58:37,12,27.0,18,green,137,2019-11-27 13:58:37
1,3,San Martín 452,27-11-19,13:58:49,9,19.0,13,green,53,2019-11-27 13:58:49
2,4,Rodriguez 552,27-11-19,13:59:10,10,28.0,14,green,97,2019-11-27 13:59:10
3,5,9 de Julio 441,27-11-19,13:59:38,6,26.0,16,green,121,2019-11-27 13:59:38
4,6,General Pinto 759,27-11-19,13:59:44,10,30.0,13,green,133,2019-11-27 13:59:44
...,...,...,...,...,...,...,...,...,...,...
46536,84,General Paz 845,09-12-19,10:02:39,4,18.0,4,green,4,2019-12-09 10:02:39
46537,89,Av. España 737,09-12-19,10:04:40,4,19.0,4,green,6,2019-12-09 10:04:40
46538,90,Av. España 843,09-12-19,10:03:55,1,15.0,1,green,1,2019-12-09 10:03:55
46539,97,Av. España 774,09-12-19,10:05:11,3,,3,green,5,2019-12-09 10:05:11


In [5]:
# Clean the raw readings in one chain (same steps, same order):
#   1. order by street and timestamp so consecutive rows can be compared,
#   2. keep only the first row for each (id_cuadra, fecha_tiempo) pair,
#   3. keep only the rows whose 'dispon_parq' is 'green' (the useful ones),
#   4. renumber the index from 0.
data = (
    data
    .sort_values(['id_cuadra', 'fecha_tiempo'])
    .drop_duplicates(subset=['id_cuadra', 'fecha_tiempo'], keep='first')
    .loc[lambda frame: frame['dispon_parq'] == 'green']
    .reset_index(drop=True)
)

In [6]:
def create_row_w_operation(row, operacion, ocupacion=None, time=None, infor=False):
    '''
    Return a copy of row with an extra 'operacion' entry.

    The input row is never modified. Optionally the copy also gets a new
    'fecha_tiempo' (when time is given) and a new 'ocupacion' (when infor
    is true and ocupacion is given).

    Parameters:
        row: (Series or DataFrame) current row to copy and extend
        operacion: (None, 'Entrada', 'Salida') value stored under 'operacion'
        ocupacion: (int) current ocupacion of the street. Only used when
            infor is true: it is increased by 1 for 'Entrada', decreased by 1
            for 'Salida', and the result is written to the copy's 'ocupacion'.
            For any other operacion it is returned unchanged.
        time: (timestamp) if not None, replaces 'fecha_tiempo' in the copy
        infor: (boolean) True when the caller is generating intermediate
            rows in a loop and needs the running ocupacion back.

    Return:
        new_row when infor is false or ocupacion is None;
        otherwise the tuple (new_row, ocupacion) with the updated ocupacion.
    '''

    new_row = row.copy()
    new_row['operacion'] = operacion

    # Identity checks on purpose: "!= None" is delegated to the operand's
    # own comparison, which pandas/numpy objects may evaluate element-wise.
    if time is not None:
        new_row['fecha_tiempo'] = time

    if infor and ocupacion is not None:
        if operacion == 'Entrada':
            ocupacion = ocupacion + 1
            new_row['ocupacion'] = ocupacion
        elif operacion == 'Salida':
            ocupacion = ocupacion - 1
            new_row['ocupacion'] = ocupacion
        return new_row, ocupacion

    return new_row


def get_aproximate_time(time1, time2, div, it):
    '''
    Heuristically get the time of the data that doesnt have time.

    The interval between time1 and time2 is split into div equal parts and
    the returned time is time2 moved back by (div - it) of those parts,
    truncated to whole minutes. The seconds of time2 are kept.

    Parameters:
        time1: (Timestamp) the time of the first row (less than time2)
        time2: (Timestamp) the time of the second row (more than time1)
        div: (int) total range of the loop
        it: (int) number of iteration in the loop

    Returns:
        new_time: the new time for the new row
    '''

    #Difference between time1 and time2, in (fractional) minutes
    elapsed_minutes = (time2 - time1) / np.timedelta64(1, 'm')

    #Length of one part, and how many whole minutes to go back from time2
    minutes_per_part = elapsed_minutes / div
    minutes_back = int(minutes_per_part * (div - it))

    #Real time arithmetic, so the subtraction can cross hour and day
    #boundaries. Replacing only the minute field cannot do that, and
    #clamping to HH:00:00 could produce a time earlier than time1.
    return time2 - pd.Timedelta(minutes=minutes_back)


def get_data_w_operacion():
    '''
    Process the module-level `data` frame and return its rows labelled
    with the operacion ('Entrada' / 'Salida') they represent.

    `data` is walked row by row and each row is compared with the previous
    row of the same id_cuadra:
        - the first row of every id_cuadra is emitted with operacion None;
        - if ocupacion did not change, nothing is emitted;
        - if ocupacion changed by k, k rows are emitted: k-1 intermediate
          rows (copies of the previous row with ocupacion moved one step at
          a time and an approximated fecha_tiempo) followed by the current
          row itself, all with the same operacion.

    The rows of `data` are expected to be grouped by id_cuadra and ordered
    by fecha_tiempo inside each group.

    Returns:
        out_data: (DataFrame) the data processed with their operation. Its
            columns are the columns of `data` plus 'operacion'; its index
            labels are those of the source rows (intermediate rows repeat
            the label of the previous row).
    '''

    #All the columns from data plus the column 'operacion'
    data_columns = np.append(data.columns.values, 'operacion')

    #Trusted rows are collected in a list and turned into a DataFrame once
    #at the end: DataFrame.append was removed in pandas 2.0 and growing a
    #frame row by row is quadratic.
    out_rows = []

    #Last row we've visited (None until the first row is seen)
    last_row = None

    #Iterate over all the preprocessed data
    for _, row in data.iterrows():

        #First row overall, or first row of a different street: emit it
        #without operacion and restart the comparison from it
        if last_row is None or last_row['id_cuadra'] != row['id_cuadra']:
            last_row = create_row_w_operation(row, None)
            out_rows.append(last_row)
            continue

        #Same street in last_row and row: compare their ocupacion
        previous_ocup = int(last_row['ocupacion'])
        current_ocup = int(row['ocupacion'])
        steps = abs(previous_ocup - current_ocup)

        #More ocupacion than before is an Entrada, less is a Salida,
        #no change leaves operacion as None
        operacion = None
        if previous_ocup < current_ocup:
            operacion = 'Entrada'
        elif previous_ocup > current_ocup:
            operacion = 'Salida'

        #Heuristic to have different time in each of the rows created below
        first_time = last_row['fecha_tiempo']
        last_time = row['fecha_tiempo']

        #Running ocupacion for the intermediate rows
        last_ocup = last_row['ocupacion']

        #For every difference in ocupacion...
        #e.g. last_row[ocup] = 6, row[ocup] = 3
        #we add 3 rows with operacion Salida (2 intermediate + row itself)
        for step in range(1, steps + 1):
            if step < steps:
                #Intermediate row: copy of last_row with its ocupacion moved
                #one step and an approximated time between both readings
                aprox_time = get_aproximate_time(first_time, last_time, steps + 1, step)
                new_row, last_ocup = create_row_w_operation(
                    last_row, operacion, ocupacion=last_ocup, time=aprox_time, infor=True)
            else:
                #Last step: the real reading, with its own ocupacion and
                #fecha_tiempo untouched
                new_row = create_row_w_operation(row, operacion, infor=False)
            out_rows.append(new_row)

        last_row = create_row_w_operation(row, operacion)

    return pd.DataFrame(out_rows, columns=data_columns)

In [7]:
print('Processing all the data...')

# Build the labelled rows; get_data_w_operacion reads the module-level `data`
data_w_operacion = get_data_w_operacion()

print('Job complete!')

Processing all the data...
Job complete!


In [8]:
print('Saving the new trusted data and the last readed row...')

# An existing file must not be overwritten: the new rows are appended to it
# without a header. Otherwise the file is created with the column names on
# the first line.
output_exists = os.path.isfile(data_w_operacion_path)
data_w_operacion.to_csv(
    data_w_operacion_path,
    index=False,
    mode='a' if output_exists else 'w',
    header=not output_exists,
)

print('Saved succesfully!')

Saving the new trusted data and the last readed row...
Saved succesfully!


In [9]:
#Index used to number the processed SUMO_data.csv files when they are moved
#into the old-data folder. On first use the counter file is created with 0.
if not os.path.isfile(old_data_idx_path):
    # `with` guarantees the file is closed even if the write fails
    with open(old_data_idx_path, "w") as f:
        f.write('0')

In [10]:
# Read the current counter as text. Surrounding whitespace (e.g. a trailing
# newline left by a manual edit) is stripped because the value is used as
# part of a file name.
with open(old_data_idx_path, "r") as f:
    old_data_index = f.read().strip()

In [11]:
# Move the processed raw file into the old-data folder, numbered with the
# current counter value
archived_data_path = rf'C:\Users\ing_l\Tesis grado\Data\old_SUMO_data\SUMO_data_{old_data_index}.csv'
os.rename(data_path, archived_data_path)

In [12]:
#save the new index (old + 1)
# The new value is computed BEFORE the file is opened: opening with 'w'
# truncates it, so a failing int() would otherwise leave the counter empty.
old_data_index = str(int(old_data_index) + 1)
with open(old_data_idx_path, 'w') as f:
    f.write(old_data_index)

In [13]:
# Read the accumulated output file back and display it as a sanity check
aux = pd.read_csv(data_w_operacion_path)
aux

Unnamed: 0,id_cuadra,direccion,fecha,tiempo,ocupacion,lugares_cuadra,ocupacion_max,dispon_parq,altas_bajas(dia),fecha_tiempo,operacion
0,2,General Pinto 545,11-09-19,17:15:31,6,27.0,17,green,184,2019-09-11 17:15:31,
1,2,General Pinto 545,11-09-19,17:20:25,7,27.0,17,green,184,2019-09-11 17:20:25,Entrada
2,2,General Pinto 545,11-09-19,17:34:54,6,27.0,17,green,190,2019-09-11 17:34:54,Salida
3,2,General Pinto 545,11-09-19,17:40:09,5,27.0,17,green,191,2019-09-11 17:40:09,Salida
4,2,General Pinto 545,11-09-19,17:45:05,6,27.0,17,green,192,2019-09-11 17:45:05,Entrada
...,...,...,...,...,...,...,...,...,...,...,...
264063,99,Av. España 970,09-12-19,08:55:58,1,,1,green,1,2019-12-09 08:55:58,Entrada
264064,99,Av. España 970,09-12-19,09:24:34,2,,2,green,2,2019-12-09 09:24:34,Entrada
264065,99,Av. España 970,09-12-19,09:43:53,3,,3,green,3,2019-12-09 09:43:53,Entrada
264066,99,Av. España 970,09-12-19,09:49:53,4,,4,green,4,2019-12-09 09:49:53,Entrada
