In [1]:
import urllib.request
from bs4 import BeautifulSoup
import json
import os
import pandas as pd
import time

In [2]:
def save_json_data(url, timeout=30.0):
    '''
    Download the json published at url and pass it to process_json_data
    (needs to be a gpssumo.com json)
    
    Parameters:
        url: needs to be http://gpssumo.com/parquimetros/get_PA/ + some index, 
            e.g. id_cuadra
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever
    
    Returns:
        None. The parsed records are handed to process_json_data, which
        updates the global last_data / filtered_data frames.
    
    Raises:
        urllib.error.URLError (or a socket timeout) if the download fails,
        ValueError if the body cannot be parsed as json.
    '''
    #Fetch the page from the url given by the caller; the with-block closes the
    #connection even when parsing fails
    with urllib.request.urlopen(url, timeout=timeout) as response:
        #Extract the text from the html
        plain_data = BeautifulSoup(response, 'lxml').text

    try:
        #Try the payload untouched first, so apostrophes inside valid json
        #strings are not corrupted
        json_data = json.loads(plain_data)
    except ValueError:
        #Fallback for single-quoted payloads: json requires " as string delimiter.
        #NOTE(review): this still breaks on values that contain an apostrophe
        jsoned_data = plain_data.replace('\'',"\"")
        json_data = json.loads(jsoned_data)
    
    process_json_data(json_data)

In [3]:
def _append_record(frame, record):
    '''Return a new frame made of frame plus one extra row built from the dict record.'''
    row = pd.DataFrame([record])
    if frame.empty:
        #Nothing to concatenate with: keep the frame's column layout and skip
        #pd.concat, which warns about empty inputs on recent pandas
        columns = list(frame.columns) + [c for c in row.columns if c not in frame.columns]
        return row.reindex(columns=columns)
    return pd.concat([frame, row], ignore_index=True)


def process_json_data(json_data):
    '''
    Process the necessary information from the
    json data.
    Only records whose 'ocupacion' differs from the last one stored for the
    same id_cuadra are kept, because we dont want repeated information.
    
    Parameters:
        json_data: iterable of dicts from http://gpssumo.com/parquimetros/get_PA/ + some index, 
            e.g. id_cuadra. Every dict must have the keys id_cuadra, direccion,
            fecha_a, hora_a, ocupacion, lugares_cuadra, ocupacion_max, color
            and trans_prk_dia.
    
    Returns:
        None. The global frames are updated instead:
            last_data: holds the newest row for every id_cuadra
            filtered_data: gets every new row appended (rows pending to be saved)
    
    Raises:
        KeyError if a record lacks one of the expected keys.
    '''
    
    #Both frames live at module level and are rebound here on every change
    global last_data, filtered_data
    
    for data in json_data:
        #Getting the possible new data
        possible_new_data = {'id_cuadra': data['id_cuadra'], 
                             'direccion': data['direccion'],
                             'fecha': data['fecha_a'],
                             'tiempo': data['hora_a'],
                             'ocupacion': data['ocupacion'],
                             'lugares_cuadra': data['lugares_cuadra'],
                             'ocupacion_max': data['ocupacion_max'],
                             'dispon_parq': data['color'],
                             'altas_bajas(dia)': data['trans_prk_dia']}

        #Getting the id_cuadra, only for comparison reasons
        id_cuadra = str(possible_new_data['id_cuadra'])

        #Compare ids as text on both sides: a frame reloaded from csv may hold
        #them as numbers while the id here is always a str
        same_cuadra = last_data['id_cuadra'].astype(str) == id_cuadra

        #obtaining the last data (ocupacion) of current id_cuadra
        actual_last_data = last_data.loc[same_cuadra, 'ocupacion']

        new_ocupacion = possible_new_data['ocupacion']
        #Unchanged when the stored value equals the new one, either directly or
        #by its text form (covers numbers in the csv against strings in the json)
        unchanged = (not actual_last_data.empty) and bool(
            (actual_last_data == new_ocupacion).any()
            or (actual_last_data.astype(str) == str(new_ocupacion)).any())

        #Append when the id_cuadra was never seen or its ocupacion changed
        if not unchanged:
            #overwriting the existing (or not) value for id_cuadra from last_data
            last_data = _append_record(last_data[~same_cuadra], possible_new_data)

            #Here we need to put the possible new data to a file instead of append it to filtered_data
            filtered_data = _append_record(filtered_data, possible_new_data)

In [4]:
#Endpoint polled by the scraper
url = "http://gpssumo.com/parquimetros/get_PA/id_cuadra"
#csv file (relative to the working directory) where the filtered rows are stored
out_path = "SUMO_data.csv"

In [5]:
#Set up the module-level frames the scraper works with.
#filtered_data: rows waiting to be written to the output file
filtered_data = pd.DataFrame(
    columns=[
        'id_cuadra',
        'direccion',
        'fecha',
        'tiempo',
        'ocupacion',
        'lugares_cuadra',
        'ocupacion_max',
        'dispon_parq',
        'altas_bajas(dia)',
    ]
)

#possible_new_data: empty frame with the same layout, for freshly read data
possible_new_data = filtered_data.copy()

#last_data: one row per id_cuadra with the last data appended to filtered_data,
#used to keep a single copy of each reading in filtered_data
last_data = filtered_data.copy()
if os.path.isfile(out_path):
    #A previous run left a csv behind: resume from the last stored value of every id_cuadra
    csv_data = pd.read_csv(out_path, sep=',')
    last_data = csv_data.groupby('id_cuadra').last().reset_index()

In [6]:
#Poll the endpoint forever, once per minute, appending every new row to the csv.
#Stop it by interrupting the kernel (KeyboardInterrupt is no longer swallowed).
i = 0
internet_on = True
#seconds between two polls
poll_seconds = 60.0
while True:
    #remember when the iteration started, so the sleep below keeps a fixed cadence
    starttime = time.time()
    
    print('Iteration: ', i)
    i+=1
    #try/except to survive a missing internet connection or any other runtime problem
    try:
        #save_json_data fills the global filtered_data with the rows to write
        print('Trying to save some data...')
        save_json_data(url)
        #Skip the write when there is nothing new, so no header-only file is created
        if filtered_data is not None and not filtered_data.empty:
            #Only a brand new file gets the header line; later writes append bare rows
            write_header = not os.path.isfile(out_path)
            filtered_data.to_csv(out_path, index=False, mode='a', header=write_header)
            #Rows are on disk now: empty the buffer but keep its columns
            filtered_data = filtered_data.iloc[0:0]
        print('Data saved succesfully!')
        
    except Exception as error:
        #Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        #On any failure just report it and wait for the next iteration.
        print('Having some kind of problem (maybe with the internet), huh? Ill sleep, cya')
        print('Problem details:', repr(error))
        
    #sleep until the next multiple of poll_seconds counted from starttime
    time.sleep(poll_seconds - ((time.time() - starttime) % poll_seconds))

Iteration:  0
Trying to save some data...
Data saved succesfully!
Iteration:  1
Trying to save some data...
Data saved succesfully!
Iteration:  2
Trying to save some data...
Data saved succesfully!
Iteration:  3
Trying to save some data...
Data saved succesfully!
Iteration:  4
Trying to save some data...
Data saved succesfully!
Iteration:  5
Trying to save some data...
Data saved succesfully!
Iteration:  6
Trying to save some data...
Data saved succesfully!
Iteration:  7
Trying to save some data...
Data saved succesfully!
Iteration:  8
Trying to save some data...
Data saved succesfully!
Iteration:  9
Trying to save some data...
Data saved succesfully!
Iteration:  10
Trying to save some data...
Data saved succesfully!
Iteration:  11
Trying to save some data...
Data saved succesfully!
Iteration:  12
Trying to save some data...
Data saved succesfully!
Iteration:  13
Trying to save some data...
Data saved succesfully!
Iteration:  14
Trying to save some data...
Data saved succesfully!
Itera

KeyboardInterrupt: 