In [1]:
import csv
import datetime
import json
import os

import pandas as pd
import requests

In [4]:
STARTING_DATE = '2022-10-12' # The date when the scraping started
ENDING_DATE = '2022-10-31' # the date when the scraping ended
DOWNLOADED_VARIABLES = [ 15, 18, 26, 21 ] # avg temp of air, avg humidity, precipitation, avg wind speed
DOWNLOAD_LOCATION = './data/'
LOCATION_ID = 1828 # Ljubljana Bežigrad

In [3]:
variables_string = ','.join(str(variable) for variable in DOWNLOADED_VARIABLES)

query_parameters = {'d1': STARTING_DATE, 'd2': ENDING_DATE, "lang": 'si',
                    'vars': variables_string, 'group': 'halfhourlyData0', 'type': 'halfhourly', 'id': LOCATION_ID}

query_parameters

{'d1': '2022-10-12',
 'd2': '2022-10-31',
 'lang': 'si',
 'vars': '15,18,26,21',
 'group': 'halfhourlyData0',
 'type': 'halfhourly',
 'id': 1828}

In [4]:
data_request_url = "https://meteo.arso.gov.si/webmet/archive/data.xml"
data_xml = requests.get(data_request_url, params=query_parameters)
data_xml.encoding = 'utf-8'

data_string = data_xml.text

len(data_string)

# write the string to temp file
with open('temp.xml', 'w', encoding='utf-8') as f:
    f.write(data_string)

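The following part is rather ugly: the payload embedded in the response is not valid JSON (its keys are unquoted), so it is patched by hand before it can be parsed.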

### Helper functions

In [5]:
def ugly_fix_for_json(json_string):
    characters_to_skip = ['{', '}', ',', ':', '"', ' ', '\\', "'", '[', ']']
    i = 0
    while i < len(json_string):
        if (json_string[i] not in characters_to_skip):
            json_string = json_string[0:i] + '"' + json_string[i:]
            i += 1
            while (json_string[i] not in characters_to_skip):
                i += 1
            json_string = json_string[0:i] + '"' + json_string[i:]
        elif (json_string[i] == '"'):
            i += 1
            while (json_string[i] != '"'):
                i += 1
        i += 1
    return json_string


def ugly_fix_time(time_string):
    time_number = int(time_string[1:])
    # 2022-12-31 00:00 - 117285120
    known_time_coded = 117285120
    known_time_dt = datetime.datetime(2022, 12, 31, 0, 0, 0)
    time_difference = time_number - known_time_coded  # minutes
    new_date = known_time_dt + datetime.timedelta(minutes=time_difference)
    # format date as yyyy-mm-dd-hh-mm
    return new_date.strftime("%Y-%m-%d-%H-%M")

In [6]:
all_data_points = []

index_start = data_string.index('CDATA[AcademaPUJS.set(')
index_end = data_string.index(')]]></pujs>')
# this is ugly
data_string = data_string[index_start+22:index_end]
data_string = ugly_fix_for_json(data_string)
data_string = data_string.replace("\"'\"", "'")

data_value = json.loads(data_string)
data_value

{'baseurl': './icons/',
 'gen': "ACADEMA.AEP.getGenerators('archive').genTerminData('yyyy-MM-dd HH:mm')",
 'datatype': 'halfhourly',
 'o': ['p0', 'p1', 'p2', 'p3'],
 'params': {'p0': {'pid': '15',
   'name': 't2m',
   's': 'povp. T',
   'l': 'povprečna temperatura zraka na 2m (°C)',
   'unit': '°C'},
  'p1': {'pid': '18',
   'name': 'rh',
   's': 'povp. rel. vla.',
   'l': 'povprečna relativna vlaga (%)',
   'unit': '%'},
  'p2': {'pid': '26',
   'name': 'padavine',
   's': 'količina padavin',
   'l': 'količina padavin (mm)',
   'unit': 'mm'},
  'p3': {'pid': '21',
   'name': 'veter_hitrost',
   's': 'hitrost vetra',
   'l': 'povprečna hitrost vetra (m/s)',
   'unit': 'm/s'}},
 'points': {'_1828': {'_117169920': {'p0': '14',
    'p1': '86',
    'p2': '0',
    'p3': '0.7'},
   '_117169930': {'p0': '14', 'p1': '86', 'p3': '0.4'},
   '_117169940': {'p0': '14.1', 'p1': '86', 'p3': '0.7'},
   '_117169950': {'p0': '14', 'p1': '86', 'p2': '0', 'p3': '0.6'},
   '_117169960': {'p0': '13.9', 'p1

In [7]:
# Metadata of the downloaded variables, keyed by the parameter ids ('p0', 'p1', ...) that the points use
parameters = data_value['params']
parameters

{'p0': {'pid': '15',
  'name': 't2m',
  's': 'povp. T',
  'l': 'povprečna temperatura zraka na 2m (°C)',
  'unit': '°C'},
 'p1': {'pid': '18',
  'name': 'rh',
  's': 'povp. rel. vla.',
  'l': 'povprečna relativna vlaga (%)',
  'unit': '%'},
 'p2': {'pid': '26',
  'name': 'padavine',
  's': 'količina padavin',
  'l': 'količina padavin (mm)',
  'unit': 'mm'},
 'p3': {'pid': '21',
  'name': 'veter_hitrost',
  's': 'hitrost vetra',
  'l': 'povprečna hitrost vetra (m/s)',
  'unit': 'm/s'}}

In [8]:
# Measurements are nested under the station key: the location id prefixed with '_'
station_key = f'_{LOCATION_ID}'
points = data_value['points'][station_key]
points

{'_117169920': {'p0': '14', 'p1': '86', 'p2': '0', 'p3': '0.7'},
 '_117169930': {'p0': '14', 'p1': '86', 'p3': '0.4'},
 '_117169940': {'p0': '14.1', 'p1': '86', 'p3': '0.7'},
 '_117169950': {'p0': '14', 'p1': '86', 'p2': '0', 'p3': '0.6'},
 '_117169960': {'p0': '13.9', 'p1': '87', 'p3': '0.7'},
 '_117169970': {'p0': '13.8', 'p1': '87', 'p3': '0.9'},
 '_117169980': {'p0': '13.6', 'p1': '88', 'p2': '0', 'p3': '0.3'},
 '_117169990': {'p0': '13.4', 'p1': '88', 'p3': '0.6'},
 '_117170000': {'p0': '13.1', 'p1': '89', 'p3': '0.8'},
 '_117170010': {'p0': '13', 'p1': '89', 'p2': '0', 'p3': '1'},
 '_117170020': {'p0': '12.8', 'p1': '90', 'p3': '1'},
 '_117170030': {'p0': '12.7', 'p1': '90', 'p3': '1'},
 '_117170040': {'p0': '12.5', 'p1': '91', 'p2': '0', 'p3': '1.1'},
 '_117170050': {'p0': '12.4', 'p1': '92', 'p3': '1.3'},
 '_117170060': {'p0': '12.2', 'p1': '92', 'p3': '0.6'},
 '_117170070': {'p0': '11.9', 'p1': '92', 'p2': '0', 'p3': '0.4'},
 '_117170080': {'p0': '11.9', 'p1': '93', 'p3': '1'}

In [9]:
# Build a DataFrame with 5 columns: the time plus one column per downloaded variable.

# Response parameter key -> DataFrame column it fills
# (the 'params' entry of the response shows which variable each key stands for)
column_by_parameter = {
    'p0': 'temperature at 2m',
    'p1': 'relative humidity',
    'p2': 'precipitation',
    'p3': 'wind_speed',
}

# A point may omit a parameter; the last value seen for it is reused in that case.
# Until a first value has been seen the placeholder string 'null' is used.
# NOTE(review): in the sample output 'p2' is present only on some points, so its value
# is repeated on the rows in between - confirm that is intended for precipitation.
last_seen = dict.fromkeys(column_by_parameter, 'null')

# Collect the rows in a list and build the frame once; growing the frame with
# pd.concat inside the loop copies every existing row on each iteration.
rows = []
for point_ugly_time, measurements in points.items():
    time = ugly_fix_time(point_ugly_time)
    for parameter_key in column_by_parameter:
        last_seen[parameter_key] = measurements.get(parameter_key, last_seen[parameter_key])
    rows.append([time, *last_seen.values()])

df = pd.DataFrame(rows, columns=['time', *column_by_parameter.values()])

df

Unnamed: 0,time,temperature at 2m,relative humidity,precipitation,wind_speed
0,2022-10-12-00-00,14,86,0,0.7
1,2022-10-12-00-10,14,86,0,0.4
2,2022-10-12-00-20,14.1,86,0,0.7
3,2022-10-12-00-30,14,86,0,0.6
4,2022-10-12-00-40,13.9,87,0,0.7
...,...,...,...,...,...
2875,2022-10-31-23-10,10.3,96,0,0.4
2876,2022-10-31-23-20,10.2,96,0,0.5
2877,2022-10-31-23-30,10.1,96,0,0.5
2878,2022-10-31-23-40,10,96,0,0.7


In [10]:
# to_csv does not create missing directories, so make sure the target directory exists
os.makedirs(DOWNLOAD_LOCATION or '.', exist_ok=True)
# dump the data to weather_data.csv with ';' as the separator, UTF-8 encoded, without the index column
df.to_csv(DOWNLOAD_LOCATION + 'weather_data.csv', sep=';', encoding='utf-8', index=False)

In [5]:
import pandas as pd

# Read the data back from weather_data.csv (';'-separated, UTF-8).
# TODO: filling in values for every 5-minute interval, as originally planned for this cell, is not implemented here.
df = pd.read_csv(DOWNLOAD_LOCATION + 'weather_data.csv', sep=';', encoding='utf-8')
df

Unnamed: 0,time,temperature at 2m,relative humidity,precipitation,wind_speed
0,2022-10-12-00-00,14.0,86,0.0,0.7
1,2022-10-12-00-10,14.0,86,0.0,0.4
2,2022-10-12-00-20,14.1,86,0.0,0.7
3,2022-10-12-00-30,14.0,86,0.0,0.6
4,2022-10-12-00-40,13.9,87,0.0,0.7
...,...,...,...,...,...
2875,2022-10-31-23-10,10.3,96,0.0,0.4
2876,2022-10-31-23-20,10.2,96,0.0,0.5
2877,2022-10-31-23-30,10.1,96,0.0,0.5
2878,2022-10-31-23-40,10.0,96,0.0,0.7
