In [1]:
import pandas as pd
import json
import requests
import datetime

In [2]:
def refactor_values(data):
    new_data = {}
    for key, value in data.items():
            if value != '':
                new_data[key] = value
            if isinstance(value, str) and '<' in value:
                new_value = value.split('<')[1]
                new_data[key] = int(new_value)
    return new_data

In [3]:
def fetch_data(src):
    url = "https://arsoxmlwrapper.app.grega.xyz/api/air/archive"
    response = requests.get(url)
    if response.status_code == 200:
        print("Fetched main datset")
        data = json.loads(response.content)
        with open(src, "w") as f:
            json.dump(data, f)
    else:
        print("Failed to retrieve JSON data")

In [16]:
def fetch_other_data(src1):
    # Get the current date and time
    current_date = datetime.datetime.now()

    # Get the date one month ago
    one_month_ago = current_date - datetime.timedelta(days=60)

    # Convert dates to Unix time
    current_unix_time = int(current_date.timestamp())
    one_month_ago_unix_time = int(one_month_ago.timestamp())

    lat = '46.5547222'
    lon = '15.6466667'

    end_date = datetime.datetime.utcfromtimestamp(
        current_unix_time).strftime('%Y-%m-%d')
    start_date = datetime.datetime.utcfromtimestamp(
        one_month_ago_unix_time).strftime('%Y-%m-%d')

    url = f'https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date={start_date}&end_date={end_date}&hourly=temperature_2m,relativehumidity_2m,precipitation,windspeed_10m'
    response = requests.get(url)
    if response.status_code == 200:
        print("Fetched weather history")
        data = json.loads(response.content)
        with open(src1, "w") as f:
            json.dump(data, f)
    else:
        print("Failed to retrieve JSON data")

In [17]:
fetch_data('../data/raw/data.json')
fetch_other_data('../data/raw/weather/data.json')

Fetched main datset
Fetched weather history


In [18]:
src = '../data/raw/data.json'
src1 = '../data/raw/weather/data.json'
dist = '../data/processed/data.csv'

In [28]:
f = open(src, 'r', encoding='utf-8')
raw = json.load(f)
f.close()

df = pd.DataFrame()

print('Transforming json to pandas dataframe...')
# prilagodimo json dataframe-u
for i in range(len(raw)):
    jdata = json.loads(raw[i]['json'])
    station = jdata['arsopodatki']['postaja']
    for i in range(len(station)):
        if station[i]['merilno_mesto'] == 'MB Titova':
            data = station[i]
            data = refactor_values(data)
            df = pd.concat([df, pd.json_normalize(data)])

print('Connecting data...')

df = df[['datum_od', 'pm10']]
df['pm10'].fillna((df['pm10'].mean()), inplace=True)
df['datum_od'] = pd.to_datetime(df['datum_od'])
df = df.sort_values(by='datum_od')
df = df.drop_duplicates(subset='datum_od', keep='first')

Transforming json to pandas dataframe...
Connecting data...


In [29]:
f = open(src1, 'r', encoding='utf-8')
raw = json.load(f)
f.close()

df1 = pd.DataFrame()
df1['date'] = raw['hourly']['time']
df1['date'] = pd.to_datetime(df1['date'])

df1['temp'] = raw['hourly']['temperature_2m']
df1['temp'].fillna(df1['temp'].mean(), inplace=True)

df1['hum'] = raw['hourly']['relativehumidity_2m']
df1['hum'].fillna(df1['hum'].mean(), inplace=True)

df1['percp'] = raw['hourly']['precipitation']
df1['percp'].fillna(df1['percp'].mean(), inplace=True)

df1['wspeed'] = raw['hourly']['windspeed_10m']
df1['wspeed'].fillna(df1['wspeed'].mean(), inplace=True)

In [21]:
df1


Unnamed: 0,date,temp,hum,percp,wspeed
0,2023-01-19 00:00:00,1.30000,99.00000,0.200000,10.100000
1,2023-01-19 01:00:00,1.50000,100.00000,0.700000,10.100000
2,2023-01-19 02:00:00,1.70000,99.00000,1.000000,13.000000
3,2023-01-19 03:00:00,1.60000,98.00000,1.000000,15.100000
4,2023-01-19 04:00:00,1.70000,87.00000,0.600000,17.000000
...,...,...,...,...,...
1459,2023-03-20 19:00:00,2.90216,75.62963,0.077778,9.361728
1460,2023-03-20 20:00:00,2.90216,75.62963,0.077778,9.361728
1461,2023-03-20 21:00:00,2.90216,75.62963,0.077778,9.361728
1462,2023-03-20 22:00:00,2.90216,75.62963,0.077778,9.361728


In [25]:
df

Unnamed: 0,datum_od,pm10,date
0,2023-02-15 19:00,56.0,2023-02-15 19:00:00
0,2023-02-15 20:00,56.0,2023-02-15 20:00:00
0,2023-02-15 21:00,49.0,2023-02-15 21:00:00
0,2023-02-15 22:00,45.0,2023-02-15 22:00:00
0,2023-02-15 23:00,43.0,2023-02-15 23:00:00
...,...,...,...
0,2023-03-20 12:00,39.0,2023-03-20 12:00:00
0,2023-03-20 13:00,38.0,2023-03-20 13:00:00
0,2023-03-20 14:00,39.0,2023-03-20 14:00:00
0,2023-03-20 15:00,35.0,2023-03-20 15:00:00


In [30]:
start = df['datum_od'].iloc[0]
end = df['datum_od'].iloc[-1]

start_index = df1.loc[df1['date'] == start].index[0]
end_index = df1.loc[df1['date'] == end].index[0]

print(start_index, end_index)

df1 = df1.iloc[start_index:end_index]



667 1456


In [22]:
df1

Unnamed: 0,date,temp,hum,percp,wspeed
187,2023-02-15 19:00:00,1.600000,88.000000,0.00000,6.800000
188,2023-02-15 20:00:00,0.200000,90.000000,0.00000,6.800000
189,2023-02-15 21:00:00,-0.300000,90.000000,0.00000,6.600000
190,2023-02-15 22:00:00,-0.900000,90.000000,0.00000,5.200000
191,2023-02-15 23:00:00,-1.300000,91.000000,0.00000,5.800000
...,...,...,...,...,...
723,2023-03-10 03:00:00,3.633854,75.263889,0.04184,8.096181
724,2023-03-10 04:00:00,3.633854,75.263889,0.04184,8.096181
725,2023-03-10 05:00:00,3.633854,75.263889,0.04184,8.096181
726,2023-03-10 06:00:00,3.633854,75.263889,0.04184,8.096181


In [28]:
df1 = df1.reset_index(drop=True)
df = df.reset_index(drop=True)
df1

Unnamed: 0,date,temp,hum,percp,wspeed
0,2023-02-15 19:00:00,1.600000,88.000000,0.00000,6.800000
1,2023-02-15 20:00:00,0.200000,90.000000,0.00000,6.800000
2,2023-02-15 21:00:00,-0.300000,90.000000,0.00000,6.600000
3,2023-02-15 22:00:00,-0.900000,90.000000,0.00000,5.200000
4,2023-02-15 23:00:00,-1.300000,91.000000,0.00000,5.800000
...,...,...,...,...,...
536,2023-03-10 03:00:00,3.633854,75.263889,0.04184,8.096181
537,2023-03-10 04:00:00,3.633854,75.263889,0.04184,8.096181
538,2023-03-10 05:00:00,3.633854,75.263889,0.04184,8.096181
539,2023-03-10 06:00:00,3.633854,75.263889,0.04184,8.096181


In [33]:
df1['pm10'] = df.loc[:, 'pm10']
df1

Unnamed: 0,date,temp,hum,percp,wspeed,pm10
0,2023-02-15 19:00:00,1.600000,88.000000,0.00000,6.800000,56.0
1,2023-02-15 20:00:00,0.200000,90.000000,0.00000,6.800000,56.0
2,2023-02-15 21:00:00,-0.300000,90.000000,0.00000,6.600000,49.0
3,2023-02-15 22:00:00,-0.900000,90.000000,0.00000,5.200000,45.0
4,2023-02-15 23:00:00,-1.300000,91.000000,0.00000,5.800000,43.0
...,...,...,...,...,...,...
536,2023-03-10 03:00:00,3.633854,75.263889,0.04184,8.096181,15.0
537,2023-03-10 04:00:00,3.633854,75.263889,0.04184,8.096181,17.0
538,2023-03-10 05:00:00,3.633854,75.263889,0.04184,8.096181,15.0
539,2023-03-10 06:00:00,3.633854,75.263889,0.04184,8.096181,22.0


In [34]:

print('Saving processed data...')
df1.to_csv(dist, index=False)

print('Finished!')


#print(df.isnull().sum())

Saving processed data...
Finished!
