In [159]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import requests
import json
import datetime as dt

# TO DO

- Fix functions, combine cleaning with processing, simplify.
- Make processing and harmonizing file as module

In [160]:
CURR_PATH = os.getcwd()
TARGET_PATH_SMHI = os.path.join(CURR_PATH, 'smhi_data')
TARGET_PATH_MET = os.path.join(CURR_PATH, 'met_data')

In [161]:
locations = [
    {'lat': 57.7, 'lon': 11.9}, # Göteborg
    {'lat': 55.6, 'lon': 13.0}, # Malmö
    {'lat': 59.9, 'lon': 10.7}, # Oslo
    {'lat': 59.3, 'lon': 18.0}, # Stockholm
    {'lat': 60.1, 'lon': 24.9} # Helsinki
]

In [162]:
def _create_smhi_url(lat, lon ):
    return f"https://opendata-download-metfcst.smhi.se/api/category/pmp3g/version/2/geotype/point/lon/{lon}/lat/{lat}/data.json"


def _create_met_url(lat, lon):
    return f'https://api.met.no/weatherapi/locationforecast/2.0/compact?lat={lat}&lon={lon}'

def get_smhi_data(lat, lon):
    smhi_url = _create_smhi_url(lat, lon)
    smhi_response = requests.get(smhi_url)
    
    if smhi_response.status_code == 200:
        smhi_data = smhi_response.json()
        print('SMHI data retrieved successfully')
        return smhi_data
    else:
        print(f"Failed to retrieve SMHI data: HTTP {smhi_response.status_code}")
        return None

def get_met_data(lat, lon):
    met_url = _create_met_url(lat, lon)
    headers = {'User-Agent': 'weather@brights.com'}  # Ensure you add a User-Agent
    met_response = requests.get(met_url, headers=headers)

    if met_response.status_code == 200:
        met_data = met_response.json()
        print('MET data retrieved successfully')
        return met_data
    else:
        print(f"Failed to retrieve MET data: HTTP {met_response.status_code}")
        return None

In [163]:
def fetch_api_data():
    for location in locations:
        lat = location['lat']
        lon = location['lon']
        
        smhi_data = get_smhi_data(lat, lon)
        if smhi_data:
            with open(TARGET_PATH_SMHI, 'a') as f:
                json.dump(smhi_data, f)

        met_data = get_met_data(lat, lon)
        if met_data:
            with open(TARGET_PATH_MET, 'a') as f:
                json.dump(met_data, f)

    return smhi_data, met_data

In [190]:
def process_smhi_weather_data(weather_data):
    
    time_series = weather_data['timeSeries']

    processed_data = []

    for time_entry in time_series:
        valid_time = time_entry['validTime']

        parameters = time_entry['parameters']

        processed_entry = {'valid_time': valid_time}

        for parameter in parameters:
            name = parameter['name']
            values = parameter['values'] # List of values
            values = tuple(parameter['values']) # Convert to tuple
            values = values[0]
            

            processed_entry[name] = values

        processed_data.append(processed_entry)

    smhi_df = pd.DataFrame(processed_data)
    

    return smhi_df

def process_met_weather_data(weather_data):

    met_df = pd.json_normalize(weather_data['properties']['timeseries'])

    met_df['valid_time'] = pd.to_datetime(met_df['time'])
    met_df = met_df.drop(columns=['time'])

    # met_df = met_df.set_index('valid_time')
    

   
    return met_df

In [165]:
# met_df['hour'] = met_df['valid_time'].dt.hour
# met_df['day_of_week'] = met_df['valid_time'].dt.dayofweek

# smhi_df['hour'] = smhi_df['valid_time'].dt.hour
# smhi_df['day_of_week'] = smhi_df['valid_time'].dt.dayofweek

In [166]:
smhi_data, met_data = fetch_api_data() # DONT RUN EVERYTIME

In [191]:
met_df = process_met_weather_data(met_data)
smhi_df = process_smhi_weather_data(smhi_data)

In [192]:
def clean_met_data(met_df):

    met_df = met_df.drop(columns=['data.next_12_hours.summary.symbol_code',
                                'data.next_1_hours.summary.symbol_code',
                                'data.next_1_hours.details.precipitation_amount',
                                'data.next_6_hours.summary.symbol_code',
                                'data.next_6_hours.details.precipitation_amount'])
    
    return met_df

def clean_smhi_data(smhi_df):
    weather_mapping = {
    1: 'Clear sky',
    2: 'Nearly clear sky',
    3: 'Variable cloudiness',
    4: 'Halfclear sky',
    5: 'Cloudy sky',
    6: 'Overcast',
    7: 'Fog',
    8: 'Light rain showers',
    9: 'Moderate rain showers',
    10: 'Heavy rain showers',
    11: 'Thunderstorm',
    12: 'Light sleet showers',
    13: 'Moderate sleet showers',
    14: 'Heavy sleet showers',
    15: 'Light snow showers',
    16: 'Moderate snow showers',
    17: 'Heavy snow showers',
    18: 'Light rain',
    19: 'Moderate rain',
    20: 'Heavy rain',
    21: 'Thunder',
    22: 'Light sleet',
    23: 'Moderate sleet',
    24: 'Heavy sleet',
    25: 'Light snowfall',
    26: 'Moderate snowfall',
    27: 'Heavy snowfall'
}

    smhi_df['weather_description'] = smhi_df['Wsymb2'].map(weather_mapping)

    smhi_df = smhi_df.drop(columns=['Wsymb2', 'Wsymb2'])

    smhi_df = smhi_df.rename(columns={
        't': 'temperature',
        'vis': 'visibility',
        'wd': 'wind_direction',
        'ws': 'wind_speed',
        'r': 'humidity',
        'tstm': 'thunderstorm_probability',
        'tcc_mean': 'total_cloud_cover',
        'lcc_mean': 'low_level_cloud_cover',
        'mcc_mean': 'medium_level_cloud_cover',
        'hcc_mean': 'high_level_cloud_cover',
        'gust': 'wind_gust_speed',
        'pmin': 'min_precipitation_intensity',
        'pmax': 'max_precipitation_intensity',
        'spp': 'precipitation_frozen_form',
        'pcat': 'precipitation_category',
        'pmean': 'mean_precipitation_intensity',
        'pmedian': 'median_precipitation_intensity'
    })

    smhi_df['valid_time'] = pd.to_datetime(smhi_df['valid_time'])
    # smhi_df = smhi_df.set_index('valid_time')

    return smhi_df


In [193]:
met_df = clean_met_data(met_df)
smhi_df = clean_smhi_data(smhi_df)

In [170]:
# Process data for each location
smhi_dfs = []
met_dfs = []

for location in locations:
    # lat = location['lat']
    # lon = location['lon']
    # smhi_df = process_smhi_weather_data(smhi_data)
    # met_df = process_met_weather_data(met_data)
    smhi_dfs.append(smhi_df)
    met_dfs.append(met_df)

# Concatenate all dataframes
smhi_combined_df = pd.concat(smhi_dfs, ignore_index=True)
met_combined_df = pd.concat(met_dfs, ignore_index=True)

# Merge the datasets
# merged_df = pd.merge(smhi_combined_df, met_combined_df, on=[''], suffixes=('_smhi', '_met'))

In [171]:
smhi_combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   precipitation_frozen_form       400 non-null    int64  
 1   precipitation_category          400 non-null    int64  
 2   min_precipitation_intensity     400 non-null    float64
 3   mean_precipitation_intensity    400 non-null    float64
 4   max_precipitation_intensity     400 non-null    float64
 5   median_precipitation_intensity  400 non-null    float64
 6   total_cloud_cover               400 non-null    int64  
 7   low_level_cloud_cover           400 non-null    int64  
 8   medium_level_cloud_cover        400 non-null    int64  
 9   high_level_cloud_cover          400 non-null    int64  
 10  temperature                     400 non-null    float64
 11  msl                             400 non-null    float64
 12  visibility                      400 

In [172]:
smhi_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 80 entries, 2024-08-20 07:00:00+00:00 to 2024-08-29 12:00:00+00:00
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   precipitation_frozen_form       80 non-null     int64  
 1   precipitation_category          80 non-null     int64  
 2   min_precipitation_intensity     80 non-null     float64
 3   mean_precipitation_intensity    80 non-null     float64
 4   max_precipitation_intensity     80 non-null     float64
 5   median_precipitation_intensity  80 non-null     float64
 6   total_cloud_cover               80 non-null     int64  
 7   low_level_cloud_cover           80 non-null     int64  
 8   medium_level_cloud_cover        80 non-null     int64  
 9   high_level_cloud_cover          80 non-null     int64  
 10  temperature                     80 non-null     float64
 11  msl                             80 non-null     f

In [173]:
met_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 86 entries, 2024-08-20 06:00:00+00:00 to 2024-08-30 06:00:00+00:00
Data columns (total 6 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   data.instant.details.air_pressure_at_sea_level  86 non-null     float64
 1   data.instant.details.air_temperature            86 non-null     float64
 2   data.instant.details.cloud_area_fraction        86 non-null     float64
 3   data.instant.details.relative_humidity          86 non-null     float64
 4   data.instant.details.wind_from_direction        86 non-null     float64
 5   data.instant.details.wind_speed                 86 non-null     float64
dtypes: float64(6)
memory usage: 4.7 KB


In [174]:
smhi_df.columns

Index(['precipitation_frozen_form', 'precipitation_category',
       'min_precipitation_intensity', 'mean_precipitation_intensity',
       'max_precipitation_intensity', 'median_precipitation_intensity',
       'total_cloud_cover', 'low_level_cloud_cover',
       'medium_level_cloud_cover', 'high_level_cloud_cover', 'temperature',
       'msl', 'visibility', 'wind_direction', 'wind_speed', 'humidity',
       'thunderstorm_probability', 'wind_gust_speed', 'weather_description'],
      dtype='object')

In [175]:
met_df.columns

Index(['data.instant.details.air_pressure_at_sea_level',
       'data.instant.details.air_temperature',
       'data.instant.details.cloud_area_fraction',
       'data.instant.details.relative_humidity',
       'data.instant.details.wind_from_direction',
       'data.instant.details.wind_speed'],
      dtype='object')

In [196]:
met_df.sort_values('valid_time')

Unnamed: 0,data.instant.details.air_pressure_at_sea_level,data.instant.details.air_temperature,data.instant.details.cloud_area_fraction,data.instant.details.relative_humidity,data.instant.details.wind_from_direction,data.instant.details.wind_speed,valid_time
0,1018.3,15.1,79.4,81.1,89.7,1.2,2024-08-20 06:00:00+00:00
1,1018.7,15.2,89.2,81.3,104.4,2.1,2024-08-20 07:00:00+00:00
2,1018.9,15.4,90.7,85.3,101.4,2.3,2024-08-20 08:00:00+00:00
3,1019.0,15.8,84.2,87.7,101.3,2.2,2024-08-20 09:00:00+00:00
4,1019.1,16.4,88.5,87.1,110.3,2.1,2024-08-20 10:00:00+00:00
...,...,...,...,...,...,...,...
81,1024.2,17.2,59.8,83.9,249.4,4.0,2024-08-29 06:00:00+00:00
82,1022.0,18.3,88.3,79.2,204.8,4.9,2024-08-29 12:00:00+00:00
83,1020.0,18.2,11.3,78.5,252.0,4.5,2024-08-29 18:00:00+00:00
84,1022.4,17.4,24.2,85.1,288.8,4.5,2024-08-30 00:00:00+00:00
