# LAB 1

In [27]:
import datetime
import requests
import pandas as pd
import hopsworks
import datetime
from pathlib import Path
from functions import util
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")

## 1. Backfill feature pipeline

Write a backfill feature pipeline that downloads historical weather data (ideally >1
year of data), loads a csv file with historical air quality data (downloaded from
https://aqicn.org) and registers them as 2 Feature Groups with Hopsworks.

### 1.1 Import 5 years of air quality data for Stockholm

In [28]:
# Load the historical air quality CSV export (downloaded from https://aqicn.org).
air_quality_df = pd.read_csv("../../data/stockholm-hornsgatan 108 gata-air-quality.csv")

# Parse the 'date' strings (formatted as YYYY/MM/DD) into datetimes.
air_quality_df['date'] = pd.to_datetime(air_quality_df['date'], format='%Y/%m/%d')

# Clean the particulate columns: coerce non-numeric entries to NaN, replace NaN
# with 0, then cast to int.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: with pandas Copy-on-Write
# that in-place call does not update the parent DataFrame, so the NaNs would
# survive and the integer cast below would fail.
# NOTE(review): filling with 0 makes a missing reading indistinguishable from a
# real measurement of 0, and 'pm25' is later used as the training target.
# Consider dropping those rows instead -- confirm with the data owner.
for pollutant in ('pm25', 'pm10'):
    air_quality_df[pollutant] = (
        pd.to_numeric(air_quality_df[pollutant], errors='coerce')
        .fillna(0)
        .astype(int)
    )

# 'no2' is not used by this lab, so it is removed before registering the data.
air_quality_df = air_quality_df.drop(columns=['no2'])


air_quality_df.head()

Unnamed: 0,date,pm25,pm10
0,2024-11-01,8,3
1,2024-11-02,5,3
2,2024-11-03,6,6
3,2024-11-04,7,4
4,2024-11-05,11,6


### 1.2 Import historical weather data for Stockholm (from the earliest air quality date to today)

In [29]:
import requests
import pandas as pd
import json
from datetime import datetime, timedelta

# Stockholm coordinates passed to the historical weather lookup.
latitude, longitude = 59.3293, 18.0686

# Reference dates relative to the day the notebook is run.
# Only `today` is used in this cell; `start_date` and `end_date` are kept as
# notebook-level names.
today = datetime.today().date()
start_date = today - timedelta(days=365)
end_date = today - timedelta(days=2)

# Location identifiers and feed URL. Only `city` is used in this cell.
country = "sweden"
city = "stockholm"
street = "stockholm-hornsgatan-108"
aqicn_url = "https://api.waqi.info/feed/@10009"

# First date present in the air quality data, formatted as YYYY-MM-DD.
earliest_aq_date = air_quality_df['date'].min().strftime('%Y-%m-%d')

# Fetch the weather history from that first date up to today, then drop the
# 'city' column from the result.
df_weather = util.get_historical_weather(
    city, earliest_aq_date, str(today), latitude, longitude
).drop(columns=['city'])

df_weather



Coordinates 59.29701232910156°N 18.163265228271484°E
Elevation 24.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant
0,2017-10-03,11.925418,0.500000,18.391737,209.271942
1,2017-10-04,10.548332,2.900000,22.206486,248.656326
2,2017-10-05,8.394166,2.600000,16.595179,306.521240
3,2017-10-06,8.208749,0.500000,23.871555,320.408325
4,2017-10-07,6.092083,6.300001,14.332341,343.018829
...,...,...,...,...,...
2593,2024-11-08,5.229583,0.000000,13.864674,257.650360
2594,2024-11-09,5.579584,0.000000,14.428097,257.365479
2595,2024-11-10,5.473333,0.100000,9.292255,145.380463
2596,2024-11-11,4.869166,1.100000,9.676569,234.773972


### 1.3 Connect to Hopsworks

In [30]:
import hopsworks

# Read the Hopsworks API key from a local text file and export it through the
# HOPSWORKS_API_KEY environment variable. Trailing whitespace (such as the
# final newline) is stripped.
api_key_path = Path('../../data/hopsworks-api-key.txt')
os.environ["HOPSWORKS_API_KEY"] = api_key_path.read_text().rstrip()

In [31]:


# Log in to the Hopsworks project "ID2223LAB1KTH" and keep the returned project handle.
project = hopsworks.login(project="ID2223LAB1KTH")
# Feature store handle; the cells below use it to create and read feature groups.
fs = project.get_feature_store()


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1170583
Connected. Call `.close()` to terminate connection gracefully.


In [None]:
# Fetch the weather feature group, creating it on first use.
# `get_or_create_feature_group` is used instead of `create_feature_group` so
# that re-running this cell against an already registered "stockholm_weather"
# v1 reuses the existing group instead of trying to create it a second time.
weather_fg = fs.get_or_create_feature_group(
    name="stockholm_weather",
    version=1,
    # The description lists the columns actually present in `df_weather`.
    description="Weather data for Stockholm including mean temperature, precipitation, maximum wind speed, and dominant wind direction",
    primary_key=["date"],  # one row per date
    event_time="date",     # the same column is the event timestamp
)

# Upload the weather DataFrame into the feature group.
weather_fg.insert(df_weather)

print("Weather feature group created and data inserted successfully.")


Uploading Dataframe: 0.00% |          | Rows 0/2598 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: stockholm_weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1170583/jobs/named/stockholm_weather_1_offline_fg_materialization/executions
Weather feature group created and data inserted successfully.


In [24]:

# Fetch the air quality feature group, creating it on first use.
# `get_or_create_feature_group` is used instead of `create_feature_group` so
# that re-running this cell against an already registered
# "stockholm_air_quality" v1 reuses the existing group instead of trying to
# create it a second time.
air_quality_fg = fs.get_or_create_feature_group(
    name="stockholm_air_quality",
    version=1,
    # The description covers both particulate columns present in `air_quality_df`.
    description="Air quality data for Stockholm with PM2.5 and PM10 concentrations",
    primary_key=["date"],  # 'date' column as the primary key
    event_time="date",     # 'date' column as the event time
)

# Upload the cleaned air quality DataFrame into the feature group.
air_quality_fg.insert(air_quality_df)

print("Air quality feature group created and data inserted successfully.")


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1170583/fs/1161286/fg/1347950


Uploading Dataframe: 0.00% |          | Rows 0/2557 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: stockholm_air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1170583/jobs/named/stockholm_air_quality_1_offline_fg_materialization/executions
Air quality feature group created and data inserted successfully.


## 2. Schedule a daily feature pipeline

The daily feature pipeline is scheduled by a workflow defined in `.github/workflows`.

## 3. Write a training pipeline

In [37]:
# Load the Feature Groups for air quality and weather data
air_quality_fg = fs.get_feature_group(name="stockholm_air_quality", version=1)
weather_fg = fs.get_feature_group(name="stockholm_weather", version=1)

# Read the data from both feature groups
air_quality_df = air_quality_fg.read()
weather_df = weather_fg.read()

# Merge the two feature groups on the 'time' column
merged_df = air_quality_df.merge(weather_df, on="date", how="inner")

# Select features and target for training
features = merged_df[['temperature_2m_mean','precipitation_sum','wind_speed_10m_max','wind_direction_10m_dominant']]
target = merged_df['pm25']

merged_df



Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.87s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.85s) 


Unnamed: 0,date,pm25,pm10,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant
0,2023-01-23 00:00:00+00:00,42,7,-4.230833,0.0,18.079027,241.394394
1,2023-05-27 00:00:00+00:00,16,23,12.208749,0.0,19.245697,193.671310
2,2019-09-24 00:00:00+00:00,10,9,9.562917,0.0,11.113451,191.237717
3,2018-06-14 00:00:00+00:00,20,23,16.552500,0.0,24.336637,180.159409
4,2019-11-17 00:00:00+00:00,40,19,6.633749,0.2,22.267679,199.835754
...,...,...,...,...,...,...,...
2550,2022-01-29 00:00:00+00:00,8,2,2.225417,10.7,23.688984,215.975143
2551,2019-06-01 00:00:00+00:00,14,10,12.327500,3.6,18.861387,225.602020
2552,2019-10-09 00:00:00+00:00,21,11,6.500416,1.7,11.609651,98.886917
2553,2019-02-02 00:00:00+00:00,39,9,0.490000,5.7,18.723461,80.114799
