In [18]:
import pandas as pd
from functions import util
import os
import warnings

warnings.filterwarnings("ignore")

## 1. Backfill feature pipeline

Write a backfill feature pipeline that downloads historical weather data (ideally >1
year of data), loads a csv file with historical air quality data (downloaded from
https://aqicn.org) and registers them as 2 Feature Groups with Hopsworks.

### 1.1 Import 5 years of air quality data for Stockholm

In [19]:
# Load the historical air-quality CSV and normalise its header by stripping
# surrounding whitespace from every column name.
air_quality_df = pd.read_csv("../../data/solna-råsundavägen_107-air-quality.csv")
air_quality_df.columns = [x.strip() for x in air_quality_df.columns]

# Dates in the file are formatted as YYYY/MM/DD.
air_quality_df['date'] = pd.to_datetime(air_quality_df['date'], format='%Y/%m/%d')

# Clean both particulate columns the same way:
#   1. coerce non-numeric entries to NaN,
#   2. replace NaN with 0,
#   3. cast to int.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary under pandas Copy-on-Write, leaving the NaNs in place and making
# the integer cast fail.
# NOTE(review): after this step a 0 is indistinguishable from a real reading of
# 0; dropping the rows or imputing may suit model training better.
for pollutant in ('pm25', 'pm10'):
    air_quality_df[pollutant] = (
        pd.to_numeric(air_quality_df[pollutant], errors='coerce')
        .fillna(0)
        .astype(int)
    )

# 'no2' is not used downstream in this notebook, so it is removed here.
air_quality_df = air_quality_df.drop(columns=['no2'])

air_quality_df.head()

Unnamed: 0,date,pm25,pm10
0,2024-04-01,29,6
1,2024-04-02,16,4
2,2024-04-03,9,8
3,2024-04-04,11,4
4,2024-04-05,13,6


### 1.2 Import historical weather data for Stockholm (from the earliest air quality date to today)

In [20]:
import pandas as pd
from datetime import datetime, timedelta

# Location identifiers for the measurement site.
country = "sweden"
city = "solna"
street = "solna-rasundavagen-107"
aqicn_url = "https://api.waqi.info/feed/@13988"

# Coordinates handed to the weather helper.
latitude, longitude = 59.36056, 17.99824

# Reference dates relative to the day the notebook runs.
# start_date / end_date are defined here but not used within this cell.
today = datetime.today().date()
start_date = today - timedelta(days=365)
end_date = today - timedelta(days=2)

# First day present in the air-quality data, as a YYYY-MM-DD string.
earliest_aq_date = air_quality_df['date'].min().strftime('%Y-%m-%d')

# Fetch weather from the earliest air-quality date through today, then drop
# the 'city' column from the returned frame.
df_weather = util.get_historical_weather(
    city, earliest_aq_date, today.isoformat(), latitude, longitude
).drop(columns=['city'])

df_weather


Coordinates 59.3673095703125°N 18.0°E
Elevation 16.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m_max,temperature_2m_min,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant
0,2022-09-01,15.271500,7.4215,0.2,15.844090,345.856476
1,2022-09-02,13.871500,6.2215,0.8,14.440443,353.648407
2,2022-09-03,12.121500,5.6715,1.2,6.109403,325.175446
3,2022-09-04,15.621500,5.5215,0.2,9.720000,44.154472
4,2022-09-05,15.721499,3.6715,0.0,12.069400,334.778839
...,...,...,...,...,...,...
801,2024-11-10,6.771500,4.4215,0.0,9.102021,139.972229
802,2024-11-11,5.721500,3.8715,2.7,10.028439,236.030014
803,2024-11-12,5.021500,0.8215,0.0,10.196647,286.209778
804,2024-11-13,5.821500,0.3215,0.0,15.156384,239.627930


### 1.3 Connect to Hopsworks

In [21]:
import hopsworks

# Read the API key from a local file (kept out of the notebook itself) and
# export it as HOPSWORKS_API_KEY, with trailing whitespace/newline removed.
with open('../../data/hopsworks-api-key.txt', 'r') as key_file:
    os.environ["HOPSWORKS_API_KEY"] = key_file.read().rstrip()

In [22]:
# Log in to the named Hopsworks project and fetch its feature store handle.
# NOTE(review): presumably authenticates with the HOPSWORKS_API_KEY environment
# variable exported in the previous cell — confirm against the Hopsworks docs.
project = hopsworks.login(project="ID2223LAB1KTH")
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1170583
Connected. Call `.close()` to terminate connection gracefully.


### 1.4 Create feature groups and insert data

In [23]:
# Get version 1 of the weather feature group, creating it if it does not exist.
# 'date' is both the primary key and the event-time column.
# The description lists the measurements actually present in df_weather
# (temperature, precipitation, wind speed, wind direction); the frame has no
# humidity column.
# NOTE(review): the description presumably only takes effect when the group is
# first created — confirm if the group already exists.
weather_fg = fs.get_or_create_feature_group(
    name="stockholm_weather",
    version=1,
    description="Weather data for Stockholm including temperature, precipitation, wind speed, and wind direction",
    primary_key=["date"],
    event_time="date",
)

# Upload the historical weather rows to the feature group.
weather_fg.insert(df_weather)

print("Weather feature group created and data inserted successfully.")


Uploading Dataframe: 0.00% |          | Rows 0/806 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: stockholm_weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1170583/jobs/named/stockholm_weather_1_offline_fg_materialization/executions
Weather feature group created and data inserted successfully.


In [24]:

# Get version 1 of the air-quality feature group, creating it if needed.
air_quality_fg = fs.get_or_create_feature_group(
    name="stockholm_air_quality",
    description="Air quality data for Stockholm with PM2.5 concentrations",
    version=1,
    event_time="date",     # the 'date' column is the event time
    primary_key=["date"],  # the 'date' column is the primary key
)

# Upload the cleaned air-quality rows to the feature group.
air_quality_fg.insert(air_quality_df)

print("Air quality feature group created and data inserted successfully.")


Uploading Dataframe: 0.00% |          | Rows 0/572 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: stockholm_air_quality_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1170583/jobs/named/stockholm_air_quality_1_offline_fg_materialization/executions
Air quality feature group created and data inserted successfully.


## 2. Schedule a daily feature pipeline

In .github/workflows