In [13]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from tsq.data_card import save_data_card_html, save_data_card_json
import numpy as np
from IPython import display


In [14]:
# fetch dataset 
air_quality = fetch_ucirepo(id=360) 
  
# data (as pandas dataframes) 
X = air_quality.data.features 


In [15]:

def to_tsq_tidy_air_quality(df_wide: pd.DataFrame, entity: str = "Station_1") -> pd.DataFrame:
    """
    Transform UCI Air Quality wide table into tidy TSQ format:
    columns: ['timestamp', 'entity', 'variable', 'value'].

    Assumptions:
    - 'Date' is 'dd/mm/yyyy' (day-first).
    - 'Time' is 'HH:MM:SS'.
    - Sentinel missing values are -200 (replaced with NaN).
    - All columns except Date/Time are variables to be melted.
    """
    df = df_wide.copy()

    # 1) Build timestamp (day-first per UCI dataset), keep NaT if malformed
    df['timestamp'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'].astype(str),
                                     dayfirst=True, errors='coerce')

    # 2) Replace sentinel missing value -200 with NaN across numeric-looking columns
    # First coerce numeric columns where possible
    for col in df.columns:
        if col not in ['Date', 'Time', 'timestamp']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df.replace(-200, np.nan, inplace=True)

    # 3) Melt to long
    value_cols = [c for c in df.columns if c not in ['Date', 'Time', 'timestamp']]
    tidy = df.melt(id_vars=['timestamp'], value_vars=value_cols,
                   var_name='variable', value_name='value')

    # 4) Add entity and order columns
    tidy['entity'] = entity
    tidy = tidy[['timestamp', 'entity', 'variable', 'value']].sort_values(['entity', 'timestamp', 'variable'])

    # 5) Optional: drop rows where timestamp is NaT and all values are NaN
    tidy = tidy[~tidy['timestamp'].isna()]

    return tidy


In [16]:
tidy_df = to_tsq_tidy_air_quality(X, entity="Milan_Station")

In [17]:
# create and save data card as HTML and JSON
save_data_card_html(tidy_df, "Air Quality Milan Station", "air_quality_data_card.html")
save_data_card_json(tidy_df, "Air Quality Milan Station", "air_quality_data_card.json")


In [18]:
# show saved html file in notebook

display.HTML("air_quality_data_card.html")

Variable,Obs,Missing,Min,Max
AH,3597,179,0.1847,2.231
C6H6(GT),3597,179,0.2,48.2
CO(GT),3597,584,0.1,9.4
NMHC(GT),3597,3303,7.0,1084.0
NO2(GT),3597,687,5.0,340.0
NOx(GT),3597,685,2.0,1247.0
PT08.S1(CO),3597,179,689.0,1915.0
PT08.S2(NMHC),3597,179,387.0,1935.0
PT08.S3(NOx),3597,179,328.0,2683.0
PT08.S4(NO2),3597,179,551.0,2746.0

Variable,Obs,Missing,Min,Max
AH,3597,179,0.1847,2.231
C6H6(GT),3597,179,0.2,48.2
CO(GT),3597,584,0.1,9.4
NMHC(GT),3597,3303,7.0,1084.0
NO2(GT),3597,687,5.0,340.0
NOx(GT),3597,685,2.0,1247.0
PT08.S1(CO),3597,179,689.0,1915.0
PT08.S2(NMHC),3597,179,387.0,1935.0
PT08.S3(NOx),3597,179,328.0,2683.0
PT08.S4(NO2),3597,179,551.0,2746.0
