In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw export: fields are ';'-separated and numbers use ',' as the decimal mark.
filename = "../data/raw/AirQualityUCI.csv"
df = pd.read_csv(
    filename,
    delimiter=';',
    decimal=',',
)

# Build a single timestamp from the Date and Time text columns, then drop those
# two columns together with the two 'Unnamed' columns.
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S')
df = df.drop(columns=['Date', 'Time', 'Unnamed: 15', 'Unnamed: 16'])

# Missing values, step 1: drop rows that are entirely empty. DateTime is still a
# regular column at this point, so a row is removed only when its timestamp is
# missing too. Indexing by timestamp afterwards leaves only measurement columns.
df = df.dropna(how='all').set_index('DateTime')

# Missing values, step 2: -200 is treated as a missing-value marker. Replacing it
# with np.nan keeps numeric columns as floats (pd.NA would turn them into object
# dtype); to_numeric then coerces any leftover non-numeric entries to NaN.
df = df.replace(-200, np.nan).apply(pd.to_numeric, errors='coerce')

# Missing values, step 3: fill gaps by linear interpolation. method='linear'
# works on row position and ignores the index values.
df = df.interpolate(method='linear')

# Round every measurement to one decimal place, except 'AH', which keeps its
# original precision.
round_cols = [name for name in df.columns if name != 'AH']
df[round_cols] = df[round_cols].round(1)

# TODO: save this cleaned data as csv (nothing is written yet); for now the
# cleaned frame is only displayed.
df

In [None]:
# Notebook-wide seaborn look: dark grid background with the 'crest' palette.
sns.set_theme(style="darkgrid", palette='crest')

# Histogram (35 bins) with a KDE curve for the CO(GT) column of the cleaned frame.
co_series = df['CO(GT)']
fig, ax = plt.subplots(figsize=(8,4))
sns.histplot(co_series, bins=35, kde=True, ax=ax)

# Title and x-axis label so the figure can be read on its own.
ax.set_title(
    'Distribution of CO Concentration (Ground Truth)',
    fontsize=15,
    fontweight='bold',
)
ax.set_xlabel('CO Concentration (GT)')

# Tighten the layout of the figure created above.
fig.tight_layout()