In [2]:
# Imports.
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Loading data from EDA step.
# The raw file is semicolon-separated and uses a comma as the decimal mark.
df = pd.read_csv('../data/raw/AirQualityUCI.csv', sep=';', decimal=',', header=0)

# Dropping unnamed empty columns.
df.dropna(axis=1, how='all', inplace=True)

# Dropping rows that carry no data at all. If they were kept, the fill-based
# imputation applied later would turn each blank row into a fabricated reading.
df.dropna(axis=0, how='all', inplace=True)

# Replacing -200 with NaN (as per dataset documentation)
df.replace(-200, np.nan, inplace=True)

# Converting Date and Time to a single datetime index.
if 'Date' in df.columns and 'Time' in df.columns:
    # An explicit format avoids pandas guessing month-first for ambiguous
    # dates and silently coercing the rest to NaT.
    # NOTE(review): assumes the raw file stores dates as DD/MM/YYYY and times
    # as HH.MM.SS (e.g. "10/03/2004" / "18.00.00") -- confirm against the CSV.
    df['Datetime'] = pd.to_datetime(
        df['Date'].astype(str) + ' ' + df['Time'].astype(str),
        format='%d/%m/%Y %H.%M.%S',
        errors='coerce',
    )
    df.drop(columns=['Date', 'Time'], inplace=True)
    df.set_index('Datetime', inplace=True)

# Converting pollutant columns to numeric explicitly.
# Anything that cannot be parsed as a number becomes NaN and is imputed below.
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Handling missing values
# Forward fill first; backward fill afterwards covers gaps at the very start
# of a column, where there is no earlier value to carry forward.
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
df_filled = df.ffill().bfill()

# Optional outlier filter, left disabled: it would keep only the rows whose
# absolute z-score is below 4 in every column.
# from scipy.stats import zscore
# df_filled = df_filled[(np.abs(zscore(df_filled)) < 4).all(axis=1)]

# Report how many gaps remain in each column after imputation.
missing_per_column = df_filled.isnull().sum()
print("Remaining missing values per column after imputation:")
print(missing_per_column)

# Persist the cleaned frame; the index is written as the first CSV column.
cleaned_csv_path = '../data/processed/air_quality_cleaned.csv'
df_filled.to_csv(cleaned_csv_path, index=True)

print(f"\n✅ Preprocessing complete. Cleaned data saved to: '{cleaned_csv_path}'")


Remaining missing values per column after imputation:
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

✅ Preprocessing complete. Cleaned data saved to: '../data/processed/air_quality_cleaned.csv'
