In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# The source file uses ';' as the field separator and ',' as the decimal mark.
raw = pd.read_csv('AirQuality.csv', sep=';', decimal=',')

# Keep only the columns whose header does not contain 'Unnamed'.
is_unnamed = raw.columns.str.contains('Unnamed')
df = raw.loc[:, ~is_unnamed]
df.head()


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


In [5]:
# Step 1: coerce every column pandas left as text (except the Date/Time strings)
# to numbers. The decimal comma is swapped for a dot before parsing, and anything
# that still cannot be parsed becomes NaN. `assign` returns a new frame, so the
# frame produced by the column slice in the load cell is never written into.
text_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in ['Date', 'Time']
]
df = df.assign(**{
    col: pd.to_numeric(df[col].astype(str).str.replace(',', '.'), errors='coerce')
    for col in text_cols
})

# Step 2: this notebook treats -200 as a missing-value marker. The replacement
# has to run AFTER the coercion above: a marker that arrived as text
# (e.g. '-200,0') only becomes the number -200 once it has been parsed, and
# would otherwise survive as a real measurement.
df = df.replace(-200, np.nan)


In [6]:
# Build a single timestamp column from the 'Date' and 'Time' strings, then order
# the frame chronologically.
#
# Rows with no Date or Time are dropped first. They cannot be timestamped: the
# string concatenation yields NaN for them and to_datetime turns that into NaT.
# If they are kept, their measurements get median-filled further down and the
# NaT entries force the derived hour/day_of_week/month columns to float
# (18.0, 2.0, 3.0 instead of 18, 2, 3) before the rows are finally discarded.
df = (
    df.dropna(subset=['Date', 'Time'])
      .assign(
          Datetime=lambda d: pd.to_datetime(
              d['Date'] + ' ' + d['Time'], format='%d/%m/%Y %H.%M.%S'
          )
      )
      .drop(columns=['Date', 'Time'])
      .sort_values('Datetime')
      .reset_index(drop=True)
)


In [7]:
# Numeric columns as they exist before any of them is dropped below.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

# Only columns are ever dropped in this loop, so the row count can be read once.
n_rows = len(df)
for name in numeric_cols:
    values = df[name]
    missing_share = values.isna().sum() / n_rows
    if missing_share > 0.5:
        # More than half of the rows are missing: discard the whole column.
        df = df.drop(columns=[name])
        continue
    # Otherwise fill the gaps with the column's own median.
    df[name] = values.fillna(values.median())


In [8]:
# Derive calendar features from the timestamp column.
# dayofweek follows the pandas convention: Monday=0 ... Sunday=6.
stamp = df['Datetime'].dt
df = df.assign(
    hour=stamp.hour,
    day_of_week=stamp.dayofweek,
    month=stamp.month,
)


In [13]:
# Measurement columns wanted in the cleaned data set.
target_cols = [
    'NO2(GT)', 'NOx(GT)', 'C6H6(GT)',
    'PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'T', 'RH', 'AH',
]
# Some of them may have been dropped earlier, so keep only those still present
# (in the order listed above).
available_cols = [name for name in target_cols if name in df.columns]

# Timestamp and calendar features first, then the measurements; rows with any
# remaining missing value are removed.
calendar_cols = ['Datetime', 'hour', 'day_of_week', 'month']
df_clean = df.loc[:, calendar_cols + available_cols].dropna()
df_clean.head()

Unnamed: 0,Datetime,hour,day_of_week,month,NO2(GT),NOx(GT),C6H6(GT),PT08.S1(CO),PT08.S2(NMHC),PT08.S3(NOx),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,18.0,2.0,3.0,113.0,166.0,11.9,1360.0,1046.0,1056.0,1692.0,1268.0,13.6,48.9,0.7578
1,2004-03-10 19:00:00,19.0,2.0,3.0,92.0,103.0,9.4,1292.0,955.0,1174.0,1559.0,972.0,13.3,47.7,0.7255
2,2004-03-10 20:00:00,20.0,2.0,3.0,114.0,131.0,9.0,1402.0,939.0,1140.0,1555.0,1074.0,11.9,54.0,0.7502
3,2004-03-10 21:00:00,21.0,2.0,3.0,122.0,172.0,9.2,1376.0,948.0,1092.0,1584.0,1203.0,11.0,60.0,0.7867
4,2004-03-10 22:00:00,22.0,2.0,3.0,116.0,131.0,6.5,1272.0,836.0,1205.0,1490.0,1110.0,11.2,59.6,0.7888


In [12]:
# Persist the cleaned frame; the row index is not written to the file.
output_path = 'justin_clean_airquality.csv'
df_clean.to_csv(output_path, index=False)
