In [73]:
# Goal: Predict NO₂ concentration from meteorological and pollutant data.
# Extra: Classify whether a day’s NO₂ level exceeds a WHO-recommended threshold.

In [98]:
# Import dependecies
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [100]:
# Load dataset and confirm it's there
df = pd.read_csv("../data/AirQualityUCI.csv", sep = ";")
display(df)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


## Data Cleaning

In [103]:
# Dropping all empty rows
df_drop = df.dropna(how = "all")
display(df_drop)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04/04/2005,10.00.00,31,1314.0,-200.0,135,1101.0,472.0,539.0,190.0,1374.0,1729.0,219,293,07568,,
9353,04/04/2005,11.00.00,24,1163.0,-200.0,114,1027.0,353.0,604.0,179.0,1264.0,1269.0,243,237,07119,,
9354,04/04/2005,12.00.00,24,1142.0,-200.0,124,1063.0,293.0,603.0,175.0,1241.0,1092.0,269,183,06406,,
9355,04/04/2005,13.00.00,21,1003.0,-200.0,95,961.0,235.0,702.0,156.0,1041.0,770.0,283,135,05139,,


In [105]:
# Drop columns "Unamed 15" and "Unamed 16" if all entries are NaN
print(f"Before Drop: {list(df_drop.columns.values)}", end="\n\n") # Before drop

unamed_cols = ["Unnamed: 15", "Unnamed: 16"]
for col in unamed_cols:
    if df_drop[col].isna().all():
        df_drop = df_drop.drop(columns=[col])
        
print(f"After Drop: {list(df_drop.columns.values)}") # After drop

Before Drop: ['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH', 'Unnamed: 15', 'Unnamed: 16']

After Drop: ['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']


In [107]:
# Values with -200 likekly a placeholder for invalid or missing data
df_drop.replace(-200, np.nan, inplace=True)
display(df_drop)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04/04/2005,10.00.00,31,1314.0,,135,1101.0,472.0,539.0,190.0,1374.0,1729.0,219,293,07568
9353,04/04/2005,11.00.00,24,1163.0,,114,1027.0,353.0,604.0,179.0,1264.0,1269.0,243,237,07119
9354,04/04/2005,12.00.00,24,1142.0,,124,1063.0,293.0,603.0,175.0,1241.0,1092.0,269,183,06406
9355,04/04/2005,13.00.00,21,1003.0,,95,961.0,235.0,702.0,156.0,1041.0,770.0,283,135,05139


In [109]:
# Checking how much data is missing from a given column
missing_percent = df_drop.isna().mean() * 100
display(pd.DataFrame(missing_percent.sort_values(ascending=False), columns=["Missing Values in %"]))

Unnamed: 0,Missing Values in %
NMHC(GT),90.231912
NO2(GT),17.54836
NOx(GT),17.516298
PT08.S1(CO),3.91151
PT08.S2(NMHC),3.91151
PT08.S3(NOx),3.91151
PT08.S4(NO2),3.91151
PT08.S5(O3),3.91151
Date,0.0
Time,0.0


In [111]:
# NMHC(GT) is missing ~90% of its data, so we drop the feature
df_drop.drop(columns=["NMHC(GT)"], inplace=True)

In [113]:
# For the remaining columns, I've decided to impute them with the median if
# the skewness is > 0.5 or < -0.5. Otherwise impute with the means
print(df_drop["NO2(GT)"].skew(), df_drop["NOx(GT)"].skew())

0.6217143134373714 1.7157807992815408


In [115]:
# Imputed remaining columns with the median
# NO2(GT) and NOx(GT) is missing ~17% of its data, replace NaN with median
df_drop["NO2(GT)"] = df_drop["NO2(GT)"].fillna(df_drop["NO2(GT)"].median())
df_drop["NOx(GT)"] = df_drop["NOx(GT)"].fillna(df_drop["NOx(GT)"].median())

# Columns missing ~4% of its data, replace NaN with median
sensors_cols = [
    "PT08.S1(CO)",
    "PT08.S2(NMHC)",
    "PT08.S3(NOx)",
    "PT08.S4(NO2)",
    "PT08.S5(O3)"
]

for col in sensors_cols:
    df_drop[col] = df_drop[col].fillna(df_drop[col].median())

In [117]:
# Verifying if the data was properly cleaned and imputed
"No NaN values were found" if not df_drop.isnull().values.any() else "NaN value was found"

'No NaN values were found'

In [119]:
# Created a copy of the cleaned data to avoid modifying the original by reference
D = df_drop.copy()
display(D)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10/03/2004,18.00.00,26,1360.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578
1,10/03/2004,19.00.00,2,1292.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255
2,10/03/2004,20.00.00,22,1402.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502
3,10/03/2004,21.00.00,22,1376.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867
4,10/03/2004,22.00.00,16,1272.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04/04/2005,10.00.00,31,1314.0,135,1101.0,472.0,539.0,190.0,1374.0,1729.0,219,293,07568
9353,04/04/2005,11.00.00,24,1163.0,114,1027.0,353.0,604.0,179.0,1264.0,1269.0,243,237,07119
9354,04/04/2005,12.00.00,24,1142.0,124,1063.0,293.0,603.0,175.0,1241.0,1092.0,269,183,06406
9355,04/04/2005,13.00.00,21,1003.0,95,961.0,235.0,702.0,156.0,1041.0,770.0,283,135,05139


## Exploratory Data Analysis (EDA)