<a href="https://colab.research.google.com/github/KondamPravalikaReddy/AirAware-Smart-Air-Quality-Prediction/blob/main/Bokeh_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Upload kaggle.json (from your Kaggle account → API → Create New Token)
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last expression.
# files.upload() returns a dict of {filename: raw file bytes}; when it is the last
# expression the notebook echoes that dict into the saved cell output, which writes
# the Kaggle username and API key into the .ipynb for anyone who opens it.
uploaded = files.upload()   # Upload kaggle.json here

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"pravalikareddi","key":"<REDACTED>"}'}

In [3]:
# Move kaggle.json to correct path
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download Dataset
!kaggle datasets download -d fedesoriano/air-quality-data-set --unzip

#  Load Dataset
import pandas as pd
# file name after unzip
df = pd.read_csv("AirQuality.csv", low_memory=False, delimiter=';')
print("Original Data:\n")
print(df.head())

Dataset URL: https://www.kaggle.com/datasets/fedesoriano/air-quality-data-set
License(s): copyright-authors
Downloading air-quality-data-set.zip to /content
  0% 0.00/248k [00:00<?, ?B/s]
100% 248k/248k [00:00<00:00, 348MB/s]
Original Data:

         Date      Time CO(GT)  PT08.S1(CO)  NMHC(GT) C6H6(GT)  PT08.S2(NMHC)  \
0  10/03/2004  18.00.00    2,6       1360.0     150.0     11,9         1046.0   
1  10/03/2004  19.00.00      2       1292.0     112.0      9,4          955.0   
2  10/03/2004  20.00.00    2,2       1402.0      88.0      9,0          939.0   
3  10/03/2004  21.00.00    2,2       1376.0      80.0      9,2          948.0   
4  10/03/2004  22.00.00    1,6       1272.0      51.0      6,5          836.0   

   NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)     T    RH  \
0    166.0        1056.0    113.0        1692.0       1268.0  13,6  48,9   
1    103.0        1174.0     92.0        1559.0        972.0  13,3  47,7   
2    131.0        1140.0    114.0        15

In [4]:
# Common imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.layouts import gridplot

# Display plots inline
# output_notebook() configures Bokeh so that later show() calls render their
# figures in the notebook's cell output.
output_notebook()

# Load dataset
# Re-read the raw ';'-delimited CSV so this cell always starts from unmodified data
# and gives the same result however many times it is re-run.
df = pd.read_csv("AirQuality.csv", low_memory=False, delimiter=';')

# Basic cleaning
# Rename columns to lowercase and remove periods for easier access
df.columns = df.columns.str.lower().str.replace('.', '', regex=False).str.strip()

# Drop columns that contain no data at all. The previous version removed the last
# two columns by position because they appeared empty; selecting by content gives
# the same result for such a file, but cannot discard real measurement columns
# if the file layout ever differs.
df = df.dropna(axis=1, how='all')

# Date/Time keep their capitalised names; every other column stays lowercase.
df = df.rename(columns={'date': 'Date', 'time': 'Time'})

# Convert Date and Time to datetime (rows that cannot be parsed become NaT)
df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S', errors='coerce')

# Per the dataset's published description, missing readings are tagged with the
# value -200 rather than left blank. Without converting that sentinel to NaN the
# dropna() calls below never see those gaps, and -200 leaks into every histogram,
# mean and correlation computed later.
MISSING_SENTINEL = -200

# Measurement columns, using the lowercase / no-period names produced above.
pollutant_cols = ['co(gt)', 'pt08s1(co)', 'nmhc(gt)', 'c6h6(gt)', 'pt08s2(nmhc)', 'nox(gt)', 'pt08s3(nox)', 'no2(gt)', 'pt08s4(no2)', 'pt08s5(o3)', 't', 'rh', 'ah']
for col in pollutant_cols:
    if col in df.columns:
        # Values may arrive as strings with a European decimal comma ("2,6").
        df[col] = df[col].astype(str).str.replace(',', '.', regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].replace(MISSING_SENTINEL, np.nan)

# Drop rows where datetime is NaT
df = df.dropna(subset=['datetime'])

# This dataset has no 'pm2.5' / 'pm10' / 'o3' / 'co' / 'no2' columns. The closest
# equivalents, used as the "key pollutant" set that every retained row must have:
#   'co(gt)'     - CO
#   'c6h6(gt)'   - Benzene
#   'nox(gt)'    - NOx
#   'no2(gt)'    - NO2
#   'pt08s5(o3)' - O3 (sensor reading)
key_pollutant_cols = ['co(gt)', 'c6h6(gt)', 'nox(gt)', 'no2(gt)', 'pt08s5(o3)']
df = df.dropna(subset=key_pollutant_cols)

df['Year'] = df['datetime'].dt.year
df['Month'] = df['datetime'].dt.month_name()

# If there's a city or station column, remember its name; otherwise None.
# Column names were lowercased above, so the lookup must use lowercase names
# (the former 'Station' check could never match).
city_col = next((name for name in ('sitename', 'station') if name in df.columns), None)

# Display the first few rows and info of the cleaned dataframe
print("\nCleaned Data:\n")
print(df.head())
print("\nCleaned Data Info:\n")
df.info()


Cleaned Data:

         Date      Time  co(gt)  pt08s1(co)  nmhc(gt)  c6h6(gt)  pt08s2(nmhc)  \
0  10/03/2004  18.00.00     2.6      1360.0     150.0      11.9        1046.0   
1  10/03/2004  19.00.00     2.0      1292.0     112.0       9.4         955.0   
2  10/03/2004  20.00.00     2.2      1402.0      88.0       9.0         939.0   
3  10/03/2004  21.00.00     2.2      1376.0      80.0       9.2         948.0   
4  10/03/2004  22.00.00     1.6      1272.0      51.0       6.5         836.0   

   nox(gt)  pt08s3(nox)  no2(gt)  pt08s4(no2)  pt08s5(o3)     t    rh      ah  \
0    166.0       1056.0    113.0       1692.0      1268.0  13.6  48.9  0.7578   
1    103.0       1174.0     92.0       1559.0       972.0  13.3  47.7  0.7255   
2    131.0       1140.0    114.0       1555.0      1074.0  11.9  54.0  0.7502   
3    172.0       1092.0    122.0       1584.0      1203.0  11.0  60.0  0.7867   
4    131.0       1205.0    116.0       1490.0      1110.0  11.2  59.6  0.7888   

          

In [5]:
# ==================== IMPORTS ====================
# Imports used by the Bokeh plotting cells below. pandas, numpy, figure, show,
# output_notebook, ColumnDataSource, HoverTool and gridplot repeat imports already
# made in the data-cleaning cell; ColorBar, column, Spectral11, Viridis256 and
# linear_cmap are introduced here.
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, ColorBar
from bokeh.layouts import column, gridplot
from bokeh.palettes import Spectral11, Viridis256
from bokeh.transform import linear_cmap
# Same call as in the data-cleaning cell: direct Bokeh output to the notebook.
output_notebook()


In [6]:
# Histogram of CO(GT) concentration.
# The dataset has no 'aqi' column, so the distribution of 'co(gt)' is shown instead.
import numpy as np

co_values = df['co(gt)'].dropna()
counts, bin_edges = np.histogram(co_values, bins=30)

p1 = figure(title="Distribution of CO(GT) Concentration", width=800, height=350)
# One rectangle per bin: spans [left edge, right edge] horizontally, 0..count vertically.
p1.quad(
    left=bin_edges[:-1],
    right=bin_edges[1:],
    bottom=0,
    top=counts,
    fill_color="skyblue",
    line_color="white",
    alpha=0.8,
)
p1.xaxis.axis_label = 'CO(GT) (mg/m³)'
p1.yaxis.axis_label = 'Frequency'
show(p1)

In [8]:
# Scatter with color mapping: NOx(GT) vs CO(GT), colored by the O3 sensor reading (pt08s5(o3))
from bokeh.transform import linear_cmap # Import linear_cmap

# Build the complete-case subset once instead of recomputing dropna() three times.
scatter_cols = ['nox(gt)', 'co(gt)', 'pt08s5(o3)']
scatter_df = df.dropna(subset=scatter_cols)

if not scatter_df.empty:
    # Plot at most 3000 points; a fixed random_state makes the same points appear on every run.
    source9 = ColumnDataSource(scatter_df.sample(min(3000, len(scatter_df)), random_state=42))
    # Color scale spans the full range of the O3 sensor column in df.
    color_mapper = linear_cmap('pt08s5(o3)', 'Viridis256', df['pt08s5(o3)'].min(), df['pt08s5(o3)'].max())
    p2 = figure(title="NOx(GT) vs CO(GT) (Color by PT08.S5(O3))", width=600, height=400)
    p2.scatter('co(gt)', 'nox(gt)', source=source9, color=color_mapper, size=6, alpha=0.6)
    p2.xaxis.axis_label = "CO(GT) (mg/m³)"
    p2.yaxis.axis_label = "NOx(GT) (ppb)"
    # Column names containing characters such as '(' must be wrapped in braces in a
    # tooltip field (@{name}); backticks are not field syntax and would be shown literally.
    p2.add_tools(HoverTool(tooltips=[("PT08.S5(O3)", "@{pt08s5(o3)}"), ("Datetime", "@datetime{%F %H:%M}")], formatters={'@datetime': 'datetime'}))
else:
    # Placeholder figure so show() below always has something to render.
    p2 = figure(title="Not enough data for NOx(GT) vs CO(GT) scatter plot", width=600, height=400)
# NOTE(review): output_notebook() is re-issued in every plotting cell, presumably because
# Colab renders each cell's output separately - confirm before removing.
from bokeh.io import output_notebook, show
output_notebook()
show(p2)

In [10]:
# Heatmap: pairwise correlation matrix of pollutant and weather columns
corr_cols = ['co(gt)', 'c6h6(gt)', 'nox(gt)', 'no2(gt)', 'pt08s5(o3)', 't', 'rh', 'ah']
if not df[corr_cols].dropna().empty:
    corr = df[corr_cols].corr()
    # Long format: one row per (level_0, level_1) column pair with its coefficient in 'corr'.
    corr = corr.stack().reset_index(name='corr')
    # Pre-formatted 2-decimal labels for the text glyph. Feeding it the raw float column
    # yields full-precision numbers that do not fit inside a cell.
    corr['corr_label'] = corr['corr'].map('{:.2f}'.format)

    # A single data source serves both the colored cells and their labels.
    source_text = ColumnDataSource(corr)
    p3 = figure(title="Pollutant Correlation Heatmap", x_range=corr_cols, y_range=list(reversed(corr_cols)), width=600, height=400)
    p3.rect(x='level_0', y='level_1', width=1, height=1, source=source_text,
            color=linear_cmap('corr', 'Viridis256', -1, 1))
    # Add correlation values as text, centered in each cell
    p3.text(x='level_0', y='level_1', text='corr_label', source=source_text,
            text_font_size="8pt", text_align="center", text_baseline="middle", color="black")
else:
    # Placeholder figure so show() below always has something to render.
    p3 = figure(title="Not enough data for Correlation Heatmap", width=600, height=400)
# NOTE(review): output_notebook() is re-issued in every plotting cell, presumably because
# Colab renders each cell's output separately - confirm before removing.
from bokeh.io import output_notebook, show
output_notebook()
show(p3)

In [12]:
# Area chart: month-end averages of CO(GT) and NOx(GT)
if df.dropna(subset=['datetime', 'co(gt)', 'nox(gt)']).empty:
    # Nothing to aggregate: fall back to an empty, titled placeholder figure.
    p4 = figure(title="Not enough data for Monthly Avg CO(GT) & NOx(GT) plot", width=700, height=400)
else:
    # Resample on the timestamp column into month-end ('ME') bins and average each bin.
    monthly_means = df.resample('ME', on='datetime')[['co(gt)', 'nox(gt)']].mean()
    df_monthly = monthly_means.reset_index()
    source7 = ColumnDataSource(df_monthly)

    p4 = figure(title="Monthly Avg CO(GT) & NOx(GT)", x_axis_type='datetime', width=700, height=400)
    # One filled area per pollutant, each drawn from the zero baseline up to its monthly mean.
    for value_col, fill, label in (
        ('co(gt)', "blue", "CO(GT)"),
        ('nox(gt)', "green", "NOx(GT)"),
    ):
        p4.varea(x='datetime', y1=value_col, y2=0, source=source7, color=fill, alpha=0.5, legend_label=label)
    p4.legend.location = "top_left"
    p4.yaxis.axis_label = "Concentration"  # Units vary by pollutant

from bokeh.io import output_notebook, show
output_notebook()
show(p4)

In [13]:
# Multi-line: daily average pollutant trends over time
trend_cols = ['co(gt)', 'no2(gt)', 'pt08s5(o3)']
if not df.dropna(subset=['datetime'] + trend_cols).empty:
    # Aggregate to calendar days. Grouping on the raw 'datetime' column forms one group
    # per distinct timestamp, so the hourly readings were plotted unaveraged despite the
    # "Daily Avg" title. Resampling with 'D' averages all readings of each day; days with
    # no rows come out as NaN.
    df_daily = df.resample('D', on='datetime')[trend_cols].mean().reset_index()
    source10 = ColumnDataSource(df_daily)
    p5 = figure(title="Daily Avg Pollutants", x_axis_type='datetime', width=700, height=400)
    p5.line('datetime', 'co(gt)', source=source10, color="red", legend_label="CO(GT)", line_width=2)
    p5.line('datetime', 'no2(gt)', source=source10, color="blue", legend_label="NO2(GT)", line_width=2)
    p5.line('datetime', 'pt08s5(o3)', source=source10, color="green", legend_label="PT08.S5(O3)", line_width=2)
    p5.legend.location = "top_left"
    p5.yaxis.axis_label = "Concentration" # Units vary by pollutant
else:
    # Placeholder figure so show() below always has something to render.
    p5 = figure(title="Not enough data for Daily Avg Pollutants line plot", width=700, height=400)
# NOTE(review): output_notebook() is re-issued in every plotting cell, presumably because
# Colab renders each cell's output separately - confirm before removing.
from bokeh.io import output_notebook, show
output_notebook()
show(p5)

In [16]:
# Scatter: CO(GT) vs PT08.S5(O3) with hover
# Build the complete-case subset once instead of recomputing dropna() three times.
co_o3_df = df.dropna(subset=['co(gt)', 'pt08s5(o3)'])

if len(co_o3_df) >= 100: # Require at least 100 complete rows before plotting
    # Plot at most 2000 points; a fixed random_state makes the same points appear on every run.
    source1 = ColumnDataSource(co_o3_df.sample(min(2000, len(co_o3_df)), random_state=42))
    p6 = figure(title="CO(GT) vs PT08.S5(O3)", width=600, height=400, tools="hover,pan,wheel_zoom,box_zoom,reset")
    p6.scatter('co(gt)', 'pt08s5(o3)', source=source1, color="navy", alpha=0.5, size=6)
    p6.xaxis.axis_label = "CO(GT) (mg/m³)"
    p6.yaxis.axis_label = "PT08.S5(O3) (sensor reading)" # Use appropriate label for sensor data

    # Configure the hover tool created by tools="hover,..." on this figure.
    # (The previous code looked it up on `p8`, a name never defined in this notebook.)
    hover1 = p6.select_one(HoverTool)
    # Column names containing characters such as '(' must be wrapped in braces in a
    # tooltip field (@{name}); backticks are not field syntax and would be shown literally.
    hover1.tooltips = [("Datetime", "@datetime{%F %H:%M}"), ("CO(GT)", "@{co(gt)}"), ("PT08.S5(O3)", "@{pt08s5(o3)}")]
    hover1.formatters = {'@datetime': 'datetime'}
else:
    # Placeholder figure so show() below always has something to render.
    p6 = figure(title="Not enough data for CO(GT) vs PT08.S5(O3) scatter plot", width=600, height=400)
# NOTE(review): output_notebook() is re-issued in every plotting cell, presumably because
# Colab renders each cell's output separately - confirm before removing.
from bokeh.io import output_notebook, show
output_notebook()
show(p6)