In [None]:
# Colab-only setup cell: obtain Kaggle API credentials, download the Taiwan
# air-quality dataset and take a first look at the raw CSV.
#
# Step 1 - upload kaggle.json (Kaggle account → API → Create New Token).
# NOTE: kaggle.json holds a personal API token; never commit it or paste its contents.
from google.colab import files
files.upload()   # opens Colab's file picker; the chosen file is saved to the working directory

# Step 2 - copy (not move) kaggle.json into ~/.kaggle/, where the Kaggle CLI
# looks for credentials. The uploaded copy stays in the working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

# Step 3 - download the dataset archive into the current directory (-p ./)
# and extract it in place (--unzip).
!kaggle datasets download -d taweilo/taiwan-air-quality-data-20162024 -p ./ --unzip

# Step 4 - load the extracted CSV and preview the first rows.
import pandas as pd
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
df = pd.read_csv("air_quality.csv", low_memory=False) # "air_quality.csv" is the file name after unzip
print("Original Data:\n")
print(df.head())

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/taweilo/taiwan-air-quality-data-20162024
License(s): CC0-1.0
Downloading taiwan-air-quality-data-20162024.zip to .
 77% 132M/172M [00:00<00:00, 1.37GB/s]
100% 172M/172M [00:00<00:00, 1.06GB/s]
Original Data:

               date   sitename          county   aqi pollutant    status  so2  \
0  2024-08-31 23:00      Hukou  Hsinchu County  62.0     PM2.5  Moderate  0.9   
1  2024-08-31 23:00  Zhongming   Taichung City  50.0       NaN      Good  1.6   
2  2024-08-31 23:00    Zhudong  Hsinchu County  45.0       NaN      Good  0.4   
3  2024-08-31 23:00    Hsinchu    Hsinchu City  42.0       NaN      Good  0.8   
4  2024-08-31 23:00     Toufen   Miaoli County  50.0       NaN      Good  1.0   

     co    o3 o3_8hr  ... windspeed winddirec unit co_8hr pm2.5_avg pm10_avg  \
0  0.17  35.0   40.2  ...       2.3       225  NaN    0.2      20.1     26.0   
1  0.32  27.9   35.1  ...       1.1       184  NaN    0.2      15

In [None]:
# Common imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.layouts import gridplot

# Send Bokeh figures to inline notebook output.
output_notebook()

# Read the raw CSV, rename 'date' -> 'Date' and parse it as datetimes.
# format='mixed' lets pandas infer the timestamp format value by value.
df = (
    pd.read_csv("air_quality.csv", low_memory=False)
    .rename(columns={'date': 'Date'})
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='mixed'))
)

# Pollutant columns that should be numeric. Any entry that cannot be parsed
# becomes NaN (errors='coerce'); columns absent from the file are skipped.
pollutant_cols = ['pm2.5', 'pm10', 'o3', 'co', 'no2', 'so2', 'no', 'nox']
numeric_versions = {
    name: pd.to_numeric(df[name], errors='coerce')
    for name in pollutant_cols
    if name in df.columns
}

# Swap in the numeric columns, drop rows missing any of the five core
# pollutants, then derive calendar helper columns from 'Date'.
df = (
    df.assign(**numeric_versions)
    .dropna(subset=['pm2.5', 'pm10', 'o3', 'co', 'no2'])
    .assign(
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month_name(),
    )
)

# Name of the column identifying the monitoring site: 'sitename' is preferred,
# 'Station' is the fallback, and None means neither column exists.
city_col = next(
    (candidate for candidate in ('sitename', 'Station') if candidate in df.columns),
    None,
)

print("\n" * 2)






In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Plotly overview of the cleaned air-quality data (six figures).
#
# This cell reuses `df` and `city_col` produced by the load-and-clean cell
# above instead of re-reading the CSV and repeating the same cleaning steps,
# so run that cell first.

# Tunable parameters for the figures below.
N_SAMPLE_SITES = 3          # number of sites drawn in the line plot
TOP_N_SITES = 10            # number of sites shown in the bar chart
SCATTER_SAMPLE_SIZE = 3000  # maximum number of rows drawn in the scatter plot
RANDOM_SEED = 42            # fixed seed so the scatter sample is reproducible

if city_col is None:
    raise ValueError("No site column ('sitename' or 'Station') found in the dataset.")

# 'aqi' is not part of pollutant_cols, so it has not been coerced yet.
# pd.qcut below needs a numeric column; unparseable entries become NaN.
df['aqi'] = pd.to_numeric(df['aqi'], errors='coerce')

# 1. Line plot: AQI Over Time for sample sites
sample_sites = df[city_col].unique()[:N_SAMPLE_SITES]
# Sort by time so each line is drawn chronologically regardless of the row
# order in the CSV.
df_sample = df[df[city_col].isin(sample_sites)].sort_values('Date')
fig1 = px.line(df_sample, x='Date', y='aqi', color=city_col,
               title='AQI Over Time for Sample Sites')
print("1. Line plot: AQI Over Time for Sample Sites")
fig1.show()

# 2. Bar chart: Average pollutant levels by site (top 10 by mean PM2.5)
station_avg = (
    df.groupby(city_col)[['pm2.5', 'pm10', 'o3', 'no2', 'so2', 'co']]
    .mean()
    .reset_index()
)
top_sites_avg = station_avg.sort_values('pm2.5', ascending=False).head(TOP_N_SITES)
fig2 = px.bar(top_sites_avg, x=city_col, y=['pm2.5', 'pm10', 'o3'], barmode='group',
              title='Top 10 Sites Average Pollutants')
print("\n2. Bar chart: Average Pollutant Levels by Site (Top 10)")
fig2.show()

# 3. Scatter plot: PM2.5 vs O3 colored by AQI quartile
df['AQI_quartile'] = pd.qcut(df['aqi'], q=4, labels=['Low', 'Moderate', 'High', 'Very High'])
# Plot a seeded random subset to keep the figure light; cap the sample size
# at the number of available rows so small frames do not raise.
scatter_sample = df.sample(n=min(SCATTER_SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
fig3 = px.scatter(scatter_sample, x='pm2.5', y='o3', color='AQI_quartile',
                  title='PM2.5 vs O3 by AQI Quartile',
                  labels={'pm2.5': 'PM2.5 (µg/m³)', 'o3': 'O3 (ppb)'})
print("\n3. Scatter plot: PM2.5 vs O3 colored by AQI Quartile")
fig3.show()

# 4. Heatmap: Correlation matrix of pollutants
corr = df[['pm2.5', 'pm10', 'o3', 'no2', 'so2', 'co']].corr()
fig4 = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu_r',
                 title='Pollutant Correlation Heatmap')
print("\n4. Heatmap: Correlation Matrix of Pollutants")
fig4.show()

# 5. Area plot: Monthly average PM2.5 and PM10
# 'MS' (month start) is used instead of 'M', which pandas 2.2 deprecated in
# favour of 'ME'; 'MS' is accepted by older and newer versions alike. Each
# monthly mean is therefore stamped with the first day of its month.
df_monthly = df.resample('MS', on='Date')[['pm2.5', 'pm10']].mean().reset_index()
fig5 = go.Figure()
fig5.add_trace(go.Scatter(x=df_monthly['Date'], y=df_monthly['pm2.5'], fill='tozeroy', name='PM2.5'))
fig5.add_trace(go.Scatter(x=df_monthly['Date'], y=df_monthly['pm10'], fill='tozeroy', name='PM10'))
fig5.update_layout(title='Monthly Avg PM2.5 & PM10', xaxis_title='Month',
                   yaxis_title='Concentration (µg/m³)')
print("\n5. Area plot: Monthly Average PM2.5 and PM10")
fig5.show()

# 6. Box plot: AQI by month (1-12)
df['month'] = df['Date'].dt.month
fig6 = px.box(df, x='month', y='aqi', title='Monthly AQI Distribution')
print("\n6. Box plot: AQI by Month")
fig6.show()