# Port of Shanghai Investigation

We used GIOVANNI to pull data from the port of Shanghai. This code can be used to analyse any PM2.5, NO2, or SO2 data from that source.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Set the date range to focus on (both bounds are inclusive in the filter below)
start_date = "2011-01-01"
end_date = "2021-01-01"

# Load the CSV file
df = pd.read_csv("nasa_data/port_of_shanghai_pm25_monthly.csv")

# Skip the first few rows that aren't actual data
# NOTE(review): assumes exactly five non-data rows follow the header line — confirm against the export
df = df.iloc[5:].reset_index(drop=True)
df.columns = ["raw_date", "pm25_kg_m3"]

# Keep only rows whose first field starts with a YYYY-MM-DD date.
# na=False makes a missing first field count as "no match" instead of
# producing NaN in the mask, which boolean indexing cannot handle.
df = df[df["raw_date"].str.match(r"\d{4}-\d{2}-\d{2}", na=False)].reset_index(drop=True)

# Convert the date column to datetime format
df["date"] = pd.to_datetime(df["raw_date"])

# Make sure PM2.5 column is numeric
df["pm25_kg_m3"] = df["pm25_kg_m3"].astype(float)

# Filter the data to stay within the selected date range
df = df[(df["date"] >= pd.to_datetime(start_date)) & (df["date"] <= pd.to_datetime(end_date))]

# Convert from kilograms per cubic meter to micrograms per cubic meter (1 kg = 1e9 µg)
df["pm25_ug_m3"] = df["pm25_kg_m3"] * 1e9

# Plot the results
plt.figure(figsize=(10, 5))
plt.plot(df["date"], df["pm25_ug_m3"], color="blue", linewidth=1.5)
plt.title("PM2.5 Levels at Port of Shanghai")
plt.xlabel("Date")
plt.ylabel("PM2.5 (µg/m³)")
plt.grid(True)
plt.tight_layout()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'nasa_data/port_of_shanghai_pm25_monthly.csv'

## Hourly to Daily/Weekly/Monthly Data (First attempt)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Define the time window to analyze (both bounds are inclusive)
start_date = "2011-01-01"
end_date = "2025-01-01"

# Load the hourly PM2.5 data
df = pd.read_csv("nasa_data/port_of_shanghai_pm25_hourly.csv")

# Remove the top rows with metadata and rename columns
# NOTE(review): assumes exactly five non-data rows follow the header line — confirm against the export
df = df.iloc[5:].reset_index(drop=True)
df.columns = ["raw_date", "pm25_kg_m3"]

# Keep rows whose first field starts with a YYYY-MM-DD date.
# na=False makes a missing first field count as "no match" instead of
# producing NaN in the mask, which boolean indexing cannot handle.
df = df[df["raw_date"].str.match(r"\d{4}-\d{2}-\d{2}", na=False)].reset_index(drop=True)

# Convert date column to datetime
df["date"] = pd.to_datetime(df["raw_date"])

# Convert PM2.5 values to float, restrict to the window, and change units to µg/m³
df["pm25_kg_m3"] = df["pm25_kg_m3"].astype(float)
df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
df["pm25_ug_m3"] = df["pm25_kg_m3"] * 1e9  # 1 kg = 1e9 µg

# Function to average by day
def average_daily(data, date_col, value_col):
    """Average ``value_col`` per calendar day, show a line plot, and return the result.

    Args:
        data: DataFrame holding a datetime column and a numeric column.
        date_col: Name of the datetime column.
        value_col: Name of the column to average.

    Returns:
        DataFrame with a datetime ``day`` column and the per-day mean in
        ``value_col``. The input frame is not modified.
    """
    # assign() works on a copy, so the caller's frame never gains a "day" column
    with_day = data.assign(day=data[date_col].dt.date)
    daily = with_day.groupby("day")[value_col].mean().reset_index()
    # Grouping keys are datetime.date objects; convert them back for plotting
    daily["day"] = pd.to_datetime(daily["day"])

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(daily["day"], daily[value_col], color="purple", lw=1.2)
    ax.set_title("Daily Avg PM2.5 – Port of Shanghai")
    ax.set_xlabel("Date")
    ax.set_ylabel("PM2.5 (µg/m³)")
    fig.tight_layout()
    plt.show()

    return daily

# Function to average by week
def average_weekly(data, date_col, value_col):
    """Average ``value_col`` per week, show a line plot, and return the result.

    Args:
        data: DataFrame holding a datetime column and a numeric column.
        date_col: Name of the datetime column.
        value_col: Name of the column to average.

    Returns:
        DataFrame with a ``week`` column (timestamp of the start of each
        weekly period) and the per-week mean in ``value_col``. The input
        frame is not modified.
    """
    data = data.copy()
    # Vectorised week start: avoids calling a Python lambda once per row and
    # mirrors how average_monthly derives its period start.
    data["week"] = data[date_col].dt.to_period("W").dt.to_timestamp()
    weekly = data.groupby("week")[value_col].mean().reset_index()

    plt.figure(figsize=(10, 4))
    plt.plot(weekly["week"], weekly[value_col], color="green", lw=1.2)
    plt.title("Weekly Avg PM2.5 – Port of Shanghai")
    plt.xlabel("Week")
    plt.ylabel("PM2.5 (µg/m³)")
    plt.tight_layout()
    plt.show()

    return weekly

# Function to average by month
def average_monthly(data, date_col, value_col):
    """Average ``value_col`` per calendar month, show a line plot, and return the result.

    Args:
        data: DataFrame holding a datetime column and a numeric column.
        date_col: Name of the datetime column.
        value_col: Name of the column to average.

    Returns:
        DataFrame with a ``month`` column (timestamp of the start of each
        month) and the per-month mean in ``value_col``. The input frame is
        not modified.
    """
    month_start = data[date_col].dt.to_period("M").dt.to_timestamp()
    # assign() works on a copy, so the caller's frame never gains a "month" column
    with_month = data.assign(month=month_start)
    monthly = with_month.groupby("month")[value_col].mean().reset_index()

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(monthly["month"], monthly[value_col], color="blue", lw=1.2)
    ax.set_title("Monthly Avg PM2.5 – Port of Shanghai")
    ax.set_xlabel("Month")
    ax.set_ylabel("PM2.5 (µg/m³)")
    fig.tight_layout()
    plt.show()

    return monthly

# Run the averaging function of choice: leave exactly one call active.
# With the helper definitions from this cell, each call both shows its plot
# and returns the averaged DataFrame.
# daily_data = average_daily(df, "date", "pm25_ug_m3")
# weekly_data = average_weekly(df, "date", "pm25_ug_m3")
monthly_data = average_monthly(df, "date", "pm25_ug_m3")

## Hourly to Daily/Weekly/Monthly Data (Second attempt)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Set the date range for filtering (used as the default window of load_data)
start_date = "2011-01-01"
end_date = "2025-01-01"

# Load and clean the data
def load_data(csv_file, start=start_date, end=end_date):
    """Load a two-column PM2.5 CSV export and return it cleaned and in µg/m³.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
        start: Inclusive lower bound of the date window.
        end: Inclusive upper bound of the date window.

    Returns:
        DataFrame with columns ``raw_date`` (original text), ``pm25_kg_m3``
        (float), ``date`` (datetime) and ``pm25_ug_m3`` (``pm25_kg_m3`` × 1e9),
        restricted to rows with ``start <= date <= end``.

    Raises:
        ValueError: If a surviving value cannot be converted to float.
    """
    df = pd.read_csv(csv_file)

    # Remove top metadata rows
    # NOTE(review): assumes exactly five non-data rows follow the header line — confirm
    df = df.iloc[5:].reset_index(drop=True)
    df.columns = ["raw_date", "pm25_kg_m3"]

    # Keep rows whose first field starts with YYYY-MM-DD. na=False makes a
    # missing first field count as "no match" instead of putting NaN into the
    # mask, which boolean indexing cannot handle.
    is_date_row = df["raw_date"].str.match(r"\d{4}-\d{2}-\d{2}", na=False)
    df = df[is_date_row].reset_index(drop=True)

    # Convert to datetime and numeric
    df["date"] = pd.to_datetime(df["raw_date"])
    df["pm25_kg_m3"] = df["pm25_kg_m3"].astype(float)

    # Filter by date range (inclusive on both ends)
    df = df[(df["date"] >= start) & (df["date"] <= end)]

    # Convert from kg/m³ to µg/m³ (1 kg = 1e9 µg)
    df["pm25_ug_m3"] = df["pm25_kg_m3"] * 1e9

    return df

# Daily average
def average_daily(df, date_col="date", value_col="pm25_ug_m3"):
    """Return the mean of ``value_col`` for each calendar day.

    Args:
        df: DataFrame with a datetime column and a numeric column.
        date_col: Name of the datetime column.
        value_col: Name of the column to average.

    Returns:
        DataFrame with a datetime ``day`` column and a
        ``daily_avg_<value_col>`` column. The input frame is not modified.
    """
    # assign() works on a copy, so the caller's frame never gains a "day" column
    keyed = df.assign(day=df[date_col].dt.date)
    daily_avg = keyed.groupby("day")[value_col].mean().reset_index()
    # Grouping keys are datetime.date objects; turn them back into timestamps
    daily_avg["day"] = pd.to_datetime(daily_avg["day"])
    return daily_avg.rename(columns={value_col: f"daily_avg_{value_col}"})

# Weekly average
def average_weekly(df, date_col="date", value_col="pm25_ug_m3"):
    df = df.copy()
    df["week"] = df[date_col].dt.to_period("W").apply(lambda r: r.start_time)
    weekly_avg = df.groupby("week")[value_col].mean().reset_index()
    weekly_avg.rename(columns={value_col: f"weekly_avg_{value_col}"}, inplace=True)
    return weekly_avg

# Monthly average
def average_monthly(df, date_col="date", value_col="pm25_ug_m3"):
    df = df.copy()
    df["month"] = df[date_col].dt.to_period("M").dt.to_timestamp()
    monthly_avg = df.groupby("month")[value_col].mean().reset_index()
    monthly_avg.rename(columns={value_col: f"monthly_avg_{value_col}"}, inplace=True)
    return monthly_avg

# Plotting helper
def plot_timeseries(df, x_col, y_col, title, xlabel, ylabel="PM2.5 (µg/m³)"):
    """Show a single line chart of ``df[y_col]`` against ``df[x_col]``.

    Args:
        df: DataFrame holding both columns.
        x_col: Column used for the x axis.
        y_col: Column used for the y axis.
        title: Figure title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(df[x_col], df[y_col], lw=1.5)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()

# Plot each type using its own function
def plot_daily(df, value_col=None):
    """Plot a daily-average frame against its ``day`` column.

    Args:
        df: Frame with a ``day`` column, e.g. the output of ``average_daily``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = df.columns[1] if value_col is None else value_col
    plot_timeseries(
        df,
        x_col="day",
        y_col=y_col,
        title="Daily Avg PM2.5 – Port of Shanghai",
        xlabel="Date",
    )

def plot_weekly(df, value_col=None):
    """Plot a weekly-average frame against its ``week`` column.

    Args:
        df: Frame with a ``week`` column, e.g. the output of ``average_weekly``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = df.columns[1] if value_col is None else value_col
    plot_timeseries(
        df,
        x_col="week",
        y_col=y_col,
        title="Weekly Avg PM2.5 – Port of Shanghai",
        xlabel="Week",
    )

def plot_monthly(df, value_col=None):
    """Plot a monthly-average frame against its ``month`` column.

    Args:
        df: Frame with a ``month`` column, e.g. the output of ``average_monthly``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = df.columns[1] if value_col is None else value_col
    plot_timeseries(
        df,
        x_col="month",
        y_col=y_col,
        title="Monthly Avg PM2.5 – Port of Shanghai",
        xlabel="Month",
    )

In [None]:
# Load the hourly PM2.5 export using the default date window of load_data
raw_df = load_data("nasa_data/port_of_shanghai_pm25_hourly.csv")

# Aggregate the hourly values at three resolutions
daily = average_daily(raw_df)
weekly = average_weekly(raw_df)
monthly = average_monthly(raw_df)

# All three plots are shown; comment out the ones you don't need
plot_daily(daily)
plot_weekly(weekly)
plot_monthly(monthly)

## Overlaid by year

In [None]:
# Config: inclusive date window and the column to analyse
start_date = "2015-01-01"
end_date = "2025-12-31"
value_col = "pm25_ug_m3"

# NOTE(review): `df` is not assigned in this cell; it must already exist in the
# notebook with "date" and `value_col` columns — confirm which earlier cell is meant.
in_window = (df["date"] >= start_date) & (df["date"] <= end_date)
filtered_df = df[in_window].copy()

# Weekly means. The code below reads a "weekly_avg_<value_col>" column, so the
# average_weekly in scope must be a variant that renames its output that way.
weekly_data = average_weekly(filtered_df, "date", value_col)

# ISO year and ISO week number of each week start, used to line the years up
iso_parts = weekly_data["week"].dt.isocalendar()
weekly_data["year"] = iso_parts["year"]
weekly_data["iso_week"] = iso_parts["week"]

# Pad every year to weeks 1..53 so all lines share one x axis (gaps stay NaN)
full_weeks = list(range(1, 54))
all_years = sorted(weekly_data["year"].unique())
padded_weeks = []

for year in all_years:
    is_this_year = weekly_data["year"] == year
    year_data = weekly_data.loc[is_this_year, ["iso_week", f"weekly_avg_{value_col}"]]

    # Left-merge onto the full week list so missing weeks appear as NaN rows
    pad_frame = pd.DataFrame({"iso_week": full_weeks}).merge(
        year_data, on="iso_week", how="left"
    )
    pad_frame["year"] = year

    padded_weeks.append(pad_frame)

# One long frame: 53 rows per year
plot_ready_df = pd.concat(padded_weeks, ignore_index=True)

# One line per year on shared axes
fig, ax = plt.subplots(figsize=(12, 5))

for year, group in plot_ready_df.groupby("year"):
    ax.plot(
        group["iso_week"],
        group[f"weekly_avg_{value_col}"],
        marker="o",
        linewidth=1.3,
        label=str(year),
    )

ax.set_title(f"Weekly Avg PM2.5 – Port of Shanghai\n({start_date} to {end_date})")
ax.set_xlabel("Week Number (1–53)")
ax.set_ylabel("PM2.5 (µg/m³)")
ax.set_xlim(1, 53)
ax.set_xticks(range(1, 54, 4))
ax.grid(alpha=0.3)
ax.legend(title="Year", ncol=2, fontsize=8)
fig.tight_layout()
plt.show()

# NO2 Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Define date range and special fill value used in the dataset
start_date = "2014-01-01"
end_date = "2025-01-01"
fill_value = -1.267651e30  # This value is used to mark missing data

# Load and clean the NO₂ dataset
def load_no2_data(csv_file, start=start_date, end=end_date):
    """Load a two-column NO₂ CSV export and return it cleaned.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
        start: Inclusive lower bound of the date window.
        end: Inclusive upper bound of the date window.

    Returns:
        DataFrame with columns ``raw_date`` (original text), ``no2_mol_cm2``
        (float, fill values replaced by NaN) and ``date`` (datetime),
        restricted to rows with ``start <= date <= end``.

    Raises:
        ValueError: If a surviving value cannot be converted to float.
    """
    df = pd.read_csv(csv_file)

    # Skip metadata rows at the top
    # NOTE(review): assumes exactly five non-data rows follow the header line — confirm
    df = df.iloc[5:].reset_index(drop=True)
    df.columns = ["raw_date", "no2_mol_cm2"]

    # Keep rows whose first field starts with YYYY-MM-DD. na=False makes a
    # missing first field count as "no match" instead of putting NaN into the
    # mask, which boolean indexing cannot handle.
    is_date_row = df["raw_date"].str.match(r"\d{4}-\d{2}-\d{2}", na=False)
    df = df[is_date_row].reset_index(drop=True)

    # Convert columns to proper types
    df["date"] = pd.to_datetime(df["raw_date"])
    df["no2_mol_cm2"] = df["no2_mol_cm2"].astype(float)

    # Replace fill values with NaN. Matching uses a relative tolerance rather
    # than exact equality, so a fill value written with a different number of
    # digits (e.g. -1.2676506e+30) is still recognised. The module-level
    # `fill_value` is read at call time.
    values = df["no2_mol_cm2"]
    is_fill = np.isclose(values.to_numpy(), fill_value, rtol=1e-5, atol=0.0)
    df["no2_mol_cm2"] = values.mask(is_fill)

    # Optional: remove any negative values (commented out)
    # df.loc[df["no2_mol_cm2"] < 0, "no2_mol_cm2"] = pd.NA

    # Filter to selected date range (inclusive on both ends)
    df = df[(df["date"] >= start) & (df["date"] <= end)]

    return df

# Daily averaging function
def average_no2_daily(df, date_col="date", value_col="no2_mol_cm2"):
    df = df.copy()
    df["day"] = df[date_col].dt.date
    daily = df.groupby("day")[value_col].mean().reset_index()
    daily["day"] = pd.to_datetime(daily["day"])
    daily.rename(columns={value_col: f"daily_avg_{value_col}"}, inplace=True)
    return daily

# Weekly averaging function
def average_no2_weekly(df, date_col="date", value_col="no2_mol_cm2"):
    df = df.copy()
    df["week"] = df[date_col].dt.to_period("W").apply(lambda r: r.start_time)
    weekly = df.groupby("week")[value_col].mean().reset_index()
    weekly.rename(columns={value_col: f"weekly_avg_{value_col}"}, inplace=True)
    return weekly

# Monthly averaging function
def average_no2_monthly(df, date_col="date", value_col="no2_mol_cm2"):
    df = df.copy()
    df["month"] = df[date_col].dt.to_period("M").dt.to_timestamp()
    monthly = df.groupby("month")[value_col].mean().reset_index()
    monthly.rename(columns={value_col: f"monthly_avg_{value_col}"}, inplace=True)
    return monthly

# Generic plotting function
def plot_no2_timeseries(df, x_col, y_col, title, xlabel, ylabel="NO₂ (mol/m²)"):
    """Show a single line chart of ``df[y_col]`` against ``df[x_col]``.

    Args:
        df: DataFrame holding both columns.
        x_col: Column used for the x axis.
        y_col: Column used for the y axis.
        title: Figure title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
    """
    # NOTE(review): the default label says mol/m² while the data column in this
    # notebook is named no2_mol_cm2 — confirm the unit of the source data.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(df[x_col], df[y_col], lw=1.5)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()

# Helper plots for each time scale
def plot_no2_daily(daily_df, value_col=None):
    """Plot a daily-average NO₂ frame against its ``day`` column.

    Args:
        daily_df: Frame with a ``day`` column, e.g. from ``average_no2_daily``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = daily_df.columns[1] if value_col is None else value_col
    plot_no2_timeseries(
        daily_df,
        x_col="day",
        y_col=y_col,
        title="Daily Avg NO₂ – Port of Shanghai",
        xlabel="Date",
    )

def plot_no2_weekly(weekly_df, value_col=None):
    """Plot a weekly-average NO₂ frame against its ``week`` column.

    Args:
        weekly_df: Frame with a ``week`` column, e.g. from ``average_no2_weekly``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = weekly_df.columns[1] if value_col is None else value_col
    plot_no2_timeseries(
        weekly_df,
        x_col="week",
        y_col=y_col,
        title="Weekly Avg NO₂ – Port of Shanghai",
        xlabel="Week",
    )

def plot_no2_monthly(monthly_df, value_col=None):
    """Plot a monthly-average NO₂ frame against its ``month`` column.

    Args:
        monthly_df: Frame with a ``month`` column, e.g. from ``average_no2_monthly``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = monthly_df.columns[1] if value_col is None else value_col
    plot_no2_timeseries(
        monthly_df,
        x_col="month",
        y_col=y_col,
        title="Monthly Avg NO₂ – Port of Shanghai",
        xlabel="Month",
    )

In [None]:
# Load the NO₂ export using the default date window of load_no2_data
no2_df = load_no2_data("nasa_data/port_of_shanghai_NO2_daily.csv")

# Compute averages
# NOTE: `daily`, `weekly` and `monthly` are the same notebook-level names the
# PM2.5 cell assigns, so running this cell rebinds them to the NO₂ results.
daily = average_no2_daily(no2_df)
weekly = average_no2_weekly(no2_df)
monthly = average_no2_monthly(no2_df)

# Plots (comment out the ones you don't need)
plot_no2_daily(daily)
plot_no2_weekly(weekly)
plot_no2_monthly(monthly)


# SO2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Config: default date window of load_so2_data (both bounds inclusive)
start_date = "2011-01-01"
end_date = "2025-01-01"

def load_so2_data(csv_file, start=start_date, end=end_date):
    """Load a two-column SO₂ CSV export and return it cleaned and in µg/m³.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
        start: Inclusive lower bound of the date window.
        end: Inclusive upper bound of the date window.

    Returns:
        DataFrame with columns ``raw_date`` (original text), ``so2_kg_m3``
        (float), ``date`` (datetime) and ``so2_ug_m3`` (``so2_kg_m3`` × 1e9),
        restricted to rows with ``start <= date <= end``.

    Raises:
        ValueError: If a surviving value cannot be converted to float.
    """
    df = pd.read_csv(csv_file)
    # NOTE(review): assumes exactly five non-data rows follow the header line — confirm
    df = df.iloc[5:].reset_index(drop=True)
    df.columns = ["raw_date", "so2_kg_m3"]

    # Keep rows whose first field starts with YYYY-MM-DD. na=False makes a
    # missing first field count as "no match" instead of putting NaN into the
    # mask, which boolean indexing cannot handle.
    is_date_row = df["raw_date"].str.match(r"\d{4}-\d{2}-\d{2}", na=False)
    df = df[is_date_row].reset_index(drop=True)
    df["date"] = pd.to_datetime(df["raw_date"])
    df["so2_kg_m3"] = df["so2_kg_m3"].astype(float)

    df = df[(df["date"] >= start) & (df["date"] <= end)]
    # 1 kg = 1e9 µg
    df["so2_ug_m3"] = df["so2_kg_m3"] * 1e9
    return df

def average_so2_daily(df, date_col="date", value_col="so2_ug_m3"):
    df = df.copy()
    df["day"] = df[date_col].dt.date
    daily = df.groupby("day")[value_col].mean().reset_index()
    daily["day"] = pd.to_datetime(daily["day"])
    daily.rename(columns={value_col: f"daily_avg_{value_col}"}, inplace=True)
    return daily

def average_so2_weekly(df, date_col="date", value_col="so2_ug_m3"):
    df = df.copy()
    df["week"] = df[date_col].dt.to_period("W").apply(lambda r: r.start_time)
    weekly = df.groupby("week")[value_col].mean().reset_index()
    weekly.rename(columns={value_col: f"weekly_avg_{value_col}"}, inplace=True)
    return weekly

def average_so2_monthly(df, date_col="date", value_col="so2_ug_m3"):
    df = df.copy()
    df["month"] = df[date_col].dt.to_period("M").dt.to_timestamp()
    monthly = df.groupby("month")[value_col].mean().reset_index()
    monthly.rename(columns={value_col: f"monthly_avg_{value_col}"}, inplace=True)
    return monthly

def plot_so2_timeseries(df, x_col, y_col, title, xlabel, ylabel="SO₂ (µg/m³)"):
    """Show a single line chart of ``df[y_col]`` against ``df[x_col]``.

    Args:
        df: DataFrame holding both columns.
        x_col: Column used for the x axis.
        y_col: Column used for the y axis.
        title: Figure title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(df[x_col], df[y_col], lw=1.5)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()

def plot_so2_daily(df, value_col=None):
    """Plot a daily-average SO₂ frame against its ``day`` column.

    Args:
        df: Frame with a ``day`` column, e.g. from ``average_so2_daily``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = df.columns[1] if value_col is None else value_col
    plot_so2_timeseries(
        df,
        x_col="day",
        y_col=y_col,
        title="Daily Avg SO₂ – Port of Shanghai",
        xlabel="Date",
    )

def plot_so2_weekly(df, value_col=None):
    """Plot a weekly-average SO₂ frame against its ``week`` column.

    Args:
        df: Frame with a ``week`` column, e.g. from ``average_so2_weekly``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = df.columns[1] if value_col is None else value_col
    plot_so2_timeseries(
        df,
        x_col="week",
        y_col=y_col,
        title="Weekly Avg SO₂ – Port of Shanghai",
        xlabel="Week",
    )

def plot_so2_monthly(df, value_col=None):
    """Plot a monthly-average SO₂ frame against its ``month`` column.

    Args:
        df: Frame with a ``month`` column, e.g. from ``average_so2_monthly``.
        value_col: Column to plot; when None, the frame's second column is used.
    """
    y_col = df.columns[1] if value_col is None else value_col
    plot_so2_timeseries(
        df,
        x_col="month",
        y_col=y_col,
        title="Monthly Avg SO₂ – Port of Shanghai",
        xlabel="Month",
    )

In [None]:
# Load the hourly SO₂ export using the default date window of load_so2_data
so2_df = load_so2_data("nasa_data/port_of_shanghai_SO2_hourly.csv")

# Aggregate at three resolutions
daily_so2 = average_so2_daily(so2_df)
weekly_so2 = average_so2_weekly(so2_df)
monthly_so2 = average_so2_monthly(so2_df)

# Show one plot per resolution (comment out the ones you don't need)
plot_so2_daily(daily_so2)
plot_so2_weekly(weekly_so2)
plot_so2_monthly(monthly_so2)