In [4]:
import pandas as pd
import numpy as np
import os

folder_path = "."
file_names = [f"d{y}_v3.csv" for y in range(23, 15, -1)]
dfs = {}

for file in file_names:
    df = pd.read_csv(os.path.join(folder_path, file))
    df.columns = df.columns.str.strip().str.lower()
    df["quantity released (kg)"] = pd.to_numeric(df["quantity released (kg)"], errors="coerce").fillna(0)
    df["substance name"] = df["substance name"].astype(str).str.lower()
    df["region"] = df["region"].astype(str).str.strip().str.lower()
    df["year"] = int(file[1:3]) + 2000
    dfs[file.replace("_v3.csv", "")] = df

combined_df = pd.concat(dfs.values(), ignore_index=True)

valid_regions = [
    "north east", "north west", "yorkshire and the humber", "east midlands",
    "west midlands", "east of england", "london", "south east", "south west"
]
combined_df = combined_df[combined_df["region"].isin(valid_regions)]

combined_df["reporting threshold (kg)"] = pd.to_numeric(combined_df["reporting threshold (kg)"], errors='coerce')
combined_df["operator name"] = combined_df["operator name"].astype(str).str.lower()
combined_df["site address"] = combined_df["site address"].astype(str).str.strip()

top5_substances = (
    combined_df.groupby("substance name")["quantity released (kg)"]
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .index.tolist()
)

top_df = combined_df[combined_df["substance name"].isin(top5_substances)].copy()
qty = top_df["quantity released (kg)"].dropna()
threshold = top_df["reporting threshold (kg)"].mean()

print(f"\n===== Combined Summary for Top 5 Substances =====")
print(f"Top 5 Substances: {top5_substances}")
print(f"Mean: {qty.mean():.4f}")
print(f"Std Dev: {qty.std():.4f}")
print(f"Min: {qty.min():.4f}")
print(f"Max: {qty.max():.4f}")
print(f"5th Percentile: {np.percentile(qty, 5):.4f}")
print(f"10th Percentile: {np.percentile(qty, 10):.4f}")
print(f"25th Percentile: {np.percentile(qty, 25):.4f}")
print(f"Median (50th Percentile): {np.percentile(qty, 50):.4f}")
print(f"75th Percentile: {np.percentile(qty, 75):.4f}")
print(f"90th Percentile: {np.percentile(qty, 90):.4f}")
print(f"95th Percentile: {np.percentile(qty, 95):.4f}")
print(f"Reporting Threshold (mean): {threshold:.4f}")
print(f"Nb of Obs: {len(qty)}")
print(f"Nb Below Threshold: {(qty < threshold).sum()}")
print(f"Nb of Different Sites: {top_df['site address'].nunique()}")

mean_years_per_site = top_df.groupby("site address")["year"].nunique().mean()
print(f"Mean Years per Site: {mean_years_per_site:.2f}")

urban_share = (top_df["region"] == "urban").mean()
print(f"Urban Share: {urban_share:.2%}")

wastewater_keywords = ["water", "wastewater", "sewerage"]
top_df["is_wastewater"] = top_df["operator name"].apply(
    lambda x: any(word in x for word in wastewater_keywords)
)
print(f"Wastewater Company Share: {top_df['is_wastewater'].mean():.2%}")

print("\n--- Region Shares ---")
for region in valid_regions:
    share = (top_df["region"] == region).mean()
    print(f"{region.title()}: {share:.2%}")

print("\n--- Year Shares ---")
for year in range(2016, 2024):
    share = (top_df["year"] == year).mean()
    print(f"{year}: {share:.2%}")



===== Combined Summary for Top 5 Substances =====
Top 5 Substances: ['indeno(1,2,3-cd)pyrene', 'benzo(a)pyrene', 'fluoranthene', 'cypermethrin', 'heptachlor']
Mean: 0.6260
Std Dev: 2.8597
Min: 0.0000
Max: 100.0000
5th Percentile: 0.0000
10th Percentile: 0.0000
25th Percentile: 0.0000
Median (50th Percentile): 0.0000
75th Percentile: 0.1900
90th Percentile: 1.6000
95th Percentile: 2.7500
Reporting Threshold (mean): 0.5879
Nb of Obs: 4277
Nb Below Threshold: 3481
Nb of Different Sites: 413
Mean Years per Site: 3.87
Urban Share: 0.00%
Wastewater Company Share: 60.18%

--- Region Shares ---
North East: 6.59%
North West: 19.85%
Yorkshire And The Humber: 13.47%
East Midlands: 7.27%
West Midlands: 10.54%
East Of England: 12.37%
London: 4.40%
South East: 16.79%
South West: 8.72%

--- Year Shares ---
2016: 12.11%
2017: 11.25%
2018: 12.02%
2019: 12.58%
2020: 12.53%
2021: 13.79%
2022: 13.65%
2023: 12.06%


In [7]:
combined_df["route name"] = combined_df["route name"].astype(str).str.strip().str.lower()
unique_routes = combined_df["route name"].dropna().unique()
print("Unique route names:")
for route in sorted(unique_routes):
    print(route)


Unique route names:
controlled waters
wastewater


In [8]:
combined_df["substance name"] = combined_df["substance name"].astype(str).str.lower()
combined_df["quantity released (kg)"] = pd.to_numeric(combined_df["quantity released (kg)"], errors="coerce").fillna(0)

top5_substances = (
    combined_df.groupby("substance name")["quantity released (kg)"]
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .index.tolist()
)

top5_df = combined_df[combined_df["substance name"].isin(top5_substances)]

total_quantity_top5 = top5_df["quantity released (kg)"].sum()

print("Top 5 substances:", top5_substances)
print(f"Total quantity released (top 5 substances combined): {total_quantity_top5:.2f} kg")


Top 5 substances: ['indeno(1,2,3-cd)pyrene', 'benzo(a)pyrene', 'fluoranthene', 'cypermethrin', 'heptachlor']
Total quantity released (top 5 substances combined): 2677.54 kg
