In [2]:
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [3]:
# Load the raw city-level dataset; the path is relative to the notebook's working directory.
DATA_FILE = "cities_air_quality_water_pollution.18-10-2021.csv"
df = pd.read_csv(DATA_FILE)

# Show the frame as loaded (column names and text values still carry stray quotes).
df

Unnamed: 0,City,"""Region""","""Country""","""AirQuality""","""WaterPollution"""
0,New York City,"""New York""","""United States of America""",46.816038,49.504950
1,"Washington, D.C.","""District of Columbia""","""United States of America""",66.129032,49.107143
2,San Francisco,"""California""","""United States of America""",60.514019,43.000000
3,Berlin,"""""","""Germany""",62.364130,28.612717
4,Los Angeles,"""California""","""United States of America""",36.621622,61.299435
...,...,...,...,...,...
3958,Yanbu,"""Medina Province""","""Saudi Arabia""",0.000000,50.000000
3959,Cordoba,"""Andalusia""","""Spain""",85.714286,8.333333
3960,Vic,"""Catalonia""","""Spain""",100.000000,0.000000
3961,Segovia,"""Castile and Leon""","""Spain""",100.000000,0.000000


### Cleaning the data

In [4]:
# The raw headers for these columns are written as a space followed by the
# quoted name (e.g. ' "Region"'); map each one to its bare name.
quoted_headers = ('Region', 'Country', 'AirQuality', 'WaterPollution')
header_map = {f' "{bare}"': bare for bare in quoted_headers}

df = df.rename(columns=header_map)

In [5]:
# Text columns whose values carry surrounding whitespace and literal double quotes.
cols = ['City', 'Region', 'Country']

for text_col in cols:
    # Trim whitespace first, then drop every double-quote character (plain string match, not a regex).
    trimmed = df[text_col].str.strip()
    df[text_col] = trimmed.str.replace('"', '', regex=False)

In [6]:
# Display the frame after the rename and quote-stripping cells to confirm the clean-up.
df

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York City,New York,United States of America,46.816038,49.504950
1,"Washington, D.C.",District of Columbia,United States of America,66.129032,49.107143
2,San Francisco,California,United States of America,60.514019,43.000000
3,Berlin,,Germany,62.364130,28.612717
4,Los Angeles,California,United States of America,36.621622,61.299435
...,...,...,...,...,...
3958,Yanbu,Medina Province,Saudi Arabia,0.000000,50.000000
3959,Cordoba,Andalusia,Spain,85.714286,8.333333
3960,Vic,Catalonia,Spain,100.000000,0.000000
3961,Segovia,Castile and Leon,Spain,100.000000,0.000000


### Preprocessing data for the subplots

In [18]:
# Scores that count as "exact" round values.
targets = (0, 25, 50, 75, 100)

# Fixed row order for the summary table. The pie charts pair these rows
# positionally with their labels, so the order must not depend on the data.
category_order = ["Exact values", "Non exact values"]


def count_exact_vs_non_exact(scores, exact_values=targets):
    """Count scores that equal one of ``exact_values`` versus all other scores.

    Parameters
    ----------
    scores : pandas.Series
        Numeric score column (e.g. ``df["AirQuality"]``).
    exact_values : iterable of numbers
        Values treated as "exact"; defaults to ``targets``.

    Returns
    -------
    pandas.Series
        Integer counts indexed by ``category_order``. A category with no
        rows is reported as 0 instead of being dropped.
    """
    # Classify with a boolean mask instead of writing strings into the numeric
    # column and replacing the remaining numbers afterwards.
    labels = scores.isin(exact_values).map(
        {True: "Exact values", False: "Non exact values"}
    )
    return labels.value_counts().reindex(category_order, fill_value=0)


air_counts = count_exact_vs_non_exact(df["AirQuality"])
water_counts = count_exact_vs_non_exact(df["WaterPollution"])

# One column per metric, one row per category (rows follow category_order).
summary_df = pd.DataFrame({
    "AirQuality": air_counts,
    "WaterPollution": water_counts
}).astype(int)

summary_df

Unnamed: 0,AirQuality,WaterPollution
Exact values,1787,2275
Non exact values,2176,1688


### Creating the Visualisation

In [28]:
# Creating subplots: a 2x2 grid with histograms in the wide left column
# and pie charts in the narrow right column.

row_layout = [{"type": "histogram"}, {"type": "pie"}]
panel_titles = [
    "Air Quality Histogram",
    "Distribution of exact values  <br> vs non exact values",
    "Water Pollution Histogram",
    "",  # the second pie has no title of its own
]

fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[dict(cell) for cell in row_layout] for _ in range(2)],
    subplot_titles=panel_titles,
    column_widths=[0.75, 0.25],
    row_heights=[0.7, 0.7],  # equal weights, so both rows get the same height
    vertical_spacing=0.07,
)

# AirQuality (row 1) and WaterPollution (row 2) histograms.
# Both use the same 5-wide bins starting at 0 and the same styling.

score_bins = dict(start=0, end=101, size=5)

for grid_row, score_col in enumerate(("AirQuality", "WaterPollution"), start=1):
    fig.add_trace(
        go.Histogram(
            x=df[score_col],
            xbins=score_bins,
            name=score_col,
            marker_color="slategrey",
            showlegend=False,
        ),
        row=grid_row, col=1,
    )

# Label the y axis of each histogram panel.
for grid_row in (1, 2):
    fig.update_yaxes(title_text="count", row=grid_row, col=1)


# Pie charts: share of exact vs non exact values for each metric.

pie_labels = ["Exact values: 0, 25, 50, 75, 100", "Non exact values"]
pie_colors = ["royalblue", "slategrey"]

# summary_df row keys, listed in the same order as pie_labels / pie_colors.
# Rows are selected by key (missing ones count as 0) so a slice can never be
# paired with the wrong label if summary_df's row order or contents change.
summary_rows = ["Exact values", "Non exact values"]

# Per-pie options: only the first pie contributes legend entries, and the
# second pie is rotated by 180 degrees.
pie_options = {
    "AirQuality": dict(showlegend=True),
    "WaterPollution": dict(rotation=180, showlegend=False),
}

for grid_row, (metric, extra_options) in enumerate(pie_options.items(), start=1):
    fig.add_trace(
        go.Pie(
            labels=pie_labels,
            values=summary_df[metric].reindex(summary_rows, fill_value=0),
            marker_colors=pie_colors,
            name=metric,
            **extra_options,
        ),
        row=grid_row, col=2,
    )


# Adding pie chart annotations

# Footnote explaining the grouping, anchored in paper coordinates slightly
# past the right edge of the plotting area (x=1.05) near mid-height.
fig.add_annotation(
    text="Note: Values not exactly equal to 0,25,50,75 or 100 <br>are grouped as 'Non exact values'",
    x=1.05, y=0.49,
    xref="paper", yref="paper",
    showarrow=False,
    font=dict(size=10, color="gray"),
    align="left"
)

# (pie trace index, caption, x divisor, y divisor).
# Each caption is positioned from the sum of the pie's domain bounds divided by
# a hand-tuned divisor; a divisor of 2 would give the exact centre of the pie,
# so these values place the caption off-centre on purpose.
pie_captions = [
    (2, "For Air Quality", 1.85, 1.57),
    (3, "For Water Pollution", 1.8, 1.1),
]
caption_font = dict(size=16, color="gray")

for trace_index, caption, x_divisor, y_divisor in pie_captions:
    pie_domain = fig.data[trace_index].domain
    fig.add_annotation(
        text=caption,
        x=(pie_domain['x'][0] + pie_domain['x'][1]) / x_divisor,
        y=(pie_domain['y'][0] + pie_domain['y'][1]) / y_divisor,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=caption_font,
    )

# Highlighting the exact value bars on the histograms
#
# A filled rectangle is drawn over each histogram bar whose bin starts at an
# exact value (0, 25, 50, 75, 100). The rectangle heights are computed from the
# data instead of being typed in by hand, so they stay correct if the dataset
# or the cleaning steps change.

color = "royalblue"

# Bins to highlight; they match the histogram binning (start=0, size=5).
x_ranges = [(0, 5), (25, 30), (50, 55), (75, 80), (100, 105)]


def count_rows_in_bins(scores, bins=x_ranges):
    """Return, for each ``(lower, upper)`` bin, the number of scores with lower <= score < upper."""
    return [int(((scores >= lower) & (scores < upper)).sum()) for lower, upper in bins]


# (xref, yref) of each histogram's axes -> heights of its highlighted bars.
y_values = {
    ("x1", "y1"): count_rows_in_bins(df["AirQuality"]),
    ("x2", "y2"): count_rows_in_bins(df["WaterPollution"]),
}

for (xref, yref), heights in y_values.items():
    for (x0, x1), y1 in zip(x_ranges, heights):
        fig.add_shape(
            type="rect",
            x0=x0, x1=x1,
            y0=0, y1=y1,
            xref=xref, yref=yref,
            fillcolor=color,
            line_width=0
        )

# Layout tweaks: fixed canvas size, near-touching bars, and the legend
# positioned by paper coordinates.

legend_style = dict(title="", x=0.7, y=0.55)

fig.update_layout(
    width=1000,
    height=800,
    autosize=False,
    bargap=0.01,
    title_text="",
    showlegend=True,
    legend=legend_style,
)

fig.show()

In [27]:
# Export the figure as a standalone HTML page that loads plotly.js from the CDN.
# NOTE: in the saved notebook this cell's execution count (27) is lower than the
# figure cell's (28); run it after the figure cell so the file matches the
# figure shown above.
fig.write_html(
    "air_water_quality.html",
    include_plotlyjs='cdn',
    config={"responsive": True},
)