In [79]:
# Google Colab only: mount Google Drive so that files under /content/drive are readable
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings

# Hide FutureWarnings whose message mentions "observed=False".
# NOTE(review): presumably the pandas groupby notice about the `observed` default
# when grouping on a categorical column - confirm against the installed pandas version.
warnings.filterwarnings('ignore', category=FutureWarning, message='.*observed=False.*')

In [12]:

# Location of the combined Madrid CSV on the mounted Google Drive
dataset = '/content/drive/My Drive/Visualisation in data science/data/madrid_combined.csv'

# Read the whole file into memory; no date parsing is requested at this point
df = pd.read_csv(dataset)
# Print only the first row as a quick look at the available columns
print(df.head(1))

                  date  BEN    CO  EBE  MXY  NMHC       NO_2        NOx  OXY  \
0  2001-08-01 01:00:00  NaN  0.37  NaN  NaN   NaN  58.400002  87.150002  NaN   

         O_3   PM10  PXY  SO_2  TCH  TOL   station  year  PM25  NO  CH4  
0  34.529999  105.0  NaN  6.34  NaN  NaN  28079001  2001   NaN NaN  NaN  


In [80]:
# Data preparation: calendar columns, dropdown choices and axis labels

# Weekday order shared by the categorical column and the plots
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Hour labels on a 12-hour clock ("12:00 AM", "1:00 AM", ..., "11:00 PM")
time_labels = [f"{hr % 12 or 12}:00 {'AM' if hr < 12 else 'PM'}" for hr in range(24)]

# Parse the timestamps, then derive the calendar fields from them.
# day_of_week is stored as an ordered categorical so that it sorts Monday..Sunday.
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = pd.Categorical(df['date'].dt.day_name(), categories=days_order, ordered=True)
df['hour'] = df['date'].dt.hour
df['month'] = df['date'].dt.month
df['month_name'] = df['date'].dt.month_name()

# Every column that is not bookkeeping counts as a pollutant,
# provided it holds more than 100 non-missing readings
non_pollutant_cols = ['date', 'day_of_week', 'hour', 'station', 'year', 'month', 'month_name']
pollutants = [
    name for name in df.columns
    if name not in non_pollutant_cols and df[name].count() > 100
]

# Dropdown entries are (label, value) pairs: the label is the station id as text
stations = [(str(code), code) for code in df['station'].unique()]

# Months present in the data, with a number -> name lookup and matching dropdown entries
available_months = sorted(df['month'].unique())
month_names = {m: pd.Timestamp(2023, m, 1).strftime('%B') for m in available_months}
month_options = [(label, number) for number, label in month_names.items()]

# Dropdown widgets; each starts on its first / "all" entry
pollutant_dropdown = widgets.Dropdown(
    description='Pollutant:',
    options=pollutants,
    value=next(iter(pollutants), None),
    style=dict(description_width='initial'),
    layout=dict(width='300px'),
)

station_dropdown = widgets.Dropdown(
    description='Station:',
    options=[('All Stations', 'all'), *stations],
    value='all',
    style=dict(description_width='initial'),
    layout=dict(width='300px'),
)

month_dropdown = widgets.Dropdown(
    description='Month:',
    options=[('All Months', 0), *month_options],
    value=0,
    style=dict(description_width='initial'),
    layout=dict(width='300px'),
)

In [81]:
# Plotting callback for the dropdown widgets, plus a helper for message-only figures
def _show_message(message):
    """Show an otherwise empty 10x6 figure with `message` centred in it."""
    plt.figure(figsize=(10, 6))
    plt.text(0.5, 0.5, message, ha='center', va='center', fontsize=14)
    plt.axis('off')
    plt.show()


def update_plot(pollutant, station, month):
    """Draw an hour-by-weekday heatmap of the mean `pollutant`, topped by a per-weekday bar chart.

    Reads the module-level `df`, `days_order`, `time_labels` and `month_names`.

    Parameters
    ----------
    pollutant : str
        Column of `df` to average. A falsy value only shows a "missing selection" message.
    station : 'all' or station identifier
        'all' keeps every station; any other value is compared with `df['station']`.
    month : int
        0 keeps every month; any other value is compared with `df['month']`.

    Returns
    -------
    None
        The figure (or a message figure) is displayed as a side effect. Exceptions raised
        while building the figure are caught, shown in a message figure and printed.
    """
    if not pollutant:
        _show_message("Missing pollutant selection")
        return

    # Boolean indexing already yields new frames and nothing below mutates them,
    # so the full DataFrame does not need to be copied on every widget change.
    filtered_df = df
    if station != 'all':
        filtered_df = filtered_df[filtered_df['station'] == station]
    if month != 0:
        filtered_df = filtered_df[filtered_df['month'] == month]

    if len(filtered_df) == 0:
        _show_message("No data available for the selected filters")
        return

    station_text = "All Stations" if station == 'all' else station
    month_text = "All Months" if month == 0 else month_names.get(month, str(month))

    valid_data_count = filtered_df[pollutant].notna().sum()
    if valid_data_count == 0:
        _show_message(f"No valid {pollutant} data for Station: {station_text}, Month: {month_text}")
        return

    fig = None
    try:
        # valid_data_count > 0 guarantees this frame is non-empty
        valid_df = filtered_df[filtered_df[pollutant].notna()]

        # Mean per (hour, weekday), laid out as a full 24 x 7 grid. Cells without any
        # reading stay NaN so the heatmap leaves them blank instead of drawing a
        # misleading zero concentration. observed=False is spelled out because
        # day_of_week is categorical and the implicit default is deprecated.
        pollution_by_hour_day = (
            valid_df.groupby(['hour', 'day_of_week'], observed=False)[pollutant]
            .mean()
            .unstack()
            .reindex(index=range(24), columns=days_order)
        )

        # Mean per weekday for the bar chart, in Monday..Sunday order
        daily_means = (
            valid_df.groupby('day_of_week', observed=False)[pollutant]
            .mean()
            .reindex(days_order)
        )

        fig = plt.figure(figsize=(12, 10))
        gs = GridSpec(5, 1, height_ratios=[1, 4, 0.1, 0.2, 0.2])

        # Heatmap: colour scale starts at 0 and leaves 10% headroom above the maximum
        ax_heatmap = fig.add_subplot(gs[1])
        max_val = pollution_by_hour_day.max().max()
        vmax = max_val * 1.1 if max_val > 0 else 1  # fallback when nothing is positive
        sns.heatmap(pollution_by_hour_day, cmap="Reds", ax=ax_heatmap, cbar=False,
                    vmin=0, vmax=vmax)

        ax_heatmap.set_ylabel('Hours of the day')
        ax_heatmap.set_xlabel('')

        # Heatmap cell i is centred at i + 0.5. Pin the tick positions before labelling:
        # seaborn may thin its automatic ticks, and labels must match the tick count.
        x_positions = np.arange(len(days_order)) + 0.5
        ax_heatmap.set_yticks(np.arange(len(time_labels)) + 0.5)
        ax_heatmap.set_yticklabels(time_labels, fontsize=9, rotation=0)
        ax_heatmap.set_xticks(x_positions)
        ax_heatmap.set_xticklabels(days_order, fontsize=10, rotation=0)

        # Bar chart of the weekday means, aligned with the heatmap columns
        ax_hist = fig.add_subplot(gs[0], sharex=ax_heatmap)
        ax_hist.bar(x_positions, daily_means, width=0.8, color='lightblue', edgecolor='blue', alpha=0.7)
        # The bars show mean concentration per weekday, not a count of exceedances
        ax_hist.set_ylabel(f'Mean {pollutant}')
        ax_hist.spines['top'].set_visible(False)
        ax_hist.spines['right'].set_visible(False)
        ax_hist.set_xlim(0, len(days_order))

        # Horizontal colorbar for the heatmap
        ax_colorbar = fig.add_subplot(gs[2])
        fig.colorbar(ax_heatmap.collections[0], cax=ax_colorbar, orientation='horizontal')
        ax_colorbar.set_title(f'{pollutant} Concentration')

        # Subtitle describing the active filters; "All Stations" also gets a station count
        if station == 'all':
            station_count = filtered_df['station'].nunique()
            station_text = f"All Stations ({station_count})"
        subtitle = f"Station: {station_text} | Month: {month_text} | Valid readings: {valid_data_count}"

        fig.suptitle(f'MA_51_1: Weekly Pollution Patterns - {pollutant}', fontsize=16, fontweight='bold', y=0.98)
        fig.text(0.5, 0.92, subtitle, ha='center', fontsize=12)

        plt.tight_layout(rect=[0, 0, 1, 0.90])

        # The weekday names are shown under the heatmap only
        ax_hist.tick_params(axis='x', which='both', bottom=False, labelbottom=False)

        plt.show()

    except Exception as e:
        # Best effort: drop the half-built figure and report the problem instead of crashing
        if fig is not None:
            plt.close(fig)
        _show_message(f"Error creating visualization: {str(e)}")
        print(f"Error details: {str(e)}")

# Wire the three dropdowns to update_plot: each keyword is the update_plot argument
# that receives the corresponding widget's current value.
widgets.interact(update_plot,
                 pollutant=pollutant_dropdown,
                 station=station_dropdown,
                 month=month_dropdown)

interactive(children=(Dropdown(description='Pollutant:', layout=Layout(width='300px'), options=('BEN', 'CO', '…