In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

def plot_pay(year):
    """
    Plot the estimated household income per London borough for ``year``.

    ``year`` goes from 2010 to 2022 (inclusive).

    a) First, it estimates the household income per borough using the average
       household size, the London-wide employment rate and the median weekly
       work-based earnings per person.

    b) Then, it plots a map per borough with a colour gradient showing the
       estimated household income for ``year``. Boroughs without data are
       drawn in red and their names are printed.

    Raises:
        ValueError: if ``year`` is outside 2010-2022.
        KeyError: if ``pay.csv`` has no column for ``year``.
        IndexError: if the unemployment file has no row for ``year``.
    """
    # Validate the input year
    if year < 2010 or year > 2022:
        raise ValueError("Year must be between 2010 and 2022")

    # Constants of the estimate (see the description above): average household
    # size and the number of weeks used to annualise the weekly pay.
    average_household_size = 2.6
    weeks_per_year = 52

    # Load the ward shapefile and dissolve the wards into boroughs
    shapefile_path = "London-wards-2018/London-wards-2018_ESRI/London_Ward.shp"
    gdf = gpd.read_file(shapefile_path)
    boroughs = gdf.dissolve(by='DISTRICT').reset_index()

    # Load the pay CSV data and keep only the column for the requested year.
    # The explicit copy makes the column assignment below act on an independent
    # frame instead of on a slice of the frame that was read from disk.
    pay_csv_path = "pay.csv"
    year_str = str(year)
    pay_data = pd.read_csv(pay_csv_path)[['Area', year_str]].copy()

    # Anything that is not a number (e.g. the '#' and '!' markers) becomes NaN
    pay_data[year_str] = pd.to_numeric(pay_data[year_str], errors='coerce')

    # Keep one row per area (as load_pay_data does) so that the left merge
    # below cannot duplicate a borough polygon.
    pay_data = pay_data.drop_duplicates(subset=['Area'])

    # Index by area and give the value column a year-independent name
    pay_data = pay_data.set_index('Area')
    pay_data.columns = ['Pay']

    # Adjust the name in the CSV data to match 'City of Westminster'
    pay_data = pay_data.rename(index={'Westminster': 'City of Westminster'})

    # Load the unemployment data and pick the first row of the requested year
    unemployment_csv_path = "unemployement_edit.csv"
    unemployment_data = pd.read_csv(unemployment_csv_path)
    unemployment_data['Year'] = pd.to_datetime(unemployment_data['Date']).dt.year
    rates_for_year = unemployment_data.loc[
        unemployment_data['Year'] == year, 'Unemployment rate: London'
    ]
    if rates_for_year.empty:
        raise IndexError(
            f"No unemployment rate found for {year} in {unemployment_csv_path}"
        )
    unemployment_rate = rates_for_year.iloc[0]

    # NOTE(review): this assumes the rate is stored as a fraction (0-1); if the
    # file stores a percentage, it must be divided by 100 first - confirm.
    employment_rate = 1 - unemployment_rate

    # Calculate the estimated household income
    pay_data['Estimated_Household_Income'] = (
        pay_data['Pay'] * employment_rate * average_household_size * weeks_per_year
    )

    # Merge the pay data with the borough geometries using 'DISTRICT' and 'Area'
    boroughs = boroughs.merge(pay_data, left_on='DISTRICT', right_index=True, how='left')

    # Report any boroughs with missing data
    missing_data_boroughs = boroughs[boroughs['Estimated_Household_Income'].isnull()]
    print("Boroughs with missing data:\n", missing_data_boroughs['DISTRICT'].tolist())

    # Plot the borders of the boroughs and fill with estimated household income data for the specified year
    fig, ax = plt.subplots(figsize=(10, 10))
    boroughs.boundary.plot(ax=ax, color='black')
    boroughs.plot(
        column='Estimated_Household_Income',
        cmap='Blues_r',
        legend=True,
        ax=ax,
        missing_kwds={"color": "red", "label": "Missing data"},
    )

    # Remove axes and extra edges
    ax.set_axis_off()
    ax.margins(0)

    # Add a legend entry explaining the red (missing data) boroughs
    handles, labels = ax.get_legend_handles_labels()
    handles.append(
        plt.Line2D([0], [0], marker='o', color='w', label='Missing data',
                   markerfacecolor='red', markersize=10)
    )
    ax.legend(handles=handles, loc='upper right')

    ax.set_title(f'Estimated Household Income per borough in {year} (in £)')
    plt.show()

# Example usage: draw the estimated household income map for 2022.
plot_pay(2022)


The pattern is the same as in the previous plot (which showed work-based income per person).

I'm multiplying the previously plotted variable by constant terms and plotting it again, so of course the visualization doesn't change: rescaling every borough by the same factor leaves the colour gradient as it was.

I realize that what I did was pointless.

---

Plot correlation
- Maybe that would help to observe the relationship between trust and work-based income more closely.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess

def load_pay_data(year, csv_path="pay.csv"):
    """
    Load the pay figures of one year, indexed by area.

    Args:
        year: Year whose column should be read; it is matched against the CSV
            header after conversion with ``str``.
        csv_path: Location of the pay CSV. It must contain an 'Area' column
            and one column per year.

    Returns:
        A DataFrame indexed by 'Area' with a single float column 'Pay'.
        Values that are not numbers (such as the '#' marker) are NaN, only the
        first row of a repeated area is kept, and 'Westminster' is renamed to
        'City of Westminster'.

    Raises:
        KeyError: if the CSV has no column for ``year``.
    """
    csv_data = pd.read_csv(csv_path)

    year_str = str(year)
    if year_str not in csv_data.columns:
        raise KeyError(f"No column for year {year_str} in {csv_path}")

    # Work on an explicit copy: assigning into a column selection of the
    # original frame is what triggers pandas' chained-assignment warning.
    pay_data = csv_data[['Area', year_str]].copy()

    # errors='coerce' turns every non-numeric entry into NaN, so no separate
    # replacement of the '#' marker is needed.
    pay_data[year_str] = pd.to_numeric(pay_data[year_str], errors='coerce')

    # Remove duplicates (the first occurrence of each area wins)
    pay_data = pay_data.drop_duplicates(subset=['Area'])

    # Index by area and give the value column a year-independent name
    pay_data = pay_data.set_index('Area')
    pay_data.columns = ['Pay']

    # Adjust the name in the CSV data to match 'City of Westminster'
    pay_data = pay_data.rename(index={'Westminster': 'City of Westminster'})

    return pay_data

def load_trust_data(year, csv_path="pas_data_borough (1).csv"):
    """
    Load the 'Trust MPS' proportion of one year, indexed by borough.

    Args:
        year: Year to select; a row is kept when the text of its 'Date' value
            contains ``str(year)``.
        csv_path: Location of the borough-level CSV. It must contain the
            columns 'Date', 'Measure', 'Borough' and 'Proportion'.

    Returns:
        A DataFrame indexed by 'Borough' with a single float column 'Trust'.
        Values that are not numbers (such as the '#' marker) are NaN and
        'Westminster' is renamed to 'City of Westminster'.

    Note:
        When several rows of the same borough match the year, only the first
        one in file order is kept; the others are discarded, not averaged.
    """
    csv_data = pd.read_csv(csv_path)

    # Filter the data for the 'Trust MPS' measure and the specified year.
    # Dates are compared as text so that missing or non-string dates simply
    # do not match instead of breaking the boolean mask.
    is_trust = csv_data['Measure'] == 'Trust MPS'
    in_year = csv_data['Date'].astype(str).str.contains(str(year), regex=False, na=False)
    trust_data = csv_data.loc[is_trust & in_year, ['Borough', 'Proportion']].copy()

    # Replace the whole column (rather than writing into it with .loc[:, ...])
    # so that it really becomes float; errors='coerce' maps every non-numeric
    # entry, including '#', to NaN.
    trust_data['Proportion'] = pd.to_numeric(trust_data['Proportion'], errors='coerce')

    # Remove duplicates (the first occurrence of each borough wins)
    trust_data = trust_data.drop_duplicates(subset=['Borough'])

    # Index by borough and give the value column a consistent name
    trust_data = trust_data.set_index('Borough')
    trust_data.columns = ['Trust']

    # Adjust the name in the CSV data to match 'City of Westminster'
    trust_data = trust_data.rename(index={'Westminster': 'City of Westminster'})

    return trust_data

def plot_correlation(pay_year, trust_year):
    """
    Scatter-plot borough pay against trust in the MPS with a LOWESS trend line.

    The pay of ``pay_year`` and the trust proportion of ``trust_year`` are
    joined on the borough name; boroughs that are missing in either data set,
    or that have no value in one of them, are left out. The correlation
    coefficient of the two columns is printed and an interactive plotly
    figure is shown.

    Args:
        pay_year: Year passed to ``load_pay_data``.
        trust_year: Year passed to ``load_trust_data``.

    Raises:
        ValueError: if no borough has both a pay and a trust value.
    """
    pay_data = load_pay_data(pay_year)
    trust_data = load_trust_data(trust_year)

    # Inner join on the borough index, then keep complete rows only so that
    # the coefficient, the trend line and the points use the same boroughs.
    merged_data = pay_data.merge(trust_data, left_index=True, right_index=True)
    merged_data = merged_data.dropna(subset=['Pay', 'Trust'])
    if merged_data.empty:
        raise ValueError(
            f"No borough has both pay data for {pay_year} and trust data for {trust_year}"
        )

    # Calculate the correlation coefficient
    correlation = merged_data[['Pay', 'Trust']].corr().loc['Pay', 'Trust']
    print(f"Correlation coefficient between income and trust: {correlation:.2f}")

    # Apply LOWESS smoothing (returns rows of (x, fitted y) sorted by x)
    lowess_smoothed = lowess(merged_data['Trust'], merged_data['Pay'], frac=0.3)

    # Name both years in the title when they differ, so that the figure does
    # not claim that all the data comes from pay_year.
    if pay_year == trust_year:
        period = f'{pay_year}'
    else:
        period = f'{pay_year} (income) and {trust_year} (trust)'
    title = f'Correlation between Income and Trust in MPS in {period}'

    # Create the scatter plot
    fig = px.scatter(
        merged_data,
        x='Pay',
        y='Trust',
        labels={'Pay': 'Median Weekly Work-Based Earnings (£)', 'Trust': 'Trust in MPS Proportion'},
        title=title,
        hover_name=merged_data.index,
    )

    # Add the LOWESS trend line
    fig.add_trace(
        go.Scatter(
            x=lowess_smoothed[:, 0],
            y=lowess_smoothed[:, 1],
            mode='lines',
            line=dict(color='red', width=2),
            name='LOWESS trend',
            opacity=0.5,
        )
    )

    # Title and axis labels are already set by px.scatter; only the template
    # needs to be applied here.
    fig.update_layout(template='plotly_white')

    # To export a static image instead of showing the figure:
    # fig.write_image(f'Income_and_Trust_{pay_year}.png')
    fig.show()

# Example usage: compare the 2019 pay figures with the 2019 trust figures.
plot_correlation(2019, 2019)


The scatter plot above shows the relationship between the two variables (trust and work-based earnings), together with a non-linear (LOWESS) regression line that shows the 'trend'.

- Perfect correlation (e.g. when the two variables are the same) would appear as a straight diagonal line (e.g. f(x) = x).


The weak correlation between these two variables suggests that the income of people in London is not linked to their trust in the police.

It seems that income isn't related to trust in a meaningful way.

That probably means we can't come up with recommendations to give to the MPS.