In [None]:
import pandas as pd

# Read the EYFSP headline-measures CSV into the main working frame
df = pd.read_csv(
    '/content/1_eyfsp_headline_measures_2022_2024.csv'
)

# Report the (rows, columns) shape, then render the first five rows
print("Shape:", df.shape)
df.head(n=5)


In [None]:
# Normalise column names: lowercase, spaces -> underscores, '%' -> 'percent'.
# regex=False keeps both replacements literal regardless of pandas version.
df.columns = (
    df.columns.str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('%', 'percent', regex=False)
)

# Preview the cleaned column names. Only the last expression of a cell is
# rendered automatically, so this intermediate preview has to be printed.
print(df.columns.tolist())

# Summary statistics (rendered as the cell output)
df.describe()


In [None]:
# Keep only the rows whose 'breakdown' value is 'Total' (no breakdown).
# .copy() gives df_total its own data, so assigning columns to it later
# does not raise pandas' SettingWithCopyWarning.
df_total = df[df['breakdown'] == 'Total'].copy()

# Count how many of those rows fall under each region
df_total['region_name'].value_counts()


In [None]:
# Convert GLD % to numeric; entries that cannot be parsed become NaN.
# .assign() rebinds df_total to a new frame instead of writing into a
# filtered slice of df, which avoids SettingWithCopyWarning.
df_total = df_total.assign(
    gld_percentage=pd.to_numeric(df_total['gld_percentage'], errors='coerce')
)

# Average GLD % per region, sorted from lowest to highest.
# NOTE(review): this is an unweighted mean over every 'Total' row that shares
# a region_name; if the file mixes years or geographic levels they all count
# equally — confirm that is the intended measure.
region_gld = (
    df_total.groupby('region_name')['gld_percentage']
    .mean()
    .reset_index()
    .sort_values(by='gld_percentage', ascending=True)
)

# Show the results
region_gld


In [None]:
import pandas as pd

# Read the deprivation breakdown of the EYFSP headline measures
df_deprivation = pd.read_csv(
    '/content/4_eyfsp_headline_measures_deprivation_2022_2024.csv'
)

# Normalise the headers: lowercase, trim surrounding whitespace,
# then turn remaining spaces into underscores
df_deprivation.columns = [
    header.lower().strip().replace(' ', '_')
    for header in df_deprivation.columns
]

# Render the first rows to get a feel for the data
df_deprivation.head()


In [None]:
# Display the (already cleaned) column names of the deprivation dataset
df_deprivation.columns


In [None]:
# List the distinct values found in the deprivation-decile column
decile_column = df_deprivation['idaci_decile']
decile_column.unique()


In [None]:
# Drop the rows whose decile is the literal 'Total', then cast the decile to
# int and GLD % to a numeric dtype (unparseable GLD values become NaN)
not_total = df_deprivation['idaci_decile'] != 'Total'
df_deprivation_clean = df_deprivation.loc[not_total].assign(
    idaci_decile=lambda frame: frame['idaci_decile'].astype(int),
    gld_percentage=lambda frame: pd.to_numeric(frame['gld_percentage'], errors='coerce'),
)

# Mean GLD % per decile, ordered from decile 1 (most deprived) to 10 (least)
gld_by_deprivation = (
    df_deprivation_clean.groupby('idaci_decile')['gld_percentage']
    .mean()
    .reset_index()
    .sort_values('idaci_decile')
)

# Render the summary table
gld_by_deprivation


In [None]:
import plotly.express as px

# Text shown on each bar: mean GLD % rounded to one decimal, with a '%' suffix
bar_labels = gld_by_deprivation['gld_percentage'].round(1).astype(str) + '%'

# Human-readable axis titles keyed by column name
axis_labels = {
    'idaci_decile': 'Deprivation Decile (1 = Most Deprived)',
    'gld_percentage': 'Average % Achieving Good Level of Development (GLD)'
}

# Bar chart of mean GLD % per decile, coloured by the same value
fig = px.bar(
    data_frame=gld_by_deprivation,
    x='idaci_decile',
    y='gld_percentage',
    text=bar_labels,
    labels=axis_labels,
    title='GLD% by Deprivation Decile (2022–2024)',
    color='gld_percentage',
    color_continuous_scale='YlOrRd'
)

# Put the labels above the bars and leave 5 points of headroom for them
fig.update_traces(textposition='outside')
y_upper = gld_by_deprivation['gld_percentage'].max() + 5
fig.update_layout(yaxis_range=[0, y_upper])
fig.show()


In [None]:
import matplotlib.pyplot as plt

# Strip any '%' signs and surrounding whitespace, then convert GLD % to float.
# Values that still cannot be parsed become NaN.
# Note: this overwrites df_deprivation["gld_percentage"] in place.
df_deprivation["gld_percentage"] = pd.to_numeric(
    df_deprivation["gld_percentage"].astype(str).str.replace('%', '', regex=False).str.strip(),
    errors='coerce',
)

# Group by sex and calculate average GLD %
# NOTE(review): this averages every row of the deprivation file, including any
# rows whose idaci_decile is 'Total' — confirm that mixing them is intended.
gender_avg_gld = df_deprivation.groupby("sex")["gld_percentage"].mean().reset_index()

# Prepare data (already cleaned above)
genders = gender_avg_gld["sex"]
values = gender_avg_gld["gld_percentage"]

# Set up plot size
plt.figure(figsize=(8, 5))

# Plot vertical bar chart (one bar per value of "sex")
bars = plt.bar(genders, values, color=["#4f83cc", "#f28281", "#c0c0c0"])

# Add title and axis labels.
# The title is plain text: matplotlib's default font has no emoji glyphs, so an
# emoji here renders as an empty box and raises a "Glyph missing" warning.
plt.title("GLD % Comparison by Gender (EYFSP 2022–2024)", fontsize=14)
plt.xlabel("Gender")
plt.ylabel("Average Good Level of Development (%)")
plt.ylim(0, 100)

# Add % labels just above each bar, centred horizontally on the bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 1, f"{yval:.1f}%", ha='center', va='bottom', fontsize=11)

# Beautify layout
plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()


In [None]:
# Work on an independent copy so the trend analysis cannot alter df
df_trend = df.copy(deep=True)

# Distinct time_period codes present in the data
df_trend.loc[:, "time_period"].unique()



In [None]:
import matplotlib.pyplot as plt

# Keep the rows whose country_name is "England".
# NOTE(review): if regional / local-authority rows also carry
# country_name == "England", this does not isolate national-level rows and the
# mean below mixes levels and breakdowns — confirm against the data.
# .copy() makes df_england an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
df_england = df_trend[df_trend["country_name"] == "England"].copy()

# Map the six-digit time_period code to a readable academic-year label.
# Codes not listed here map to NaN and are dropped by the groupby below.
df_england["year"] = df_england["time_period"].astype(str).map({
    "202122": "2021–22",
    "202223": "2022–23",
    "202324": "2023–24"
})

# Convert GLD % to float (unparseable values become NaN).
# Plain column assignment replaces the column, dtype included; writing through
# .loc[:, col] sets values in place and can leave the column as object dtype.
df_england["gld_percentage"] = pd.to_numeric(df_england["gld_percentage"], errors="coerce")

# Group and calculate average GLD % per academic year
trend = df_england.groupby("year")["gld_percentage"].mean().reset_index()

# Plot
plt.figure(figsize=(8, 5))
plt.plot(trend["year"], trend["gld_percentage"], marker="o", linewidth=2, color="royalblue")

# Titles and labels.
# The title is plain text: matplotlib's default font has no emoji glyphs, so an
# emoji here renders as an empty box and raises a "Glyph missing" warning.
plt.title("National GLD % Trend (England, 2021–24)", fontsize=14)
plt.xlabel("Academic Year")
plt.ylabel("Good Level of Development (%)")
plt.ylim(0, 100)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import zipfile
import os

# Where the uploaded boundary archive lives (adjust if the filename differs)
# and the folder it should be unpacked into
zip_path = "/content/Regions_December_2024_Boundaries_EN_BFC_-8742955026742668245.zip"
extract_path = "/content/ons_regions_shapefile"

# Unpack every member of the archive into extract_path
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# Show the names of the extracted files
os.listdir(extract_path)


In [None]:
import geopandas as gpd

# Read the regional boundary shapefile from the unzipped directory
# (make sure the path matches your folder structure)
gdf = gpd.read_file(
    "/content/ons_regions_shapefile/RGN_DEC_24_EN_BFC.shp"
)

# Render the first rows to confirm the file loaded
gdf.head()



In [None]:
# Mean GLD % per region as a two-column frame, with the mean cast explicitly
# to float
region_scores = (
    df_total.groupby("region_name")["gld_percentage"]
    .mean()
    .reset_index()
    .astype({"gld_percentage": float})
)



In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Load the regional boundaries (update the path if it differs)
gdf = gpd.read_file("/content/ons_regions_shapefile/RGN_DEC_24_EN_BFC.shp")

# Rename the boundary file's region-name column so it matches region_scores
gdf = gdf.rename(columns={"RGN24NM": "region_name"})

# The merge below is an inner join, so any region whose name does not match
# exactly in both tables would vanish from the map. Report such names instead
# of dropping them silently.
unmatched = sorted(set(gdf["region_name"]) ^ set(region_scores["region_name"]))
if unmatched:
    print("Warning: regions not matched between shapefile and scores:", unmatched)

map_df = gdf.merge(region_scores, on="region_name")

# Choropleth of mean GLD % per region
fig, ax = plt.subplots(1, 1, figsize=(10, 12))
map_df.plot(column="gld_percentage", ax=ax, legend=True, cmap="YlOrRd", edgecolor="black")

# NOTE(review): region_scores is built from the breakdown == 'Total' rows, so
# the title describes all children rather than FSM-eligible children — confirm.
# pad=20 is passed to set_title; as a bare statement it had no effect.
ax.set_title(
    "Average GLD % by Region\nPercentage of children reaching a Good Level of Development",
    fontsize=14,
    pad=20,
)
ax.axis("off")
plt.show()






