# <u>UK Strandings data cleaning</u> 

## Exploratory Data Analysis (EDA) and cleaning

In [None]:
# Importing Python packages and modules
# !pip install contextily
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import numpy as np
import matplotlib.pyplot as plt
import folium
from shapely.geometry import Point, LineString, MultiLineString
from shapely.ops import unary_union
from IPython.display import FileLink
import calendar
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import calendar, math
import fiona
import os
import warnings
import dataframe_image as dfi
from branca.element import Template, MacroElement
import calendar
import contextily as ctx

## Importing the data
#### Strandings data from [Cetacean Strandings Investigation Program (CSIP)](https://ukstrandings.org/) 

In [None]:
# Strandings CSV (points)
# Read with geopandas; the Longitude/Latitude columns are coerced to numeric
# in a later cell before the point geometry is built.
Strandings_df = gpd.read_file("UK_Ire_Merg.csv")

## Strandings data basic info

#### I did some basic cleaning, removing and renaming some columns so they matched in both datasets, and combined the UK and Irish data in Excel 

In [None]:
# Preview the first rows of the merged strandings table
Strandings_df.head()

In [None]:
# Per-column summary statistics of the merged strandings table
Strandings_df.describe()

#### Everything looks OK, although 79 species is too many, so that will need cleaning

In [None]:
# Column names, dtypes and non-null counts
Strandings_df.info()

-----------------------------------------

#### Converting into a geodataframe

In [None]:
# Listing the distinct raw values that cannot be parsed as numbers,
# first for longitude and then for latitude
for label, column in (("longitude", "Longitude"), ("latitude", "Latitude")):
    unparseable = pd.to_numeric(Strandings_df[column], errors='coerce').isna()
    print(f"Non-numeric {label} examples:")
    print(Strandings_df.loc[unparseable, column].unique())

In [None]:
# Boolean masks flagging rows whose coordinate value does not parse as a number
non_numeric_lon = pd.to_numeric(Strandings_df['Longitude'], errors='coerce').isna()
non_numeric_lat = pd.to_numeric(Strandings_df['Latitude'], errors='coerce').isna()

# Reporting how many rows each mask flags
print(f"Non-numeric longitude count: {non_numeric_lon.sum()}")
print(f"Non-numeric latitude count: {non_numeric_lat.sum()}")


In [None]:
# Coerce both coordinate columns to numbers; anything unparseable becomes NaN
for coord in ('Longitude', 'Latitude'):
    Strandings_df[coord] = pd.to_numeric(Strandings_df[coord], errors='coerce')

# Discard any record that is missing either coordinate
Strandings_df = Strandings_df.dropna(subset=['Longitude', 'Latitude'])

# Build the point geometry column needed to convert the DF into a GDF
Strandings_df['geometry'] = list(
    map(Point, zip(Strandings_df['Longitude'], Strandings_df['Latitude']))
)

# EPSG:4326 is used as the CRS because it is common for mapping visualisations
Strandings_gdf = gpd.GeoDataFrame(Strandings_df, geometry='geometry', crs="EPSG:4326")

# Quick check of the result
print(f"CRS: {Strandings_gdf.crs}")
Strandings_gdf.head()


In [None]:
# Column overview after the conversion; the row count shows how many records were dropped
Strandings_gdf.info()

#### Dropped 28 non-numeric lat longs from 27576 down to 27548

----------------------

## Data visualization and outliers

In [None]:
# Make sure the output folder exists, otherwise savefig raises FileNotFoundError
# on a fresh checkout
os.makedirs("Data_cleaning_images", exist_ok=True)

# Reprojecting to Web Mercator (EPSG:3857) for basemap tiles
gdf_web = Strandings_gdf.to_crs(epsg=3857)

# Plotting every stranding as a solid black point
fig, ax = plt.subplots(figsize=(10, 10))
gdf_web.plot(
    ax=ax,
    color='black',
    markersize=10,
    alpha=0.7)

# Adding a basemap underneath the points
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)

# Cleaning up axes
ax.set_axis_off()
plt.title("Strandings Outliers", fontsize=15)

# Saving and displaying
plt.savefig(
    "Data_cleaning_images/Strandings_Cleaning_outliers_points_map.png",
    dpi=150, bbox_inches="tight")
plt.show()


#### There are some major outliers that are worth a closer look

In [None]:
# Folium base map with a scale bar
m = folium.Map(location=[53.079, -23.815], zoom_start=4, control_scale=True)

# Styling shared by every stranding marker
marker_style = dict(
    radius=1,
    color='black',
    fill=True,
    fill_color='black',
    fill_opacity=0.7)

# One marker per stranding; the popup carries ID, local authority and species
# so that outliers can be inspected by clicking on them
for _, record in Strandings_gdf.iterrows():
    popup_text = (
        f"ID: {record['ID']}<br>"
        f"Local Authority: {record['Local Authority']}<br>"
        f"Species: {record['Species']}"
    )
    folium.CircleMarker(
        location=[record['Latitude'], record['Longitude']],
        popup=popup_text,
        **marker_style
    ).add_to(m)

# Title (fixed at top)
title_html = """
<div style="
 position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #999;
 border-radius: 6px; font-weight: 600;">
Original Strandings
</div>
"""
m.get_root().html.add_child(folium.Element(title_html))

# Displaying the map
m

In [None]:
# Saving the interactive map as an HTML file for use in the write-up
m.save("Data_cleaning_images/Strandings_Cleaning_Orginal_Strandings_Maps.html")

#### Link to original strandings data [Netlify map](https://orginal-strandings.netlify.app/)

In [None]:
# Zoomed-in Folium map for a closer look at messy data points
m = folium.Map(location=[50.459, -4.662], zoom_start=8)

# Styling shared by every stranding marker
marker_style = dict(
    radius=2,
    color='black',
    fill=True,
    fill_color='black',
    fill_opacity=0.7)

# One marker per stranding, with ID / local authority / species in the popup
for _, record in Strandings_gdf.iterrows():
    popup_text = (
        f"ID: {record['ID']}<br>"
        f"Local Authority: {record['Local Authority']}<br>"
        f"Species: {record['Species']}"
    )
    folium.CircleMarker(
        location=[record['Latitude'], record['Longitude']],
        popup=popup_text,
        **marker_style
    ).add_to(m)

# Displaying the map
m

--------

## _Removing outliers_

## Uploading a coastline geopackage to remove outliers and snap remaining points to

In [None]:
# Ireland boundary, read from layer "ADM_ADM_0" of the coastline GeoPackage
ireland = gpd.read_file("Ireland_coastline.gpkg", layer="ADM_ADM_0")
print(f"Ireland CRS: {ireland.crs}")

# Quick check: feature count and first rows
print(f"Ireland features: {len(ireland)}")
display(ireland.head())

# Visualising Ireland coastline
fig, ax = plt.subplots(figsize=(6, 6))
ireland.plot(ax=ax, edgecolor="black", facecolor="lightgray", linewidth=0.5)
ax.set_title("Ireland boundary", fontsize=16)
ax.set_axis_off()
plt.show()

In [None]:
# UK boundary, read from layer "ADM_ADM_0" of the coastline GeoPackage
uk = gpd.read_file("UK_coastline.gpkg", layer="ADM_ADM_0")
print(f"UK CRS: {uk.crs}")

# Quick check: feature count and first rows
print(f"UK features: {len(uk)}")
display(uk.head())

# Visualising UK coastline
fig, ax = plt.subplots(figsize=(6, 6))
uk.plot(ax=ax, edgecolor="black", facecolor="lightgray", linewidth=0.5)
ax.set_title("United Kingdom boundary", fontsize=16)
ax.set_axis_off()
plt.show()

In [None]:
# Combine the Ireland and UK coastlines.
# The combined frame is labelled with Ireland's CRS, so the UK layer is first
# reprojected to that CRS when the two differ (no-op when they already match).
uk_aligned = uk
if ireland.crs is not None and uk.crs is not None and uk.crs != ireland.crs:
    uk_aligned = uk.to_crs(ireland.crs)

combined = gpd.GeoDataFrame(
    pd.concat([ireland, uk_aligned], ignore_index=True),
    crs=ireland.crs)
# Checking the CRS
print("Combined CRS:", combined.crs)

In [None]:
# Using dissolve (with no grouping column, so all features are merged into one)
# to remove the Northern Ireland/Ireland inland boundary
outline = combined.dissolve()

In [None]:
# Plotting the dissolved outline as a single filled shape
fig, ax = plt.subplots(figsize=(6, 6))
outline.plot(ax=ax, edgecolor="black", facecolor="lightgray")
ax.set_axis_off()
ax.set_title("Combined Ireland & UK (Dissolved)", fontsize=16)

# Writing the figure to disk, then showing it
plt.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Combined_Coastline.png",
    dpi=150,
    bbox_inches="tight")
plt.show()

In [None]:
# Silencing the unary_union deprecation warning for aesthetics
# (kept because it also applies to any later cell that still uses unary_union)
warnings.filterwarnings(
    "ignore",
    message=".*unary_union.*deprecated.*")


# Saving as a geopackage into the main folder for use with the main notebooks
outline.to_file("../merged_outline.gpkg", layer="outline", driver="GPKG")
# Reprojecting to British National Grid (EPSG:27700) (bng) as it is most accurate for the UK
outline_bng = outline.to_crs("EPSG:27700")
# Merging all outline geometries into one shape. union_all() is the replacement
# for the deprecated unary_union; fall back to unary_union on older geopandas.
outline_geoms = outline_bng.geometry
if hasattr(outline_geoms, "union_all"):
    merged_outline = outline_geoms.union_all()
else:
    merged_outline = outline_geoms.unary_union
# Building the coastline as the boundary (line work) of the merged outline
coastline_boundary = merged_outline.boundary
# Printing the CRS of the strandings, which are still in their original CRS here
print("Original strandings CRS:", Strandings_gdf.crs)

In [None]:
# Reproject strandings to EPSG:27700 so they share the CRS of the coastline
# boundary used for buffering and snapping
gdf_bng = Strandings_gdf.to_crs("EPSG:27700")
print("Strandings reprojected CRS:", gdf_bng.crs)

-------------------

##### Remove any strandings that are further from the coast than 5 km

In [None]:
# Simplifying the coastline boundary first to reduce geometry complexity,
# then buffering it by 5000 CRS units (the 5 km band around the coast)
simplified_boundary = coastline_boundary.simplify(tolerance=100)
coast_buffer = simplified_boundary.buffer(5000)

# Wrapping the buffer polygon in a one-row GeoDataFrame so it can be joined against
coast_buffer_gdf = gpd.GeoDataFrame(geometry=[coast_buffer], crs="EPSG:27700")

# Inner spatial join keeps only the strandings that fall within the buffer
join_options = {"how": "inner", "predicate": "within"}
strandings_within_5km = gpd.sjoin(gdf_bng, coast_buffer_gdf, **join_options)

# Reporting how many strandings are being dropped
original_count, filtered_count = len(gdf_bng), len(strandings_within_5km)
report_lines = (
    ("Strandings before filtering", original_count),
    ("Strandings within 5 km of coast", filtered_count),
    ("Strandings dropped", original_count - filtered_count),
)
for label, value in report_lines:
    print(f"{label}: {value}")



##### After some experimentation with different-sized buffers, 5km seems to be a good size for removing the worst outliers and only losing 649 points

---------------

## Visualising the removal of the outliers

In [None]:
# Use index difference to get dropped strandings: every row of gdf_bng whose
# index label does not appear in the within-5-km result
dropped_strandings = gdf_bng.loc[~gdf_bng.index.isin(strandings_within_5km.index)]


In [None]:
# Folium takes latitude/longitude, so the kept points, the dropped points and
# the buffer are all converted from EPSG:27700 to WGS84 (EPSG:4326)
wgs84_epsg = 4326
strandings_within_5km_wgs = strandings_within_5km.to_crs(epsg=wgs84_epsg)
dropped_strandings_wgs = dropped_strandings.to_crs(epsg=wgs84_epsg)
coast_buffer_wgs = gpd.GeoDataFrame(
    geometry=[coast_buffer], crs="EPSG:27700"
).to_crs(epsg=wgs84_epsg)


In [None]:
# Base map with a scale bar
m = folium.Map(location=[55, -3], zoom_start=5, control_scale=True)

# One FeatureGroup per layer so that the LayerControl acts like a legend
fg_within5 = folium.FeatureGroup(name="Strandings within 5 km", show=True)
fg_dropped = folium.FeatureGroup(name="Dropped strandings", show=True)
fg_buffer = folium.FeatureGroup(name="5 km Coastal Buffer", show=True)

# (points, colour, popup heading, target group):
# strandings inside the buffer are drawn in green, dropped ones in red
point_layers = [
    (strandings_within_5km_wgs, "green", "Within 5km", fg_within5),
    (dropped_strandings_wgs, "red", "Dropped", fg_dropped),
]
for points, colour, heading, group in point_layers:
    for _, row in points.iterrows():
        marker_popup = folium.Popup(
            f"{heading}<br>ID: {row.get('ID', 'N/A')}",
            max_width=260)
        folium.CircleMarker(
            location=[row.geometry.y, row.geometry.x],
            radius=2,
            color=colour, weight=0.8,
            fill=True, fill_color=colour, fill_opacity=0.6,
            popup=marker_popup,
        ).add_to(group)

# Buffer outline (blue, unfilled)
buffer_layer = folium.GeoJson(
    coast_buffer_wgs.geometry[0],
    name="5 km Coastal Buffer",
    style_function=lambda feat: {"color": "blue", "weight": 2, "fill": False},
    tooltip="5 km Coastal Buffer",
)
buffer_layer.add_to(fg_buffer)

# Attaching the groups to the map
for group in (fg_within5, fg_dropped, fg_buffer):
    group.add_to(m)

# Title for aesthetics
title_html = """
<div style="
 position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,0.2);">
Strandings within 5 km of Coastline
</div>
"""
m.get_root().html.add_child(folium.Element(title_html))

# Legend
legend_html = """
<div style="
 position: fixed; bottom: 60px; left: 30px; z-index:9999;
 background: white; padding: 10px; border: 1px solid #777; border-radius: 6px;
 box-shadow: 0 1px 3px rgba(0,0,0,.2); font-size: 14px;">
<b>Legend</b><br>
<span style="display:inline-block;width:12px;height:12px;background:green;border:1px solid #555;border-radius:6px;margin-right:6px;"></span>
Strandings within 5 km<br>
<span style="display:inline-block;width:12px;height:12px;background:red;border:1px solid #555;border-radius:6px;margin-right:6px;"></span>
Dropped strandings<br>
<span style="display:inline-block;width:12px;height:12px;background:transparent;border:2px solid blue;margin-right:6px;"></span>
5 km Coastal Buffer
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Clickable layer control
folium.LayerControl(collapsed=False).add_to(m)

# Displaying
m



In [None]:
# Saving the interactive map as an HTML file for use in the write-up
m.save("Data_cleaning_images/Strandings_Cleaning_Dropped_Strandings_Buffer.html")

#### Link to dropped strandings map [Netlify map](https://dropped-strandings-buffer.netlify.app/) 

-----------------------------

## _Snapping strandings to the coast_

In [None]:
# Keeping a backup of all reprojected strandings (including the dropped outliers)
gdf_bng_all = gdf_bng.copy() 
# From here on gdf_bng only holds the strandings within the 5 km buffer,
# i.e. outliers are removed before snapping
gdf_bng = strandings_within_5km.copy()


In [None]:
# Defining the snapping function
def snap_to_coast_project(pt, boundary=None):
    """Snap a point onto a line geometry.

    The point is projected onto the line to get a distance along it, and that
    distance is then interpolated back into a point lying on the line.

    Parameters
    ----------
    pt : point geometry to snap; must be in the same CRS as ``boundary``.
    boundary : optional line geometry exposing ``project`` and ``interpolate``.
        Defaults to the notebook-level ``coastline_boundary``, which keeps
        existing one-argument calls (e.g. ``GeoSeries.apply``) working.

    Returns
    -------
    The point on ``boundary`` at the projected distance.
    """
    if boundary is None:
        boundary = coastline_boundary
    distance_along = boundary.project(pt)
    return boundary.interpolate(distance_along)
    
# Snapping every stranding inside the 5km buffer onto the coastline
gdf_bng["snapped_geom"] = gdf_bng.geometry.apply(snap_to_coast_project)

# Keeping only rows whose snapped geometry is neither empty nor missing
snapped = gdf_bng["snapped_geom"]
unusable = snapped.is_empty | snapped.isna()
gdf_snapped_clean = gdf_bng[~unusable].copy()

# Switching the active geometry to the snapped points and converting to WGS84
gdf_snapped_wgs84 = gdf_snapped_clean.set_geometry("snapped_geom")
gdf_snapped_wgs84 = gdf_snapped_wgs84.to_crs("EPSG:4326")

# Snapped latitude / longitude as plain columns, with a fresh 0..n-1 index
snapped_coords = pd.DataFrame({
    "snapped_latitude": gdf_snapped_wgs84.geometry.y,
    "snapped_longitude": gdf_snapped_wgs84.geometry.x,
})
snapped_coords = snapped_coords.reset_index(drop=True)

# Giving the cleaned frame the same fresh index so the two line up row by row
gdf_bng_reset = gdf_snapped_clean.reset_index(drop=True)

# Attaching the snapped lat/longs to the main dataframe, side by side
strandings_with_snapped = pd.concat([gdf_bng_reset, snapped_coords], axis=1)

# Plotting to check
fig, ax = plt.subplots(figsize=(7, 7))

# Coastline boundary in grey
boundary_series = gpd.GeoSeries([coastline_boundary], crs="EPSG:27700")
boundary_series.plot(ax=ax, color="grey")

# 5km buffer strandings in blue
gdf_bng.plot(ax=ax, color="blue", markersize=8, label="Original")

# Snapped points in red
snapped_series = gpd.GeoSeries(gdf_bng["snapped_geom"], crs="EPSG:27700")
snapped_series.plot(ax=ax, color="red", markersize=8, label="Snapped")

# Labels
ax.set_title("5km Buffer Strandings vs. Snapped Strandings (EPSG:27700)")
ax.set_axis_off()
ax.legend()

# Saving and displaying the map
plt.savefig(
    "Data_cleaning_images/Strandings_Cleaning_5kmBuffervsSnapped.png",
    dpi=150,
    bbox_inches="tight")
plt.show()

In [None]:
# Preview the table with the snapped latitude/longitude columns attached
strandings_with_snapped.head()

In [None]:
# Checking no records have been lost in the process: the row count and the
# non-null counts of the snapped columns should match the within-5-km count
strandings_with_snapped.info()

##### Strandings within 5 km of coast: 27064, so no records lost during snapping.

In [None]:
# Tidying up the dataframe: the raw coordinates and both earlier geometry
# columns are dropped, and the geometry is rebuilt from the snapped lon/lat
attribute_columns = strandings_with_snapped.drop(
    columns=["Latitude", "Longitude", "geometry", "snapped_geom"])
snapped_points = gpd.points_from_xy(
    strandings_with_snapped["snapped_longitude"],
    strandings_with_snapped["snapped_latitude"])

merged_strandings_with_snapped = gpd.GeoDataFrame(
    attribute_columns,
    geometry=snapped_points,
    crs="EPSG:4326")
merged_strandings_with_snapped = merged_strandings_with_snapped.reset_index(drop=True)


# Saving so the file can be loaded later rather than snapping each time the code is re-run
merged_strandings_with_snapped.to_file("../merged_strandings_with_snapped.gpkg", driver="GPKG")


In [None]:
# Column overview of the tidied, snapped GeoDataFrame
merged_strandings_with_snapped.info()

In [None]:
# Preview the tidied, snapped GeoDataFrame
merged_strandings_with_snapped.head()

In [None]:
# Loading the saved snapped file so the snapping code does not have to be rerun each time.
# Note: this rebinds strandings_with_snapped to the tidied version read from disk.
strandings_with_snapped = gpd.read_file("../merged_strandings_with_snapped.gpkg")
# Checking all the data is there
print("CRS:", strandings_with_snapped.crs)
strandings_with_snapped.info()

In [None]:
# Base map with a scale bar
m = folium.Map(location=[50.459, -4.662], zoom_start=8, control_scale=True)

# One FeatureGroup per cleaning stage so that the LayerControl acts like a legend
fg_original = folium.FeatureGroup(name="Original Strandings", show=True)
fg_within5 = folium.FeatureGroup(name="Strandings Within 5 km", show=True)
fg_snapped = folium.FeatureGroup(name="Snapped Strandings", show=True)


def _stage_popup(stage, row):
    """Popup naming the cleaning stage plus the record's ID, local authority and species."""
    return folium.Popup(
        f"{stage}<br>ID: {row.get('ID','N/A')}<br>"
        f"Local Authority: {row.get('Local Authority','N/A')}<br>"
        f"Species: {row.get('Species','N/A')}",
        max_width=260)


def _stage_marker(lat, lon, colour, popup):
    """Small circle marker in the given colour at (lat, lon)."""
    return folium.CircleMarker(
        location=[lat, lon],
        radius=2, color=colour, weight=0.8,
        fill=True, fill_color=colour, fill_opacity=0.6,
        popup=popup,
    )


# Original strandings (blue), positioned by the raw Latitude/Longitude columns
for _, row in Strandings_gdf.iterrows():
    _stage_marker(
        row['Latitude'], row['Longitude'], "blue", _stage_popup("Original", row)
    ).add_to(fg_original)

# Strandings within 5 km (green), positioned by their point geometry
for _, row in strandings_within_5km_wgs.iterrows():
    _stage_marker(
        row.geometry.y, row.geometry.x, "green", _stage_popup("Within 5 km", row)
    ).add_to(fg_within5)

# Snapped strandings (red), positioned by the snapped coordinate columns
for _, row in strandings_with_snapped.iterrows():
    _stage_marker(
        row['snapped_latitude'], row['snapped_longitude'], "red",
        _stage_popup("Snapped", row)
    ).add_to(fg_snapped)

# Attaching the groups to the map
for stage_group in (fg_original, fg_within5, fg_snapped):
    stage_group.add_to(m)

# Title
title_html = """
<div style="
 position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,0.2);">
South West UK — Original, Within 5 km & Snapped Strandings
</div>
"""
m.get_root().html.add_child(folium.Element(title_html))

# Legend
legend_html = """
<div style="
 position: fixed; bottom: 60px; left: 30px; z-index:9999;
 background: white; padding: 10px; border: 1px solid #777; border-radius: 6px;
 box-shadow: 0 1px 3px rgba(0,0,0,.2); font-size: 14px;">
<b>Legend</b><br>
<span style="display:inline-block;width:12px;height:12px;background:blue;border:1px solid #555;border-radius:6px;margin-right:6px;"></span>
Original Strandings<br>
<span style="display:inline-block;width:12px;height:12px;background:green;border:1px solid #555;border-radius:6px;margin-right:6px;"></span>
Strandings Within 5 km<br>
<span style="display:inline-block;width:12px;height:12px;background:red;border:1px solid #555;border-radius:6px;margin-right:6px;"></span>
Snapped Strandings
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Layer control
folium.LayerControl(collapsed=False).add_to(m)

# Displaying the map
m


In [None]:
# Saving the interactive comparison map as an HTML file
m.save("Data_cleaning_images/Strandings_Cleaning_SW_Snapped_vs_Original_Map.html")

#### Link to SW strandings vs snapped strandings [Netlify map](https://sw-cleaning-stages.netlify.app/) 

#### Checking the map, it is possible to see that the remaining strandings snapped effectively to the coastline by comparing the pop-ups

--------------------------------------------------------------------------------------

## Removing mass strandings

#### Removing mass strandings and leaving them as a single point

In [None]:
# Assessing the number of mass vs single stranding events
# (counts of each distinct value in the 'M/s' column)
strandings_with_snapped['M/s'].value_counts()


In [None]:
# Keeping only mass strandings ('M' in the M/s column, compared case-insensitively)
is_mass = strandings_with_snapped['M/s'].str.upper() == 'M'
mass_strandings = strandings_with_snapped[is_mass]

# Number of records per (date, species, local authority) combination
event_sizes = mass_strandings.groupby(['Date', 'Species', 'Local Authority']).size()

# Turning the counts into a table, largest events first
mass_counts = event_sizes.reset_index(name='Count')
mass_counts = mass_counts.sort_values('Count', ascending=False)

mass_counts.head(10)  # preview top 10

In [None]:
# Number of records per 'M/s' value
ms_counts = strandings_with_snapped['M/s'].value_counts()

# Drawing the bar chart and keeping the axes for annotation
plt.figure(figsize=(10, 6))
ax = ms_counts.plot(kind='bar', color='C0')

# Labels
ax.set_xlabel('Mass Stranding and Single stranding events (M/s)')
ax.set_ylabel('Total Occurrences')
ax.set_title('Total Occurrences per Mass Stranding and Single stranding events in UK and Ireland')
plt.xticks(rotation=0)

# Writing each bar's count just above it, centred on the bar
for patch in ax.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    ax.text(
        bar_centre,
        bar_top + 1,
        f"{int(bar_top)}",
        ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Mass_original_numbers.png",
    dpi=150,
    bbox_inches="tight")
plt.show()


In [None]:
# Selecting the mass strandings and grouping them by snapped lat/lon
mass_strandings = strandings_with_snapped[strandings_with_snapped['M/s'] == 'M']
grouped = mass_strandings.groupby(['snapped_latitude', 'snapped_longitude'])

# Base map plus scale
m = folium.Map(location=[55.61, -2.85], zoom_start=5, control_scale=True)

# FeatureGroup so the layer shows up in the LayerControl
fg_mass = folium.FeatureGroup(name="Mass strandings (grouped by location)", show=True)

# One marker per location; its popup lists ID, M/s flag, species and local
# authority for every record that shares that location
for (lat, lon), group in grouped:
    popup_text = ''.join(
        f"<b>ID:</b> {row.get('ID','N/A')}<br>"
        f"<b>M/s:</b> {row.get('M/s','N/A')}<br>"
        f"<b>Species:</b> {row.get('Species','N/A')}<br>"
        f"<b>Local Authority:</b> {row.get('Local Authority','N/A')}<br><br>"
        for _, row in group.iterrows()
    )
    location_popup = folium.Popup(popup_text, max_width=320, min_width=160)

    folium.CircleMarker(
        location=[lat, lon],
        radius=2,
        color='purple', weight=0.8,
        fill=True, fill_color='purple', fill_opacity=0.7,
        popup=location_popup,
    ).add_to(fg_mass)

fg_mass.add_to(m)

# Title
title_html = """
<div style="
 position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,0.2);">
 Mass Strandings (grouped by snapped coordinates)
</div>
"""
m.get_root().html.add_child(folium.Element(title_html))

# Legend
legend_html = """
<div style="
 position: fixed; bottom: 60px; left: 30px; z-index:9999;
 background: white; padding: 10px; border: 1px solid #777; border-radius: 6px;
 box-shadow: 0 1px 3px rgba(0,0,0,.2); font-size: 14px;">
<b>Legend</b><br>
<span style="display:inline-block;width:12px;height:12px;background:purple;
       border:1px solid #555;border-radius:6px;margin-right:6px;"></span>
Mass stranding location<br>
<small>Each marker may represent multiple records at the same lat/lon; see popup.</small>
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Layer control
folium.LayerControl(collapsed=False).add_to(m)

# Displaying the map
m



In [None]:
# Saving the grouped mass-strandings map as an HTML file
m.save("Data_cleaning_images/Strandings_Cleaning_UK_Ireland_Single_Multiple_Mass_Map.html")

#### Link to multiple strandings [Netlify map](https://mass-strandings-multiple.netlify.app/)

#### Examining the map shows a few single records that are logged as mass; they seem to have strandings of the same species nearby on the same day, and sequential ID numbers.

In [None]:
# Collapsing each mass stranding event down to a single record

# Flagging the records logged as mass strandings
mass_mask = strandings_with_snapped['M/s'] == 'M'

# Columns used to decide that two mass records belong to the same event.
# NOTE(review): only Species and Date are compared, so same-day, same-species
# mass records are merged even when they are in different places - confirm intended.
cols_to_check = ['Species', 'Date']

# Mass records before deduplication
initial_mass_count = mass_mask.sum()

# First record of every (Species, Date) combination among the mass records
unique_mass_strandings = (
    strandings_with_snapped.loc[mass_mask]
    .drop_duplicates(subset=cols_to_check)
)
unique_mass_count = len(unique_mass_strandings)

# How many mass records were removed
removed_mass_count = initial_mass_count - unique_mass_count

# Single strandings plus the deduplicated mass strandings, put back into the
# original row order and then renumbered
single_strandings = strandings_with_snapped.loc[~mass_mask]
strandings_with_snapped = (
    pd.concat([single_strandings, unique_mass_strandings])
    .sort_index()
    .reset_index(drop=True)
)

# Breakdown to assess whether the correct number were removed
print(f"Initial mass strandings: {initial_mass_count}")
print(f"Removed duplicate mass strandings: {removed_mass_count}")
print(f"Remaining unique mass strandings: {unique_mass_count}")


In [None]:
# Same bar chart as before, now drawn after the mass strandings were deduplicated

# Number of records per 'M/s' value
ms_counts = strandings_with_snapped['M/s'].value_counts()

# Drawing the bar chart and keeping the axes for annotation
plt.figure(figsize=(10, 6))
ax = ms_counts.plot(kind='bar', color='C0')

# Labels
ax.set_xlabel('Mass Stranding and Single stranding events (M/s)')
ax.set_ylabel('Total Occurrences')
ax.set_title('Total Occurrences per Mass Stranding and Single stranding events in UK and Ireland')
plt.xticks(rotation=0)

# Writing each bar's count just above it, centred on the bar
for patch in ax.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    ax.text(
        bar_centre,
        bar_top + 1,
        f"{int(bar_top)}",
        ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Mass_unique_Numbers.png",
    dpi=150,
    bbox_inches="tight")
plt.show()

In [None]:
# Checking for any missing-data hiccups after the deduplication
strandings_with_snapped.info()

In [None]:
# Same map as before, now drawn after the mass strandings were deduplicated

# Selecting the mass strandings and grouping them by snapped lat/lon
mass_strandings = strandings_with_snapped[strandings_with_snapped['M/s'] == 'M']
grouped = mass_strandings.groupby(['snapped_latitude', 'snapped_longitude'])

# Creating the map
m = folium.Map(location=[55.61, -2.85], zoom_start=5)

# One marker per location; its popup lists ID, M/s flag, species and local
# authority for every record that shares that location
for (lat, lon), group in grouped:
    popup_text = ''.join(
        f"<b>ID:</b> {row['ID']}<br>"
        f"<b>M/s:</b> {row['M/s']}<br>"
        f"<b>Species:</b> {row['Species']}<br>"
        f"<b>Local Authority:</b> {row['Local Authority']}<br><br>"
        for _, row in group.iterrows()
    )
    location_popup = folium.Popup(popup_text, max_width=300, min_width=150)

    folium.CircleMarker(
        location=[lat, lon],
        radius=2,
        color='purple',
        fill=True,
        fill_color='purple',
        fill_opacity=0.7,
        popup=location_popup
    ).add_to(m)

# Title
title_html = """
<div style="
 position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,0.2);">
Unique Mass Strandings
</div>
"""
m.get_root().html.add_child(folium.Element(title_html))

# Legend
legend_html = """
<div style="
 position: fixed; bottom: 60px; left: 30px; z-index:9999;
 background: white; padding: 10px; border: 1px solid #777; border-radius: 6px;
 box-shadow: 0 1px 3px rgba(0,0,0,.2); font-size: 14px;">
<b>Legend</b><br>
<span style="display:inline-block;width:12px;height:12px;background:purple;
       border:1px solid #555;border-radius:6px;margin-right:6px;"></span>
Mass stranding location<br>
<small>Each marker may represent multiple records at the same lat/lon; see popup.</small>
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Displaying the map
m


In [None]:
# Saving the deduplicated mass-strandings map as an HTML file
m.save("Data_cleaning_images/Strandings_Cleaning_UK_Ireland_Single_Multiple_Unique_Map.html")

#### Link to multiple strandings [Netlify map](https://mass-strandings-multiple.netlify.app/) !!!NEEDS UPDATING!!!

--------------------------------

# Strandings Explorative Data Analysis

#### Cleaning the dataframe

#### Cleaning species name

In [None]:
# Looking at the list of unique species names as recorded in the raw data
unique_species = strandings_with_snapped['Species'].unique()
print(unique_species)

#### There is a lot of duplication in the species names that needs cleaning up

In [None]:
# Lookup from each raw species name (as spelled in the source data) to the
# cleaned label used from here on. Several raw spellings share one cleaned label.
#
# Corrections relative to the earlier version of this table:
#   * 'Bottlenose whale' now maps to 'Northern Bottlenose Whale'
#     (it was grouped with the bottlenose dolphins).
#   * 'White-sided dolphin' now maps to 'Atlantic White-Sided Dolphin'
#     (it was grouped with the white-beaked dolphins).
#
# NOTE(review): the cleaned labels 'White-Beaked Whale' (used for white-beaked
# dolphins), 'Narwal' and 'Frasers Dolphin' are left unchanged because other
# notebooks may rely on these exact strings - consider renaming them everywhere.
species_mapping = {
    # Unidentified whale
    'large whale species': 'Unidentified whale',
    'medium whale species': 'Unidentified whale',
    'beaked whale species': 'Unidentified whale',
    'sei fin or blue whale': 'Unidentified whale',
    'whale species': 'Unidentified whale',
    'Unknown odontocete': 'Unidentified whale',
    'Unknown balaenopterid': 'Unidentified whale',
    'pilot/false killer whale': 'Unidentified whale',
    'Unknown mysticete': 'Unidentified whale',
    'Unknown ziphiid': 'Unidentified whale',
    'Odontocete (indeterminate species)': 'Unidentified whale',
    'Baleen whale (indeterminate species)': 'Unidentified whale',
    'Mysticete (indeterminate species)': 'Unidentified whale',
    'Beaked whale (indeterminate species)': 'Unidentified whale',

    # Unidentified dolphin
    'common or striped dolphin': 'Unidentified dolphin',
    'dolphin species': 'Unidentified dolphin',
    'dolphin species possibly harbour porpoise': 'Unidentified dolphin',
    'lagenorhynchus species': 'Unidentified dolphin',
    'Unknown delphinid': 'Unidentified dolphin',
    'Short-beaked common dolphin/striped dolphin (indeterminate species)': 'Unidentified dolphin',
    'Dolphin (indeterminate species)': 'Unidentified dolphin',
    'Lagenorhynchus sp. (indeterminate species)': 'Unidentified dolphin',

    # Unidentified species
    'cetacean species': 'Unidentified species',
    'NA': 'Unidentified species',
    'Cetacean (indeterminate species)': 'Unidentified species',

    # Pilot Whale
    'pilot whale': 'Pilot Whale',
    'Long-finned pilot whale': 'Pilot Whale',
    'Short-finned pilot whale': 'Pilot Whale',

    # Common Dolphin
    'common dolphin': 'Common Dolphin',
    'Common dolphin': 'Common Dolphin',
    'Short-beaked common dolphin': 'Common Dolphin',

    # Harbour Porpoise
    'harbour porpoise': 'Harbour Porpoise',
    'Harbour porpoise': 'Harbour Porpoise',

    # Striped Dolphin
    'striped dolphin': 'Striped Dolphin',
    'Striped dolphin': 'Striped Dolphin',

    # Minke Whale
    'minke whale': 'Minke Whale',
    'Minke whale': 'Minke Whale',

    # Bottlenose Dolphin
    'bottlenose dolphin': 'Bottlenose Dolphin',
    'Bottlenose dolphin': 'Bottlenose Dolphin',

    # Risso’s Dolphin
    "Risso's dolphin": "Risso’s Dolphin",

    # Humpback Whale
    'humpback whale': 'Humpback Whale',
    'Humpback whale': 'Humpback Whale',

    # True’s Beaked Whale
    "True's beaked whale": "True’s Beaked Whale",
    "True's Beaked whale": "True’s Beaked Whale",

    # Sperm Whale
    'sperm whale': 'Sperm Whale',
    'Sperm whale': 'Sperm Whale',

    # Cuvier’s Beaked Whale
    "Cuvier's beaked whale": "Cuvier’s Beaked Whale",
    "Cuvier's Beaked whale": "Cuvier’s Beaked Whale",

    # Fin Whale
    'fin whale': 'Fin Whale',
    'Fin whale': 'Fin Whale',

    # Northern Bottlenose Whale
    'northern bottlenose whale': 'Northern Bottlenose Whale',
    'Northern bottlenose whale': 'Northern Bottlenose Whale',
    'Bottlenose whale': 'Northern Bottlenose Whale',

    # White-beaked dolphin (cleaned label kept as 'White-Beaked Whale', see note above)
    'white-beaked dolphin': 'White-Beaked Whale',
    'White-beaked dolphin': 'White-Beaked Whale',

    # Sowerby’s Beaked Whale
    "Sowerby's beaked whale": "Sowerby’s Beaked Whale",
    "Sowerby's Beaked whale": "Sowerby’s Beaked Whale",

    # Sei Whale
    'sei whale': 'Sei Whale',
    'Sei whale': 'Sei Whale',

    # Pygmy Sperm Whale
    'pygmy sperm whale': 'Pygmy Sperm Whale',
    'Pygmy Sperm whale': 'Pygmy Sperm Whale',
    'Pygmy sperm whale': 'Pygmy Sperm Whale',

    # Atlantic White-Sided Dolphin
    'Atlantic white-sided dolphin': 'Atlantic White-Sided Dolphin',
    'White-sided dolphin': 'Atlantic White-Sided Dolphin',

    # Killer Whale
    'killer whale': 'Killer Whale',
    'Killer whale': 'Killer Whale',

    # Gervais Beaked Whale
    'Gervais beaked whale': 'Gervais Beaked Whale',
    "Gervais' Beaked whale": 'Gervais Beaked Whale',

    # False Killer Whale
    'false killer whale': 'False Killer Whale',
    'False killer whale': 'False Killer Whale',

    # Narwal
    'Narwal': 'Narwal',

    # Blue Whale
    'Blue whale': 'Blue Whale',

    # Beluga Whale
    'Beluga whale': 'Beluga Whale',

    # Melon-Headed Whale
    'Melon-headed whale': 'Melon-Headed Whale',

    # Blainville’s Beaked Whale
    "Blainville's beaked whale": "Blainville’s Beaked Whale",

    # Frasers Dolphin
    "Fraser's dolphin": "Frasers Dolphin",

    # Dwarf Sperm Whale
    'Dwarf sperm whale': 'Dwarf Sperm Whale'}


In [None]:
# Replacing species with the cleaned names.
# Series.replace only substitutes values that appear as keys in species_mapping;
# any other value is carried over to 'Species_cleaned' unchanged.
strandings_with_snapped['Species_cleaned'] = strandings_with_snapped['Species'].replace(species_mapping)

In [None]:
# Checking all the species were mapped correctly.
# 'Species_cleaned' is built with Series.replace, which keeps any name it has
# no mapping for, so an unmapped name is never NaN there and an isna() test
# cannot find it. Instead, flag every raw name that is neither a key of
# species_mapping nor already one of its cleaned output names (rows with a
# missing species are flagged as well).
recognised_names = set(species_mapping) | set(species_mapping.values())
is_unmapped = ~strandings_with_snapped['Species'].isin(recognised_names)
remaining_species = strandings_with_snapped.loc[is_unmapped, 'Species'].unique()

print("Unmapped species:", remaining_species)

In [None]:
# Swap the raw 'Species' column for the cleaned one: drop the original,
# then give 'Species_cleaned' the 'Species' name.
strandings_with_snapped = (
    strandings_with_snapped
    .drop('Species', axis=1)
    .rename(columns={'Species_cleaned': 'Species'})
)

In [None]:
# Listing the distinct species labels that remain after the rename
species_column = strandings_with_snapped['Species']
unique_species = species_column.unique()
print(unique_species)

#### Separating the date column and removing the time

In [None]:
# Preview the first rows before working on the Date column
strandings_with_snapped.head()

In [None]:
# Work on the dates as text so the string methods below apply to every row
date_text = strandings_with_snapped["Date"].astype(str)
strandings_with_snapped["Date"] = date_text

# Strip a trailing all-zero time such as " 0:00" or " 00:00"
strandings_with_snapped["Date"] = date_text.str.replace(r"\s0*:0*0$", "", regex=True)

In [None]:
# Parse the day/month/year strings; anything that does not match becomes NaT
strandings_with_snapped['Date_parsed'] = pd.to_datetime(
    strandings_with_snapped['Date'],
    format='%d/%m/%Y',
    errors='coerce',
)

# Split the parsed date into nullable-integer Year / Month / Day columns
date_accessor = strandings_with_snapped['Date_parsed'].dt
for column_name, part in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    strandings_with_snapped[column_name] = getattr(date_accessor, part).astype('Int64')


In [None]:
# Checking nulls: info() prints the non-null count for every column
strandings_with_snapped.info()

#### Why did I lose some records?

In [None]:
# Looking at the Date column for the rows that could not be parsed
bad = strandings_with_snapped.loc[strandings_with_snapped['Date_parsed'].isna(), 'Date']

# Show up to 20 random examples. The sample size is capped at len(bad) because
# Series.sample raises when asked for more rows than exist, and the result is
# printed because only the last expression of a cell is displayed automatically.
print(bad.sample(min(20, len(bad)), random_state=0))

# Most frequent unparsed values
bad.value_counts().head(20)


#### This shouldn't affect anything going forward as I am only using date for basic EDA and not in the analysis, so I'm going to leave them in favour of keeping the lat long data.

In [None]:
# Checking split dates match the original dates by eyeballing 5 random rows
strandings_with_snapped.sample(5)

In [None]:
# Renaming snapped_latitude and snapped_longitude to latitude and longitude
snapped_to_plain = {
    'snapped_latitude': 'latitude',
    'snapped_longitude': 'longitude',
}
strandings_with_snapped = strandings_with_snapped.rename(columns=snapped_to_plain)

------------------------------

## Broken down by year

In [None]:
# Number of records per year, in chronological order
yearly_counts = strandings_with_snapped['Year'].value_counts().sort_index()

# Bar chart of the yearly totals
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(yearly_counts.index, yearly_counts.values)
ax.set_xlabel('Year')
ax.set_ylabel('Total Occurrences')
ax.set_title('Total Occurrences Each Year in UK and Ireland')
plt.xticks(rotation=90)
fig.tight_layout()
fig.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Yearly_Strandings.png",
    dpi=150,
    bbox_inches="tight",
)
plt.show()

------------------------------

## Broken down by month

In [None]:
# Number of records per month (1-12), in calendar order
monthly_counts = strandings_with_snapped['Month'].value_counts().sort_index()

# Tick labels that replace the month numbers on the x axis
month_tick_labels = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Bar chart in the default blue ('C0', #1f77b4) used by the yearly chart
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(monthly_counts.index, monthly_counts.values, color='C0', width=0.8)
ax.set_xlabel('Month')
ax.set_ylabel('Total Occurrences')
ax.set_title('Total Occurrences Each Month in UK and Ireland')
plt.xticks(range(1, 13), labels=month_tick_labels)

# Write each bar's count just above its top edge
for bar in bars:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_top + 1, str(int(bar_top)),
            ha='center', va='bottom', fontsize=10)

fig.tight_layout()
fig.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Monthly_Barchart.png",
    dpi=150,
    bbox_inches="tight",
)
plt.show()


In [None]:
# Number of records per month, in calendar order
species_month = strandings_with_snapped['Month'].value_counts().sort_index()

# Short month names ("Jan", "Feb", ...) in the same order as the counts
month_names = [calendar.month_abbr[month_number] for month_number in species_month.index]


def autopct_func(pct):
    """Return the slice percentage to one decimal place, or '' when it is 2% or less."""
    if pct > 2:
        return f'{pct:.1f}%'
    return ''


# Pie chart without slice labels; the legend carries the month names instead
fig, ax = plt.subplots(figsize=(10, 10))
wedges, texts, autotexts = ax.pie(
    species_month,
    labels=None,
    autopct=autopct_func,
    startangle=140,
    wedgeprops={'edgecolor': 'black'},
)

# Legend placed to the right of the pie
ax.legend(
    wedges,
    month_names,
    title='Month',
    bbox_to_anchor=(1, 0.5),
    loc='center left',
    fontsize='small',
)

ax.set_title('Proportion of Strandings by Month')
ax.axis('equal')  # keep the pie circular
fig.tight_layout()
fig.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Month_Piechart.png",
    dpi=150,
    bbox_inches="tight",
)
plt.show()


In [None]:
# Figure with one panel pair per species of interest: a monthly bar chart on
# top and a map of that species' stranding locations underneath.

# Silence any warning whose message mentions tight_layout
warnings.filterwarnings("ignore", message=".*tight_layout.*")

# ---- inputs ----
# Species to plot, spelled exactly as they appear in the cleaned 'Species' column
species_of_interest = [
    "Pilot Whale", "Common Dolphin", "Harbour Porpoise", "Striped Dolphin",
    "Minke Whale", "Bottlenose Dolphin", "Risso’s Dolphin", "Sperm Whale",
    "Cuvier’s Beaked Whale", "Northern Bottlenose Whale", "White-Beaked Whale",
    "Atlantic White-Sided Dolphin"
]

# Geo data: coastline and strandings are both put into EPSG:3857 so they overlay
coastline = gpd.read_file("../merged_outline.gpkg").to_crs(3857)
gdf = strandings_with_snapped.copy()
# Trim surrounding whitespace so the names match the list above exactly
gdf['Species'] = gdf['Species'].astype(str).str.strip()
gdf = gdf[gdf['Species'].isin(species_of_interest)].copy()
# Build point geometry from the longitude/latitude columns (declared as
# EPSG:4326), then reproject to match the coastline
gdf = gpd.GeoDataFrame(
    gdf,
    geometry=gpd.points_from_xy(gdf['longitude'], gdf['latitude']),  # x = longitude, y = latitude
    crs="EPSG:4326"
).to_crs(3857)

# helper: monthly counts with zeros for all 12 months
all_months = pd.Index(range(1, 13), name='Month')
month_names = [calendar.month_abbr[m] for m in all_months]
def monthly_counts_for(spp):
    """Return the number of rows per month (1-12) for species `spp`, with 0 for months that have none."""
    return gdf.loc[gdf['Species'] == spp, 'Month'].value_counts().reindex(all_months, fill_value=0)

# Per-species monthly counts, and the per-species total shown in each panel title
counts_dict = {s: monthly_counts_for(s) for s in species_of_interest}
totals = {s: int(c.sum()) for s, c in counts_dict.items()}

# grid sizing: every species occupies two stacked grid rows (bar chart, then map)
cols = 4
rows = math.ceil(len(species_of_interest) / cols)
fig = plt.figure(figsize=(cols*3.4, rows*5.0))
gs = gridspec.GridSpec(rows*2, cols, figure=fig, hspace=0.35, wspace=0.25)

# keep map extent consistent across panels
xmin, ymin, xmax, ymax = coastline.total_bounds

for i, spp in enumerate(species_of_interest):
    # r = row of species panels, c = column; the bar chart goes in grid row 2*r
    # and the map in grid row 2*r + 1
    r, c = divmod(i, cols)

    # ---------- top: bar chart ----------
    ax_bar = fig.add_subplot(gs[2*r, c])
    counts = counts_dict[spp]
    x = np.arange(1, 13)
    bars = ax_bar.bar(x, counts.values, color='steelblue', width=0.8)

    # NOTE: `m` holds the tallest bar here; the same name is reused for the
    # folium maps in other cells
    m = counts.max()
    # Only label bars that reach at least 5, or 20% of the tallest bar if that is larger
    thresh = max(5, 0.2*m)  # annotate only meaningful bars
    for b in bars:
        h = b.get_height()
        if h >= thresh:
            ax_bar.text(b.get_x()+b.get_width()/2, h + 0.03*max(1, m), str(int(h)),
                        ha='center', va='bottom', fontsize=8)

    ax_bar.set_title(f"{spp}  (n={totals[spp]})", fontsize=10)
    ax_bar.set_xticks(x, month_names, fontsize=6)
    ax_bar.tick_params(axis='y', labelsize=8)
    # 25% headroom above the tallest bar leaves space for the count labels
    ax_bar.set_ylim(0, m*1.25 if m > 0 else 1)
    ax_bar.grid(axis='y', linewidth=0.4, alpha=0.4)
    ax_bar.spines['top'].set_visible(False)
    ax_bar.spines['right'].set_visible(False)

    # ---------- bottom: map ----------
    ax_map = fig.add_subplot(gs[2*r+1, c])
    coastline.plot(ax=ax_map, color='lightgrey', edgecolor='black', linewidth=0.5)
    spp_pts = gdf[gdf['Species'] == spp]
    # Skip the point layer when this species has no rows
    if not spp_pts.empty:
        spp_pts.plot(ax=ax_map, color='red', markersize=4, alpha=0.9)

    ax_map.set_xlim(xmin, xmax)
    ax_map.set_ylim(ymin, ymax)
    ax_map.axis('off')

# hide any unused cells (if species count not multiple of cols)
for j in range(i+1, rows*cols):
    r, c = divmod(j, cols)
    fig.add_subplot(gs[2*r, c]).axis('off')
    fig.add_subplot(gs[2*r+1, c]).axis('off')

fig.suptitle("Monthly strandings (top) and locations (bottom) by species", y=0.995, fontsize=12)
fig.tight_layout()
fig.savefig("Data_cleaning_images/Strandings_Cleaning_Species_Monthly_Barcharts_Plus_Maps.png", dpi=200, bbox_inches="tight")
plt.show()


In [None]:
# Interactive map of the strandings with one toggleable layer per month.

# Creating clean months by name
df = strandings_with_snapped.copy()
df["MonthNum"] = pd.to_numeric(df["Month"], errors="coerce").astype("Int64")
# Keep only rows with a valid month. The explicit .copy() makes the filtered
# frame independent, so adding MonthName below does not trigger pandas'
# SettingWithCopyWarning.
df = df[df["MonthNum"].between(1, 12)].copy()
df["MonthName"] = df["MonthNum"].map(lambda i: calendar.month_name[int(i)])

# Months in the correct order (only those present in the data)
month_names = [calendar.month_name[i] for i in range(1, 13) if (df["MonthNum"] == i).any()]

# Base map plus scale bar
m = folium.Map(location=[55.61, -2.85], zoom_start=5, control_scale=True)

# Colors per month
cmap = plt.colormaps["tab20"].resampled(len(month_names))
month_colors = {name: mcolors.to_hex(cmap(i)) for i, name in enumerate(month_names)}

# Plotting each month as its own layer
for name in month_names:
    fg = folium.FeatureGroup(name=name, show=True)
    subset = df[df["MonthName"] == name]
    month_color = month_colors[name]
    # Walk the two coordinate columns directly instead of building a Series
    # per row with iterrows()
    for lat, lon in zip(subset["latitude"], subset["longitude"]):
        folium.CircleMarker(
            location=[lat, lon],
            radius=2,
            color=month_color, weight=0.8,
            fill=True, fill_color=month_color, fill_opacity=0.6,
            popup=folium.Popup(f"Month: {name}", max_width=200),
        ).add_to(fg)
    fg.add_to(m)

# Title
m.get_root().html.add_child(folium.Element("""
<div style="position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,.2);">
 Strandings by Month (snapped points)
</div>
"""))

# Legend: one coloured swatch per month
legend_items = "".join(
    f'<div><span style="display:inline-block;width:12px;height:12px;'
    f'background:{month_colors[name]};border:1px solid #555;margin-right:6px;"></span>{name}</div>'
    for name in month_names
)
m.get_root().html.add_child(folium.Element(f"""
<div style="position: fixed; bottom: 60px; left: 30px; z-index:9999;
 background: white; padding: 10px; border: 1px solid #777; border-radius: 6px;
 box-shadow: 0 1px 3px rgba(0,0,0,.2); font-size: 14px; max-height: 240px; overflow:auto;">
<b>Legend — Month</b>{legend_items}
</div>
"""))


folium.LayerControl(collapsed=False).add_to(m)
m




In [None]:
# Write the monthly map to an HTML file
m.save("Data_cleaning_images/Strandings_Cleaning_UK_Ireland_By_Month.html")

#### Link to monthly strandings [Netlify map](https://uk-ireland-by-month.netlify.app/)

#### Species

In [None]:
# Number of records per species, most frequent first
species_counts = strandings_with_snapped['Species'].value_counts()

# Bar chart with one bar per species
fig, ax = plt.subplots(figsize=(15, 8))
bars = ax.bar(species_counts.index, species_counts.values, width=0.8, color='steelblue')

# Axis labels and title
ax.set_xlabel('Species')
ax.set_ylabel('Total Count')
ax.set_title('Total Count per Species in UK and Ireland')

# Species names are long, so turn the tick labels vertical
plt.xticks(rotation=90)

fig.tight_layout()

# Save and show
fig.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Species_Barchart_Annotated.png",
    dpi=150,
    bbox_inches="tight",
)
plt.show()


In [None]:
# Number of records per species, most frequent first
species_counts = strandings_with_snapped['Species'].value_counts()

# Keep the 12 most frequent species and pool everything else into "Other"
top_n = 12
top_counts = species_counts.iloc[:top_n]
other_count = species_counts.iloc[top_n:].sum()

sizes = top_counts.tolist()
names = top_counts.index.tolist()
if other_count > 0:
    sizes = sizes + [other_count]
    names = names + ["Other"]

# Share of each slice, written into the legend (guard against an empty total)
total = sum(sizes) if sum(sizes) > 0 else 1
pcts = [100 * size / total for size in sizes]
legend_labels = [f"{label} — {share:.1f}%" for label, share in zip(names, pcts)]

# Pie without text on the slices; percentages live in the legend only
fig, ax = plt.subplots(figsize=(10, 10))
pie_kwargs = {
    'labels': None,
    'startangle': 140,
    'wedgeprops': {'edgecolor': 'black'},
}
wedges, _ = ax.pie(sizes, **pie_kwargs)

# Legend to the right of the pie, one entry per slice
ax.legend(
    wedges,
    legend_labels,
    title="Species",
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    fontsize='large',
)

ax.set_title("Proportion of Strandings by Species")
ax.axis('equal')
fig.tight_layout()

fig.savefig(
    "Data_cleaning_images/Strandings_Cleaning_Species_Pie_Top12_Other.png",
    dpi=150,
    bbox_inches="tight",
)
plt.show()


In [None]:
# Interactive map of the strandings with one toggleable layer per species.

# Clean species names
df = strandings_with_snapped.copy()
df["SpeciesName"] = (
    df["Species"]
    .astype(str)
    .str.strip()
    .replace({"": pd.NA})
    # astype(str) turns a missing species into the literal text "nan", which
    # would survive dropna() and appear as its own layer and legend entry;
    # blank those rows out again so they are excluded.
    .where(df["Species"].notna()))

# Unique species, alphabetical
species_names = sorted(df["SpeciesName"].dropna().unique().tolist())

# Base map + scale bar
m = folium.Map(location=[55.61, -2.85], zoom_start=5, control_scale=True)

# Colors per species
cmap = plt.colormaps["tab20"].resampled(len(species_names))
species_colors = {name: mcolors.to_hex(cmap(i)) for i, name in enumerate(species_names)}

# Plotting each species as its own layer
for name in species_names:
    fg = folium.FeatureGroup(name=name, show=True)
    subset = df[df["SpeciesName"] == name]
    species_color = species_colors[name]
    # Walk the two coordinate columns directly instead of building a Series
    # per row with iterrows()
    for lat, lon in zip(subset["latitude"], subset["longitude"]):
        folium.CircleMarker(
            location=[lat, lon],
            radius=4,
            color=species_color, weight=0.8,
            fill=True, fill_color=species_color, fill_opacity=0.6,
            popup=folium.Popup(f"Species: {name}", max_width=220),
        ).add_to(fg)
    fg.add_to(m)

# Title
m.get_root().html.add_child(folium.Element("""
<div style="position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,.2);">
 Strandings by Species (snapped points)
</div>
"""))

# Legend: one coloured swatch per species
legend_items = "".join(
    f'<div><span style="display:inline-block;width:12px;height:12px;'
    f'background:{species_colors[name]};border:1px solid #555;margin-right:6px;"></span>{name}</div>'
    for name in species_names)

m.get_root().html.add_child(folium.Element(f"""
<div style="position: fixed; bottom: 60px; left: 30px; z-index:9999;
 background: white; padding: 10px; border: 1px solid #777; border-radius: 6px;
 box-shadow: 0 1px 3px rgba(0,0,0,.2); font-size: 14px; max-height: 260px; overflow:auto;">
<b>Legend — Species</b>
{legend_items}
</div>
"""))

# Layer control
folium.LayerControl(collapsed=False).add_to(m)

m


In [None]:
# Write the species map to an HTML file
m.save("Data_cleaning_images/Strandings_Cleaning_Species_Map.html")

#### Link to species strandings [Netlify map](https://uk-ireland-species.netlify.app/)

In [None]:
# Coastline and strandings, both in Web Mercator (EPSG:3857) so they overlay
coastline = gpd.read_file("../merged_outline.gpkg").to_crs(epsg=3857)
gdf = strandings_with_snapped.to_crs(epsg=3857)

# One small map per species, laid out four to a row
species_list = gdf['Species'].dropna().unique()
n_species = len(species_list)
cols = 4
rows = math.ceil(n_species / cols)
fig, axes = plt.subplots(rows, cols, figsize=(cols * 4, rows * 4))
axes = axes.flatten()

# Draw each species on its own axes
for ax, species in zip(axes, species_list):
    subset = gdf[gdf['Species'] == species]

    # Coastline underneath, strandings on top
    coastline.plot(ax=ax, color='lightgrey', edgecolor='black')
    subset.plot(ax=ax, color='purple', markersize=10)

    # Title shows the species name and how many records it has
    ax.set_title(f"{species} ({len(subset)})")
    ax.axis('off')

# Blank out the grid cells left over after the last species
for spare_ax in axes[n_species:]:
    spare_ax.axis('off')

plt.tight_layout()
plt.savefig("Data_cleaning_images/Strandings_Cleaning_Species_Maps.png", dpi=150, bbox_inches="tight")
plt.show()




#### Cleaning columns

In [None]:
# Renaming snapped_latitude and snapped_longitude to latitude and longitude
coordinate_renames = {
    'snapped_latitude': 'latitude',
    'snapped_longitude': 'longitude',
}
strandings_with_snapped = strandings_with_snapped.rename(columns=coordinate_renames)

# Checking the changes: column summary first, then the first rows
strandings_with_snapped.info()
strandings_with_snapped.head()

In [None]:
# Take an independent copy of the cleaned data under its final name
clean_strandings = strandings_with_snapped.copy()

# Write it out as CSV (without the index column)
clean_csv_path = '../clean_strandings.csv'
clean_strandings.to_csv(clean_csv_path, index=False)

# Show a download link to the file just written
FileLink(clean_csv_path)

-------------

# Creating Random Points

In [None]:
# Collect the names of the layers stored in the coastline GeoPackage
layers = fiona.listlayers("../merged_outline.gpkg")


In [None]:
# Generate N random points along the coastline boundary, where N is the
# number of cleaned strandings, and store them in random_points_gdf (EPSG:4326).
from shapely.ops import unary_union

warnings.filterwarnings(
    "ignore",
    message=".*unary_union.*deprecated.*")


def _allocate_counts(total, weights):
    """Split `total` into whole-number shares proportional to `weights`.

    Uses the largest-remainder method: every share is first rounded down, then
    the points still missing are handed out one each to the entries with the
    largest fractional parts. The result therefore always sums to exactly
    `total` and never contains a negative count. (Rounding each share and
    pushing the drift onto the last entry can make that entry negative, so the
    overall number of points ends up different from `total`.)

    Parameters:
        total: number of items to distribute (non-negative integer).
        weights: sequence of non-negative weights with a positive sum.

    Returns:
        Integer numpy array, same length as `weights`, summing to `total`.

    Raises:
        ValueError: if the weights do not have a positive sum.
    """
    weights = np.asarray(weights, dtype=float)
    weight_sum = weights.sum()
    if not weight_sum > 0:
        raise ValueError("weights must have a positive sum")

    exact_shares = total * weights / weight_sum
    counts = np.floor(exact_shares).astype(int)
    shortfall = int(total - counts.sum())
    if shortfall > 0:
        by_remainder = np.argsort(exact_shares - counts, kind="stable")[::-1]
        counts[by_remainder[:shortfall]] += 1
    return counts


# Set a seed for reproducibility!
np.random.seed(42)

# Using the coastline geopackage
gadm = gpd.read_file(
    "../merged_outline.gpkg",
#    layer="ADM_ADM_0"
).to_crs('EPSG:4326')

# Getting all boundaries merged into a single geometry. shapely's unary_union
# function is used rather than the deprecated GeoSeries.unary_union attribute.
all_boundaries = unary_union(list(gadm.geometry.boundary))

# Preparing a list to collect all the points
all_points = []

# Setting the number of points to match the number of clean strandings
N = len(clean_strandings)

# NOTE(review): lengths and distances below are measured in degrees because
# the data is in EPSG:4326, so "proportional to length" is in degree space,
# not metres. Consider sampling in a projected CRS if that matters - confirm.

# Distribute points along the boundary in proportion to each line's length
if isinstance(all_boundaries, MultiLineString):
    lengths = np.array([line.length for line in all_boundaries.geoms])
    n_per = _allocate_counts(N, lengths)

    for line, n in zip(all_boundaries.geoms, n_per):
        if n < 1:
            continue
        distances = np.sort(np.random.uniform(0, line.length, n))
        pts = [line.interpolate(distance) for distance in distances]
        all_points.extend(pts)
else:
    # Single geometry: draw all N positions along its full length
    line = all_boundaries
    distances = np.sort(np.random.uniform(0, line.length, N))
    all_points = [line.interpolate(distance) for distance in distances]

random_points_gdf = gpd.GeoDataFrame(geometry=all_points, crs="EPSG:4326")

In [None]:
# Visualising the random points on an interactive map
m = folium.Map(location=[55.61, -2.85], zoom_start=5)

# Every point is drawn as the same small filled red circle
marker_style = dict(radius=2, color='red', fill=True, fill_color='red')
for point in random_points_gdf.geometry:
    # folium expects [latitude, longitude], i.e. [y, x]
    folium.CircleMarker(location=[point.y, point.x], **marker_style).add_to(m)

# Title banner fixed at the top of the map
title_html = """
<div style="position: fixed; top: 10px; left: 50%; transform: translateX(-50%);
 z-index: 9999; background: white; padding: 6px 10px; border: 1px solid #777;
 border-radius: 6px; font-weight: 600; box-shadow: 0 1px 3px rgba(0,0,0,.2);">
 Random Points
</div>
"""
m.get_root().html.add_child(folium.Element(title_html))

m

In [None]:
# Write the random-points map to an HTML file
m.save("Data_cleaning_images/Strandings_Cleaning_Random_Points.html")

#### Link to random points [Netlify map](https://uk-random.netlify.app/)

In [None]:
# Column summary followed by the first rows of the random points
random_points_gdf.info()
random_points_gdf.head()

In [None]:
# Print the EPSG code of the random points' coordinate reference system
print("EPSG:", random_points_gdf.crs.to_epsg())


In [None]:
# Take an independent copy of the random points under their final name
random_strandings = random_points_gdf.copy()

# Write them out as CSV (without the index column)
random_csv_path = '../random_strandings.csv'
random_strandings.to_csv(random_csv_path, index=False)

# Show a download link to the file just written
FileLink(random_csv_path)