The dataset must first be 'cleaned up' to only include elements of interest. But first import the necessary packages:

In [None]:
import pandas as pd
import geopandas as gpd
import folium
import ipyleaflet

Open the data. Note that the file is not UTF-8 encoded, so it is read with the cp1252 encoding and any non-breaking spaces are normalised after loading.

In [None]:
# The file is not UTF-8; decode it as cp1252 (or ISO-8859-1) to avoid a Unicode error
repd_data = pd.read_csv('repd-q4-jan-2025.csv', encoding='cp1252')


def _replace_nbsp(value):
    """Return `value` with non-breaking spaces replaced by plain spaces.

    Values that are not strings are returned unchanged.
    """
    if isinstance(value, str):
        return value.replace('\xa0', ' ')
    return value


# Normalise the text of every cell in the table (column headers are not touched)
repd_data = repd_data.map(_replace_nbsp)

repd_data.head()  # show raw data

Filter the data to include only onshore wind projects (rows whose 'Technology Type' is 'Wind Onshore').

Filter out unnecessary data columns (e.g. those related to other technologies, reference numbers used only by the REPD data, etc.).

In [None]:
# Keep only the rows whose technology type is onshore wind
is_onshore_wind = repd_data['Technology Type'] == 'Wind Onshore'
repd_data = repd_data[is_onshore_wind]

# Columns that are not needed further on (other technologies, REPD-internal references, ...).
# NOTE(review): 'Are they re-applying (Old REPD Ref) ' carries a trailing space —
# presumably it matches the CSV header exactly; verify before changing it.
columns_to_drop = [
    'Old Ref ID',
    'Ref ID',
    'Technology Type',
    'Storage Type',
    'CHP Enabled',
    'Storage Co-location REPD Ref ID',
    'Share Community Scheme',
    'CfD Allocation Round',
    'RO Banding (ROC/MWh)',
    'CfD Capacity (MW)',
    'Mounting Type for Solar',
    'Are they re-applying (New REPD Ref)',
    'Are they re-applying (Old REPD Ref) ',
    'Development Status',
    'Offshore Wind Round',
    'Heat Network Ref',
    'Solar Site Area (sqm)',
]

# drop() raises a KeyError if any of the listed columns is missing
repd_wind = repd_data.drop(columns=columns_to_drop)
repd_wind.head()  # check if successful

In [None]:
# Report every text column that contains a literal backslash followed by a digit
# (e.g. "\1"), since such sequences are treated as suspicious here.
for col in repd_wind.columns:
    column_values = repd_wind[col]
    if column_values.dtype != object:
        continue  # only object (string) columns can be searched with .str

    # The regex matches one literal backslash immediately followed by a digit 0-9
    has_escape_sequence = column_values.str.contains(r'\\[0-9]').any()
    if has_escape_sequence:
        print(f"Suspicious escape sequence in column: {col}")


In [None]:
# Escape backslashes in all string/object columns so the map generates correctly.
# Every single backslash is replaced by two backslashes.
# NOTE: running this cell more than once doubles the backslashes again each time.
object_columns = repd_wind.select_dtypes(include='object').columns
for col in object_columns:
    escaped_values = repd_wind[col].str.replace(r'\\', r'\\\\', regex=True)
    repd_wind[col] = escaped_values

In [None]:
# Check for non-numeric entries in the coordinate columns.
# pd.to_numeric(..., errors='coerce') turns anything that cannot be parsed as a
# number into NaN, so the NaN positions mark the missing or non-numeric entries.
coordinate_columns = ['X-coordinate', 'Y-coordinate']
x_numeric = pd.to_numeric(repd_wind['X-coordinate'], errors='coerce')
y_numeric = pd.to_numeric(repd_wind['Y-coordinate'], errors='coerce')

non_numeric_x = repd_wind[x_numeric.isna()]
non_numeric_y = repd_wind[y_numeric.isna()]

# Display rows with non-numeric coordinates and include row indices
print("Non-numeric X-coordinates:")
print(non_numeric_x[coordinate_columns])

print("Non-numeric Y-coordinates:")
print(non_numeric_y[coordinate_columns])

# Store the coerced (numeric) coordinates and drop every row where either one is NaN.
# Coercing first means unparseable text is removed here as well, instead of
# surviving the dropna() and reaching the geometry construction later on.
repd_wind = (
    repd_wind
    .assign(**{'X-coordinate': x_numeric, 'Y-coordinate': y_numeric})
    .dropna(subset=coordinate_columns)
)

Now convert the tabular data into vector format using the 'X-coordinate' and 'Y-coordinate' columns.

In [None]:
# Build point geometries from the X and Y coordinate columns
turbine_points = gpd.points_from_xy(repd_wind['X-coordinate'], repd_wind['Y-coordinate'])

# Create the GeoDataFrame and set its CRS to EPSG:27700 (British National Grid)
wind_turbines = gpd.GeoDataFrame(repd_wind, geometry=turbine_points, crs='epsg:27700')

wind_turbines.head()  # Check if successful

In [None]:
# Folium is best used with a geographic coordinate system such as WGS 84 (epsg:4326).
# Reproject into a new GeoDataFrame and, in the same step, drop the original
# X/Y coordinate columns, which are no longer necessary once the geometry exists.
wind_turbines_4326 = (
    wind_turbines
    .to_crs(epsg=4326)
    .drop(columns=['X-coordinate', 'Y-coordinate'])
)
print(wind_turbines_4326.crs)

# Replace NaN values in the string (object) columns with the text 'N/A'
str_columns = wind_turbines_4326.select_dtypes(include=['object'])
filled_str_columns = str_columns.fillna('N/A')
wind_turbines_4326[str_columns.columns] = filled_str_columns
wind_turbines_4326

Check for invalid coordinates and geometries

In [None]:
# Find points that fall outside the valid longitude (-180 to 180)
# or latitude (-90 to 90) range.
longitudes = wind_turbines_4326.geometry.x
latitudes = wind_turbines_4326.geometry.y

longitude_out_of_range = (longitudes < -180) | (longitudes > 180)
latitude_out_of_range = (latitudes < -90) | (latitudes > 90)

invalid_coords = wind_turbines_4326[longitude_out_of_range | latitude_out_of_range]

print(f"Out-of-bounds coordinates: {len(invalid_coords)}")
invalid_coords[['Site Name', 'geometry']]

In [None]:
# Find rows whose geometry is either empty or missing
geometries = wind_turbines_4326.geometry
empty_or_missing = geometries.is_empty | geometries.isna()
invalid_geom = wind_turbines_4326[empty_or_missing]

print(f"Invalid geometries: {len(invalid_geom)}")
invalid_geom[['Site Name', 'geometry']]

In [None]:
# Filter out invalid geometries and non-finite coordinates.
# All checks are vectorised GeoSeries operations, so a missing (None) geometry is
# simply filtered out instead of raising an AttributeError inside a per-row lambda.
point_geometries = wind_turbines_4326.geometry
x_coords = point_geometries.x
y_coords = point_geometries.y

# |value| < inf is False for +inf, -inf and NaN, so only finite coordinates pass
infinity = float("inf")
has_finite_coords = x_coords.abs().lt(infinity) & y_coords.abs().lt(infinity)

wind_turbines_4326 = wind_turbines_4326[
    point_geometries.notna() &
    point_geometries.is_valid &
    ~point_geometries.is_empty &
    has_finite_coords]

In [None]:
# Make a folium map to check if successful.
# The keyword arguments for explore() are collected in one place for readability.
explore_options = {
    'column': 'Development Status (short)',
    'marker_type': 'marker',
    'popup': True,
    'legend': False,
    'tiles': 'OpenStreetMap',
}
m = wind_turbines_4326.explore(**explore_options)

m  # show map

In [None]:
# Save the cleaned-up dataset for use in the next step.
# The GeoPackage format is used to preserve column name lengths.
output_path = 'wind_turbines.gpkg'
wind_turbines_4326.to_file(output_path, driver='GPKG')