In [8]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

In [9]:
#Trying to merge the two datasets NOAA + GCBD by coordinates - does not work because we don't have the correct stations, you can see the distances are too far away, or the way of calculation is wrong??

In [None]:
# this is our combined NOAA for the selected stations
noaa_data = pd.read_csv('coral_reef_data_combined.csv')
#GCBD dataset
gcbd_data = pd.read_csv('gcbd_cleaned.csv', low_memory=False)

In [14]:
# getting the stations from NOAA (should be 9)
noaa_stations = noaa_data[['Station', 'Region', 'Latitude', 'Longitude']].drop_duplicates()
print(f"NOAA stations: {len(noaa_stations)}")
print(noaa_stations)


NOAA stations: 9
                      Station              Region  Latitude  Longitude
0            Far Northern GBR  Great Barrier Reef   -12.675    144.100
14751           Torres Strait           Caribbean    -9.950    143.650
29500            Northern GBR  Great Barrier Reef   -16.100    145.975
44251                  Samoas           Polynesia   -12.825   -170.475
59001   Northern Cook Islands           Polynesia   -10.950   -165.200
73752   Main Hawaiian Islands           Caribbean    20.575   -157.700
88503               Nicaragua           Caribbean    14.125    -81.750
103254   Panama Atlantic East           Caribbean     9.200    -78.700
118005                Jamaica           Caribbean    17.675    -77.250


In [15]:
# calculate distance between coordinates (Haversine formula)
def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r


In [16]:
# For each NOAA station, find nearby GCBD sites - I set the distance to 100km for now just ot see what we have
def find_nearby_gcbd_sites(station_lat, station_lon, gcbd_df, max_distance_km=100):
    distances = []
    for idx, row in gcbd_df.iterrows():
        dist = haversine_distance(station_lat, station_lon, 
                                row['Latitude_Degrees'], row['Longitude_Degrees'])
        if dist <= max_distance_km:
            distances.append({
                'GCBD_Index': idx,
                'Distance_km': dist,
                'GCBD_Site': row.get('Site_Name', 'Unknown'),
                'GCBD_Country': row.get('Country_Name', 'Unknown'),
                'GCBD_Lat': row['Latitude_Degrees'],
                'GCBD_Lon': row['Longitude_Degrees']
            })
    
    return sorted(distances, key=lambda x: x['Distance_km'])

In [18]:
# trying to match the NOAA station with GCBD sites
station_matches = {}

for idx, station in noaa_stations.iterrows():
    station_name = station['Station']
    station_lat = station['Latitude']
    station_lon = station['Longitude']
    
    nearby_sites = find_nearby_gcbd_sites(station_lat, station_lon, gcbd_clean, max_distance_km=100)
    
    station_matches[station_name] = {
        'noaa_info': station,
        'nearby_gcbd_sites': nearby_sites,
        'match_count': len(nearby_sites)
    }
    
    print(f"\n {station_name} ({station['Region']}):")
    print(f"   Coordinates: ({station_lat:.3f}, {station_lon:.3f})")
    print(f"   Nearby GCBD sites: {len(nearby_sites)}")
    
    if nearby_sites:
        print(f"   Closest matches:")
        for site in nearby_sites[:3]:  # Show top 3 closest
            print(f"     • {site['GCBD_Site']} ({site['GCBD_Country']}) - {site['Distance_km']:.1f}km away")
    else:
        print(f"No GCBD sites found within given radius")


 Far Northern GBR (Great Barrier Reef):
   Coordinates: (-12.675, 144.100)
   Nearby GCBD sites: 13
   Closest matches:
     • nan (Australia) - 53.8km away
     • nan (Australia) - 58.7km away
     • nan (Australia) - 63.5km away

 Torres Strait (Caribbean):
   Coordinates: (-9.950, 143.650)
   Nearby GCBD sites: 0
No GCBD sites found within given radius

 Northern GBR (Great Barrier Reef):
   Coordinates: (-16.100, 145.975)
   Nearby GCBD sites: 471
   Closest matches:
     • nan (Australia) - 11.2km away
     • nan (Australia) - 11.2km away
     • nan (Australia) - 12.7km away

 Samoas (Polynesia):
   Coordinates: (-12.825, -170.475)
   Nearby GCBD sites: 0
No GCBD sites found within given radius

 Northern Cook Islands (Polynesia):
   Coordinates: (-10.950, -165.200)
   Nearby GCBD sites: 0
No GCBD sites found within given radius

 Main Hawaiian Islands (Caribbean):
   Coordinates: (20.575, -157.700)
   Nearby GCBD sites: 39
   Closest matches:
     • nan (United States) - 76.5km 

In [19]:
# summary 
total_matches = sum([info['match_count'] for info in station_matches.values()])
stations_with_matches = sum([1 for info in station_matches.values() if info['match_count'] > 0])

print(f"Total NOAA stations: {len(noaa_stations)}")
print(f"Stations with nearby GCBD sites: {stations_with_matches}")
print(f"Total GCBD sites within specified radius of stations: {total_matches}")

Total NOAA stations: 9
Stations with nearby GCBD sites: 6
Total GCBD sites within specified radius of stations: 1373


In [21]:
# create a new dataset - filtered GCBD dataset with only sites near NOAA stations - this is just an example of how we could combine it into one once we have to correct stations and radius
print(f"\nCREATING FILTERED GCBD DATASET...")
nearby_gcbd_indices = set()
for station_info in station_matches.values():
    for site in station_info['nearby_gcbd_sites']:
        nearby_gcbd_indices.add(site['GCBD_Index'])

gcbd_filtered = gcbd_clean.loc[list(nearby_gcbd_indices)].copy()
print(f"Filtered GCBD dataset: {len(gcbd_filtered)} sites")


CREATING FILTERED GCBD DATASET...
Filtered GCBD dataset: 1373 sites


In [22]:
# save the new dataset
gcbd_filtered.to_csv('gcbd_filtered_near_noaa_stations.csv', index=False)
print(f"Saved filtered GCBD data to 'gcbd_filtered_near_noaa_stations.csv'")

Saved filtered GCBD data to 'gcbd_filtered_near_noaa_stations.csv'


In [23]:
# matching reference table
match_records = []
for station_name, info in station_matches.items():
    for site in info['nearby_gcbd_sites']:
        match_records.append({
            'NOAA_Station': station_name,
            'NOAA_Region': info['noaa_info']['Region'],
            'NOAA_Lat': info['noaa_info']['Latitude'],
            'NOAA_Lon': info['noaa_info']['Longitude'],
            'GCBD_Index': site['GCBD_Index'],
            'GCBD_Site': site['GCBD_Site'],
            'GCBD_Country': site['GCBD_Country'],
            'GCBD_Lat': site['GCBD_Lat'],
            'GCBD_Lon': site['GCBD_Lon'],
            'Distance_km': site['Distance_km']
        })

match_df = pd.DataFrame(match_records)
match_df.to_csv('noaa_gcbd_station_matches.csv', index=False)
print(f"saved matching reference to 'noaa_gcbd_station_matches.csv'")



saved matching reference to 'noaa_gcbd_station_matches.csv'
