In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import networkx as nx
from datetime import datetime, timedelta
import numpy as np

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read and written.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Data Loading

### Shark Data

In [None]:
# Shark metadata table (columns include Shark, Transmitter, Specie, Sex, Length_cm, Killed).
sharks_csv_path = r'/content/drive/Othercomputers/My Laptop/Node clustering//Data/Sharks.csv'
df_sharks = pd.read_csv(sharks_csv_path)
df_sharks.head(2)

  cast_date_col = pd.to_datetime(column, errors="coerce")


Unnamed: 0,Shark,Transmitter,Specie,Sex,Length_cm,Release date,Killed,date_killed,Latitude_RP,Longitude_RP,Latitude,Longitude
0,B01F,8806,Bull,F,255,27/11/2018,No,,-22.60358,166.49282,-22.2758,166.3939
1,B07F,8807,Bull,F,275,10/12/2019,No,,-22.7411,166.67033,-22.2874,166.3806


In [None]:
# Per-shark attribute lookup:
#   {shark_id: {'Specie': ..., 'Sex': ..., 'Length_cm': ..., 'Killed': ...}}
attribute_columns = ['Shark', 'Specie', 'Sex', 'Length_cm', 'Killed']
shark_attributes = (
    df_sharks[attribute_columns]
    .set_index('Shark')
    .to_dict('index')
)

In [None]:
# Distinct shark IDs (in order of first appearance), then how many there are.
Sharks = df_sharks['Shark'].unique()
print(Sharks, len(Sharks), sep='\n')

['B01F' 'B07F' 'B02F' 'B04F' 'B09M' 'B03F' 'B05F' 'B06F' 'B08F' 'T01F'
 'B17M' 'B11F' 'B10M' 'T02F' 'B12F' 'B13F' 'B14M' 'B15F' 'B16F' 'T03M'
 'B18M' 'T05F' 'T07F' 'T10F' 'T04F' 'B19F' 'T06F' 'T08F' 'T11F' 'T09F']
30


### Station Data

In [None]:
# Receiver-station metadata table (columns include Receiver, Station, Area, Depth_m, coordinates).
stations_csv_path = r'/content/drive/Othercomputers/My Laptop/Node clustering//Data/Stations.csv'
df_stations = pd.read_csv(stations_csv_path)
df_stations.head(2)

Unnamed: 0,Receiver,Station,Area,Depth_m,Latitude,Longitude,StudyStarts,LastDataDownload,GrandZone1,GrandZone2,Place
0,N05,H1,Grand Harbor,7.0,-22.266747,166.423083,28/11/2018,20/10/2023,Quai,Quai,QuaiPecheur
1,N01,H2,Grand Harbor,20.5,-22.237617,166.397767,04/12/2018,23/08/2022,EntreeGrandRade,EntreeGrandRade,GrandRade


In [None]:
# Distinct station codes listed in the station metadata, then how many there are.
# NOTE: `Stations` (from Stations.csv) is a different variable from the lowercase
# `stations` that is derived from the detections table further down.
Stations = df_stations['Station'].unique()
print(Stations, len(Stations), sep='\n')

['H1' 'H2' 'H3' 'H4' 'C1' 'C2' 'C3' 'P1' 'V1' 'V2' 'V3' 'IC1' 'SM1' 'SM2'
 'SM3' 'SM4' 'SM5' 'M1' 'IM1' 'M2' 'D1' 'IS1' 'IS3' 'IS2' 'C4' 'C5' 'C6'
 'C7' 'V4' 'V5' 'V6' 'V7' 'V8' 'V9' 'V10' 'SM6' 'SM7' 'IC2' 'IC3' 'P2'
 'P3' 'P4' 'P5' 'P6']
44


### Detection Data

In [None]:
# Raw detections: one row per (Transmitter, Receiver, Localtime, Shark, Station) record.
detections_csv_path = r'/content/drive/Othercomputers/My Laptop/Node clustering//Data/Detections.csv'
df_detections = pd.read_csv(detections_csv_path)

# Parse timestamps with an explicit format so malformed values raise instead of being guessed.
localtime_parsed = pd.to_datetime(df_detections['Localtime'], format="%Y-%m-%d %H:%M:%S")
df_detections['Localtime'] = localtime_parsed
df_detections.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1363210 entries, 0 to 1363209
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Transmitter  1363210 non-null  int64         
 1   Receiver     1363210 non-null  object        
 2   Localtime    1363210 non-null  datetime64[ns]
 3   Shark        1363210 non-null  object        
 4   Station      1363210 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 52.0+ MB


In [None]:
def _build_station_blocks(station_detections, station):
    """Collapse one station's detections into runs of consecutive same-shark detections.

    The detections are sorted by 'Localtime'; every maximal run of rows that
    share the same 'Shark' value becomes one output row ("block").

    Parameters
    ----------
    station_detections : pd.DataFrame
        Detections of a single station; must contain 'Shark' and 'Localtime'.
    station : str
        Station code written into the 'Station' column of every block.

    Returns
    -------
    pd.DataFrame
        Columns 'Shark', 'First Seen', 'Last Seen Before Next Shark', 'Station',
        one row per block in chronological order, with a fresh 0..n-1 index.
        'First Seen' is the time of the first detection of the block for every
        block, including the very first one.
    """
    block_columns = ['Shark', 'First Seen', 'Last Seen Before Next Shark', 'Station']

    ordered = station_detections.sort_values(by='Localtime').reset_index(drop=True)
    if ordered.empty:
        return pd.DataFrame(columns=block_columns)

    # A new block starts on every row whose shark differs from the previous row's
    # (the first row always differs from the NaN introduced by shift()).
    starts_new_block = ordered['Shark'] != ordered['Shark'].shift()
    # Running count of block starts = block number of each row.
    block_id = starts_new_block.cumsum().rename('block')

    blocks = (
        ordered.groupby(block_id, sort=True)
        .agg(**{
            'Shark': ('Shark', 'first'),
            'First Seen': ('Localtime', 'min'),
            'Last Seen Before Next Shark': ('Localtime', 'max'),
        })
        .reset_index(drop=True)
    )
    blocks['Station'] = station
    return blocks[block_columns]


stations = df_detections['Station'].unique()

# Also keep the tables in a dict so they can be looked up without globals().
blocks_by_station = {}

for station in stations:
    station_detections = df_detections[df_detections['Station'] == station]
    station_blocks = _build_station_blocks(station_detections, station)
    blocks_by_station[station] = station_blocks

    # The graph-construction cell looks the tables up by this dynamic global name.
    globals()[f"{station}_df_blocks"] = station_blocks


In [None]:
# Distinct area names from the station metadata, then how many there are.
Areas = df_stations['Area'].unique()
print(Areas, len(Areas), sep='\n')

['Grand Harbor' 'Citrons Bay Wide Beach' 'Small Harbor'
 'Vata Bay Wide Beach' 'Islets' 'Saint Marie Bay'
 'Saint Marie Bay Nautical Center' 'Magenta Bay' 'Dumbea Bay'
 'Citrons Bay Beachfront' 'Vata Bay Beachfront']
11


# Graph Construction

In [None]:
from time import time

# Two sharks are linked when the second one is first heard at a station at most
# this many minutes after the previous shark was last heard there.
MAX_GAP_MINUTES = 60

# Use an undirected graph
G = nx.Graph()

# Add shark nodes with attributes
for shark, attributes in shark_attributes.items():
    G.add_node(shark, **attributes)

# Process each station
for station in Stations:
    print("-----------------------------------------------")
    print(f"Processing Station: {station}")
    beginning = time()

    # `Stations` comes from Stations.csv, while the block tables were built only
    # for stations present in the detections. Skip stations without a table
    # instead of failing with KeyError.
    df = globals().get(f"{station}_df_blocks")
    if df is None:
        print(station, "has no detection blocks - skipped")
        continue

    station_area = df_stations[df_stations['Station'] == station]['Area'].values[0]

    # One flag per area (1 for this station's area, 0 otherwise). Identical for
    # every edge found at this station, so build it once. Spaces become
    # underscores - presumably to keep the keys valid when the graph is
    # exported to GML (TODO confirm).
    area_attributes = {
        area_name.replace(" ", "_"): int(station_area == area_name)
        for area_name in Areas
    }

    if len(df) >= 2:
        # Gap between the end of block i and the start of block i+1, in minutes.
        # The last block has no successor: its gap is NaN and fails the test below.
        gap_minutes = (
            df['First Seen'].shift(-1) - df['Last Seen Before Next Shark']
        ).dt.total_seconds() / 60
        is_close = gap_minutes <= MAX_GAP_MINUTES
        transitions = zip(
            df.loc[is_close, 'Shark'].tolist(),
            df['Shark'].shift(-1)[is_close].tolist(),
            gap_minutes[is_close].tolist(),
        )
    else:
        transitions = ()

    for shark1, shark2, time_diff in transitions:
        # Use sorted tuple to ensure consistent key for undirected edge
        u, v = sorted([shark1, shark2])

        if G.has_edge(u, v):
            edge = G[u][v]
            edge['weight'] += 1
            edge['min_duration_m'] = round(min(edge['min_duration_m'], time_diff), 2)
            edge['max_duration_m'] = round(max(edge['max_duration_m'], time_diff), 2)

            # An area flag, once set by any station, stays set.
            for safe_key, present in area_attributes.items():
                edge[safe_key] = max(edge.get(safe_key, 0), present)
        else:
            G.add_edge(
                u, v,
                weight=1,
                min_duration_m=round(time_diff, 2),
                max_duration_m=round(time_diff, 2),
                **area_attributes
            )

    ending = time()
    print(station, df.shape[0], round((ending - beginning) / 60, 2), 'min')


-----------------------------------------------
Processing Station: H1
H1 168094 1.04 min
-----------------------------------------------
Processing Station: H2
H2 312 0.0 min
-----------------------------------------------
Processing Station: H3
H3 3554 0.02 min
-----------------------------------------------
Processing Station: H4
H4 1582 0.01 min
-----------------------------------------------
Processing Station: C1
C1 494 0.0 min
-----------------------------------------------
Processing Station: C2
C2 810 0.01 min
-----------------------------------------------
Processing Station: C3
C3 420 0.0 min
-----------------------------------------------
Processing Station: P1
P1 2347 0.02 min
-----------------------------------------------
Processing Station: V1
V1 551 0.0 min
-----------------------------------------------
Processing Station: V2
V2 220 0.0 min
-----------------------------------------------
Processing Station: V3
V3 233 0.0 min
-------------------------------------------

In [None]:
# Total number of edges in the association graph.
edge_count = G.number_of_edges()
print("Number of Edges:", edge_count)

Number of Edges: 164


In [None]:
# Output the graph structure
# Dump the graph structure: node IDs, then every edge with its attribute dict.
node_view = G.nodes
edge_view = G.edges(data=True)
print("Nodes:", node_view)
print("Edges:", edge_view)

Nodes: ['B01F', 'B07F', 'B02F', 'B04F', 'B09M', 'B03F', 'B05F', 'B06F', 'B08F', 'T01F', 'B17M', 'B11F', 'B10M', 'T02F', 'B12F', 'B13F', 'B14M', 'B15F', 'B16F', 'T03M', 'B18M', 'T05F', 'T07F', 'T10F', 'T04F', 'B19F', 'T06F', 'T08F', 'T11F', 'T09F']
Edges: [('B01F', 'B02F', {'weight': 2522, 'min_duration_m': 0.0, 'max_duration_m': 44.0, 'Grand_Harbor': 1, 'Citrons_Bay_Wide_Beach': 0, 'Small_Harbor': 0, 'Vata_Bay_Wide_Beach': 0, 'Islets': 0, 'Saint_Marie_Bay': 0, 'Saint_Marie_Bay_Nautical_Center': 0, 'Magenta_Bay': 0, 'Dumbea_Bay': 0, 'Citrons_Bay_Beachfront': 0, 'Vata_Bay_Beachfront': 0}), ('B01F', 'B03F', {'weight': 1604, 'min_duration_m': 0.0, 'max_duration_m': 50.7, 'Grand_Harbor': 1, 'Citrons_Bay_Wide_Beach': 1, 'Small_Harbor': 1, 'Vata_Bay_Wide_Beach': 0, 'Islets': 0, 'Saint_Marie_Bay': 0, 'Saint_Marie_Bay_Nautical_Center': 0, 'Magenta_Bay': 0, 'Dumbea_Bay': 0, 'Citrons_Bay_Beachfront': 0, 'Vata_Bay_Beachfront': 0}), ('B01F', 'B04F', {'weight': 1048, 'min_duration_m': 0.0, 'max_dura

In [None]:
# Save the graph, with its node and edge attributes, as a GML file on Drive.
graph_path = "/content/drive/Othercomputers/My Laptop/Node clustering/Association rules/Graph.gml"
nx.write_gml(G, graph_path)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd
import numpy as np

# Presence chart: one green square for every (edge, area) pair whose area flag
# is 1 on that edge. Edges are split over two stacked panels to stay readable.

# Extract edge data
edges_data = list(G.edges(data=True))

# Define area features and clean labels (this order is also the y-axis order)
area_features = [
    'Dumbea_Bay', 'Grand_Harbor', 'Small_Harbor', 'Citrons_Bay_Wide_Beach',
    'Citrons_Bay_Beachfront', 'Vata_Bay_Wide_Beach', 'Vata_Bay_Beachfront',
    'Islets', 'Saint_Marie_Bay', 'Saint_Marie_Bay_Nautical_Center', 'Magenta_Bay'
]
area_labels = [feature.replace('_', ' ') for feature in area_features]

# Edge labels ("node1-node2") and the matching edge-by-area 0/1 matrix
edge_pairs = [f"{node1}-{node2}" for node1, node2, _ in edges_data]
feature_matrix = np.array([
    [attrs.get(feature, 0) for feature in area_features]
    for _, _, attrs in edges_data
])

# Build DataFrame for plotting: one record per flag that is set
plot_data = [
    {'Edge': edge_label, 'Area Feature': area_label, 'Presence': 'Yes'}
    for edge_label, feature_row in zip(edge_pairs, feature_matrix)
    for area_label, feature_value in zip(area_labels, feature_row)
    if feature_value == 1
]

# Explicit columns keep the column lookups below valid even when there are no
# records (a DataFrame built from an empty list has no columns at all).
df_plot = pd.DataFrame(plot_data, columns=['Edge', 'Area Feature', 'Presence'])

# Split into top and bottom edge sets
unique_edges = df_plot['Edge'].unique()
midpoint = len(unique_edges) // 2
edges_top = unique_edges[:midpoint]
edges_bottom = unique_edges[midpoint:]

df_top = df_plot[df_plot['Edge'].isin(edges_top)]
df_bottom = df_plot[df_plot['Edge'].isin(edges_bottom)]

# Create vertically stacked subplots
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Edges (Top Half)", "Edges (Bottom Half)"),
    shared_yaxes=True,
    vertical_spacing=0.1
)

# Add one marker trace per panel (row 1 = top half, row 2 = bottom half)
panels = [('Top', df_top), ('Bottom', df_bottom)]
for row_number, (trace_name, df_half) in enumerate(panels, start=1):
    fig.add_trace(
        go.Scatter(
            x=df_half['Edge'],
            y=df_half['Area Feature'],
            mode='markers',
            marker=dict(size=12, color='green', symbol='square'),
            name=trace_name
        ),
        row=row_number, col=1
    )

# Set axis layout
fig.update_yaxes(
    title_text="Area Features",
    categoryorder='array',
    categoryarray=area_labels,
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    linecolor='gray',
    linewidth=1
)

fig.update_xaxes(
    tickangle=45,
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    linecolor='gray',
    linewidth=1
)

# Layout adjustments
fig.update_layout(
    height=1000,
    width=2000,
    title_text="",
    showlegend=False,
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin=dict(t=50, b=50)
)

fig.show()
