# Shark Co-occurrence Analysis using Association Rules

## Objective
Identify frequent co-occurrence patterns among sharks observed at different stations using association rules.

## 1. Load and Parse Data

We load shark co-occurrence data from a CSV file. The 'Relations' column contains comma-separated shark IDs that were observed together.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from csv import reader

# Load the CSV file
shark_df = pd.read_csv('/content/drive/Othercomputers/My Laptop/Node clustering/Association rules/shark_relations.csv')

# Convert the 'Relations' column into a list of lists
transactions = shark_df['Relations'].apply(lambda x: x.split(',')).tolist()

# Show sample transactions
transactions[:5]

[['B13F', 'B02F', 'B14M'],
 ['B16F', 'B06F', 'B05F'],
 ['B13F', 'B02F'],
 ['B12F', 'B09M', 'B11F'],
 ['B04F', 'B01F']]

In [3]:
shark_df

Unnamed: 0,Stations,Relations
0,H3,"B13F,B02F,B14M"
1,H3,"B16F,B06F,B05F"
2,H3,"B13F,B02F"
3,H3,"B12F,B09M,B11F"
4,H3,"B04F,B01F"
...,...,...
2957,C4,"B04F,T03M"
2958,C4,"B04F,B10M"
2959,C4,"T03M,B04F"
2960,C4,"B05F,B09M"


## 2. One-Hot Encode Transactions

We use `TransactionEncoder` from `mlxtend` to transform the list of shark co-occurrences into a format suitable for analysis.

In [4]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,B01F,B02F,B03F,B04F,B05F,B06F,B07F,B08F,B09M,B10M,...,T02F,T03M,T04F,T05F,T06F,T07F,T08F,T09F,T10F,T11F
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2957,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
2958,False,False,False,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2959,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
2960,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


## 3. Generate Frequent Itemsets

We apply the Apriori algorithm to identify frequently co-occurring shark IDs.

In [5]:
from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
frequent_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
1,0.426739,(B02F)
4,0.337610,(B05F)
8,0.290682,(B09M)
3,0.287981,(B04F)
15,0.275152,(B16F)
...,...,...
22,0.010128,(T05F)
126,0.010128,"(B05F, B09M, B04F)"
70,0.010128,"(B09M, B12F)"
136,0.010128,"(B05F, B06F, B12F)"


In [6]:
length = frequent_itemsets['itemsets'].str.len()
rows = length == 1
frequent_itemsets[rows].sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
1,0.426739,(B02F)
4,0.33761,(B05F)
8,0.290682,(B09M)
3,0.287981,(B04F)
15,0.275152,(B16F)
13,0.260635,(B14M)
14,0.243079,(B15F)
5,0.201553,(B06F)
10,0.176907,(B11F)
12,0.148886,(B13F)


In [7]:
length = frequent_itemsets['itemsets'].str.len()
rows = length == 2
frequent_itemsets[rows].sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
26,0.185010,"(B05F, B02F)"
35,0.163741,"(B16F, B02F)"
34,0.147535,"(B15F, B02F)"
29,0.146860,"(B09M, B02F)"
73,0.142471,"(B15F, B09M)"
...,...,...
88,0.010804,"(B18M, B14M)"
24,0.010466,"(B01F, B03F)"
31,0.010466,"(B02F, B12F)"
58,0.010466,"(B05F, B18M)"


In [8]:
length = frequent_itemsets['itemsets'].str.len()
rows = length == 3
frequent_itemsets[rows].sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
115,0.086090,"(B15F, B09M, B02F)"
104,0.065159,"(B05F, B15F, B02F)"
100,0.062795,"(B05F, B09M, B02F)"
141,0.061107,"(B05F, B15F, B09M)"
105,0.060770,"(B05F, B02F, B16F)"
...,...,...
107,0.011141,"(B06F, B02F, B11F)"
110,0.010804,"(B15F, B02F, B07F)"
109,0.010804,"(B09M, B07F, B02F)"
126,0.010128,"(B05F, B09M, B04F)"


In [9]:
length = frequent_itemsets['itemsets'].str.len()
rows = length == 4
frequent_itemsets[rows].sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
174,0.035111,"(B05F, B15F, B09M, B02F)"
183,0.033086,"(B15F, B09M, B14M, B02F)"
166,0.027684,"(B05F, B02F, B16F, B04F)"
185,0.026334,"(B16F, B15F, B09M, B02F)"
181,0.024308,"(B15F, B09M, B02F, B11F)"
189,0.022282,"(B05F, B15F, B09M, B11F)"
177,0.021607,"(B05F, B02F, B16F, B11F)"
170,0.020932,"(B05F, B02F, B16F, B06F)"
188,0.020932,"(B05F, B04F, B16F, B11F)"
179,0.019919,"(B05F, B15F, B02F, B14M)"


In [10]:
length = frequent_itemsets['itemsets'].str.len()
rows = length == 5
frequent_itemsets[rows].sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
195,0.013842,"(B16F, B02F, B04F, B11F, B05F)"
196,0.012829,"(B09M, B15F, B02F, B11F, B05F)"
197,0.011479,"(B09M, B16F, B15F, B02F, B14M)"


In [11]:
length = frequent_itemsets['itemsets'].str.len()
rows = length == 6
frequent_itemsets[rows].sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets


In [12]:
frequent_itemsets.groupby(length)['support'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
itemsets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,24.0,0.131485,0.126944,0.010128,0.030469,0.060095,0.247468,0.426739
2,69.0,0.053861,0.04428,0.010128,0.018569,0.044227,0.080014,0.18501
3,71.0,0.030333,0.016824,0.010128,0.017556,0.026671,0.0395,0.08609
4,31.0,0.018046,0.006294,0.010128,0.013504,0.017556,0.020932,0.035111
5,3.0,0.012717,0.001186,0.011479,0.012154,0.012829,0.013336,0.013842


## 4. Create Association Rules

From the frequent itemsets, we generate association rules to understand which sharks are most likely to be seen together.

In [13]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric='zhangs_metric', min_threshold=0.5)
rules1 = rules[['antecedents', 'consequents', 'zhangs_metric']].sort_values(by='zhangs_metric', ascending=False)
rules1 = rules1.reset_index(drop=True)
rules1

Unnamed: 0,antecedents,consequents,zhangs_metric
0,(B03F),(B01F),0.987625
1,(B01F),(B03F),0.976113
2,(B10M),(T01F),0.937560
3,(B17M),(T01F),0.927186
4,"(B16F, B05F)","(B02F, B04F, B11F)",0.920583
...,...,...,...
395,(B15F),"(B02F, B11F)",0.503951
396,"(B02F, B11F)","(B05F, B15F)",0.503809
397,(B07F),"(B15F, B02F)",0.502632
398,(B05F),"(B09M, B02F, B11F)",0.500510


In [16]:
# Step 1: Add length columns
rules1['antecedent_len'] = rules1['antecedents'].apply(len)
rules1['consequence_len'] = rules1['consequents'].apply(len)

# Step 2: Zhang's metric summary by antecedent length
zhang_stats = rules1.groupby('antecedent_len')['zhangs_metric'].describe()

# Step 3: Count of consequence sizes grouped by antecedent length
conseq_counts = rules1.groupby(['antecedent_len', 'consequence_len']).size().unstack(fill_value=0)

# Step 4: Merge both on index (antecedent_len)
final_df = zhang_stats.merge(conseq_counts, left_index=True, right_index=True)

# Optional: Rename consequence columns to match your format
final_df.columns.name = None  # Remove hierarchical name
final_df = final_df.rename(columns={i: f'consequence_size_{i}' for i in conseq_counts.columns})

# Step 5: Display the result
final_df


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max,consequence_size_1,consequence_size_2,consequence_size_3,consequence_size_4
antecedent_len,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,173.0,0.682997,0.122231,0.50051,0.57853,0.669647,0.772331,0.987625,31,69,61,12
2,158.0,0.625078,0.097516,0.500372,0.549663,0.601236,0.691338,0.920583,42,90,26,0
3,60.0,0.630305,0.090926,0.504843,0.558824,0.619188,0.676211,0.861128,34,26,0,0
4,9.0,0.61081,0.059768,0.534603,0.562625,0.582642,0.664583,0.691838,9,0,0,0


In [14]:
length = rules['antecedents'].str.len()
rows = length >= 1
print("Zhangs metric - antecedents")
display(rules1.groupby(length)['zhangs_metric'].describe())

Zhangs metric - antecedents


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
antecedents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,173.0,0.707228,0.119067,0.500372,0.607557,0.707285,0.79954,0.987625
2,158.0,0.626958,0.084878,0.503809,0.557056,0.617099,0.688071,0.840625
3,60.0,0.567893,0.049315,0.50957,0.521586,0.558496,0.603073,0.677415
4,9.0,0.528104,0.011179,0.513007,0.521729,0.524491,0.539024,0.539803


In [17]:
rows = rules1['antecedents'].str.len() == 1
rules1[rows].sort_values(by='zhangs_metric', ascending=False)

Unnamed: 0,antecedents,consequents,zhangs_metric,antecedent_len,consequence_len
0,(B03F),(B01F),0.987625,1,1
1,(B01F),(B03F),0.976113,1,1
2,(B10M),(T01F),0.937560,1,1
3,(B17M),(T01F),0.927186,1,1
5,(T01F),(B10M),0.916499,1,1
...,...,...,...,...,...
389,(B02F),"(B05F, B16F)",0.507865,1,2
393,(B07F),"(B09M, B02F)",0.505088,1,2
395,(B15F),"(B02F, B11F)",0.503951,1,2
397,(B07F),"(B15F, B02F)",0.502632,1,2


In [18]:
rows = rules1['antecedents'].str.len() == 2
rules1[rows].sort_values(by='zhangs_metric', ascending=False)

Unnamed: 0,antecedents,consequents,zhangs_metric,antecedent_len,consequence_len
4,"(B16F, B05F)","(B02F, B04F, B11F)",0.920583,2,3
13,"(B05F, B04F)","(B16F, B02F, B11F)",0.882841,2,3
15,"(B05F, B16F)","(B04F, B11F)",0.878666,2,2
20,"(B16F, B11F)","(B05F, B02F, B04F)",0.861480,2,3
25,"(B16F, B15F)",(B07F),0.847697,2,1
...,...,...,...,...,...
388,"(B15F, B09M)","(B02F, B11F)",0.508497,2,2
390,"(B16F, B14M)",(B15F),0.506568,2,1
391,"(B16F, B04F)","(B05F, B02F)",0.505742,2,2
396,"(B02F, B11F)","(B05F, B15F)",0.503809,2,2


In [19]:
rows = rules1['antecedents'].str.len() == 3
rules1[rows].sort_values(by='zhangs_metric', ascending=False)

Unnamed: 0,antecedents,consequents,zhangs_metric,antecedent_len,consequence_len
21,"(B16F, B02F, B05F)","(B04F, B11F)",0.861128,3,2
24,"(B05F, B02F, B04F)","(B16F, B11F)",0.848694,3,2
26,"(B02F, B04F, B11F)","(B16F, B05F)",0.847687,3,2
31,"(B16F, B02F, B11F)","(B05F, B04F)",0.840625,3,2
53,"(B16F, B05F, B11F)","(B02F, B04F)",0.788016,3,2
79,"(B16F, B04F, B05F)","(B02F, B11F)",0.750899,3,2
84,"(B09M, B02F, B11F)","(B05F, B15F)",0.746117,3,2
91,"(B09M, B14M, B02F)","(B16F, B15F)",0.736192,3,2
96,"(B05F, B15F, B02F)","(B09M, B11F)",0.721199,3,2
107,"(B05F, B09M, B02F)","(B15F, B11F)",0.714356,3,2


In [20]:
rows = rules1['antecedents'].str.len() == 4
rules1[rows].sort_values(by='zhangs_metric', ascending=False)

Unnamed: 0,antecedents,consequents,zhangs_metric,antecedent_len,consequence_len
133,"(B05F, B09M, B02F, B11F)",(B15F),0.691838,4,1
154,"(B16F, B09M, B14M, B02F)",(B15F),0.667649,4,1
162,"(B16F, B02F, B04F, B05F)",(B11F),0.664583,4,1
167,"(B05F, B02F, B04F, B11F)",(B16F),0.66266,4,1
255,"(B05F, B15F, B02F, B11F)",(B09M),0.582642,4,1
273,"(B16F, B02F, B04F, B11F)",(B05F),0.573846,4,1
293,"(B16F, B02F, B05F, B11F)",(B04F),0.562625,4,1
303,"(B16F, B15F, B02F, B14M)",(B09M),0.556842,4,1
338,"(B05F, B15F, B09M, B02F)",(B11F),0.534603,4,1


In [21]:
length = rules['consequents'].str.len()
rows = length >= 1
print("Zhangs metric - consequents ")
display(rules1.groupby(length)['zhangs_metric'].describe())

Zhangs metric - consequents 


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
consequents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,116.0,0.722131,0.126068,0.513007,0.610545,0.712987,0.845192,0.987625
2,185.0,0.647545,0.092749,0.50957,0.565668,0.634926,0.719228,0.83688
3,87.0,0.580288,0.053594,0.503809,0.531713,0.573983,0.627248,0.678858
4,12.0,0.515692,0.010391,0.500372,0.510677,0.514009,0.525267,0.527119


In [22]:
def rule_to_str(row):
    ant = ",".join(sorted(row['antecedents']))
    cons = ",".join(sorted(row['consequents']))
    return f"{ant}=>{cons}"

rules1['rule_str'] = rules1.apply(rule_to_str, axis=1)
rule_columns = rules1['rule_str'].tolist()

# Step 2: Get all unique node codes (e.g., B01F, B03F, etc.)
node_set = set()
for a, c in zip(rules1['antecedents'], rules1['consequents']):
    node_set.update(a)
    node_set.update(c)

nodes = sorted(node_set)

# Step 3: Initialize binary node-feature matrix
X = pd.DataFrame(0, index=nodes, columns=rule_columns)

# Step 4: Fill matrix (1 if node participates in the rule)
for i, row in rules1.iterrows():
    rule_id = row['rule_str']
    participants = set(row['antecedents']).union(set(row['consequents']))
    for node in participants:
        X.at[node, rule_id] = 1
X

Unnamed: 0,B03F=>B01F,B01F=>B03F,B10M=>T01F,B17M=>T01F,"B05F,B16F=>B02F,B04F,B11F",T01F=>B10M,T01F=>B17M,"B15F=>B02F,B05F,B09M,B11F","B16F=>B02F,B04F,B05F,B11F","B16F=>B04F,B05F,B11F",...,"B14M,B16F=>B15F","B04F,B16F=>B02F,B05F","B02F,B14M,B15F=>B09M","B07F=>B02F,B09M","B02F,B11F,B16F=>B05F","B15F=>B02F,B11F","B02F,B11F=>B05F,B15F","B07F=>B02F,B15F","B05F=>B02F,B09M,B11F","B02F,B09M=>B05F,B13F"
B01F,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B02F,0,0,0,0,1,0,0,1,1,0,...,0,1,1,1,1,1,1,1,1,1
B03F,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B04F,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,0,0,0,0
B05F,0,0,0,0,1,0,0,1,1,1,...,0,1,0,0,1,0,1,0,1,1
B06F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B07F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
B08F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B09M,0,0,0,0,0,0,0,1,0,0,...,0,0,1,1,0,0,0,0,1,1
B10M,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Your full list of nodes
all_nodes = ['B01F', 'B02F', 'B03F', 'B04F', 'B05F', 'B06F', 'B07F', 'B08F', 'B09M', 'B10M',
             'B11F', 'B12F', 'B13F', 'B14M', 'B15F', 'B16F', 'B17M', 'B18M', 'B19F',
             'T01F', 'T02F', 'T03M', 'T04F', 'T05F', 'T06F', 'T07F', 'T08F', 'T09F', 'T10F', 'T11F',]

# Reindex the matrix to include all expected nodes, filling missing rows with 0s
X = X.reindex(index=all_nodes, fill_value=0)
X.to_csv("/content/drive/Othercomputers/My Laptop/Node clustering/Association rules/node_feature_matrix.csv")
X


Unnamed: 0,B03F=>B01F,B01F=>B03F,B10M=>T01F,B17M=>T01F,"B05F,B16F=>B02F,B04F,B11F",T01F=>B10M,T01F=>B17M,"B15F=>B02F,B05F,B09M,B11F","B16F=>B02F,B04F,B05F,B11F","B16F=>B04F,B05F,B11F",...,"B14M,B16F=>B15F","B04F,B16F=>B02F,B05F","B02F,B14M,B15F=>B09M","B07F=>B02F,B09M","B02F,B11F,B16F=>B05F","B15F=>B02F,B11F","B02F,B11F=>B05F,B15F","B07F=>B02F,B15F","B05F=>B02F,B09M,B11F","B02F,B09M=>B05F,B13F"
B01F,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B02F,0,0,0,0,1,0,0,1,1,0,...,0,1,1,1,1,1,1,1,1,1
B03F,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B04F,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,0,0,0,0
B05F,0,0,0,0,1,0,0,1,1,1,...,0,1,0,0,1,0,1,0,1,1
B06F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B07F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
B08F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B09M,0,0,0,0,0,0,0,1,0,0,...,0,0,1,1,0,0,0,0,1,1
B10M,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
