In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loader import PathDataModule
from tqdm import tqdm

# --- 1. Configuration and Data Loading ---
config_path = 'config_score.json'
# Read the JSON config through a context manager so the file handle is closed
# as soon as parsing finishes instead of lingering until garbage collection.
with open(config_path, 'r') as config_file:
    config_data = json.load(config_file)
print(config_data)

{'dataset': 'icews14', 'storage_dir': '../data/', 'embedding_config': './embedding_config.json', 'criteria': 'score', 'train_ratio': 0.3, 'hidden_dim': 256, 'max_hops': 4, 'num_neg': 50, 'num_threads': 8, 'decay_factor': 0.2, 'max_fanout': 100, 'beam_width': 20, 'max_epochs': 2, 'batch_size': 8, 'dim_feedforward': 256, 'nhead': 4, 'num_layers': 6, 'dropout': 0.1, 'store': 'model', 'save_text_embeddings': True, 'shallow': True, 'pre_scan': ['train'], 'adjust_no_neg_paths_samples': True, 'max_adjust': 5.0, 'positive_deviation': True, 'embedding': 'all', 'test_time': 1, 'num_ckpt': 2, 'scale_loss': True, 'chi2': False, 'lr': 0.0001, 'wandb_project': 'thesis-graph', 'neg_per_pos': 10}


In [None]:
import requests, json, pprint

# Identify the current Vast.ai container and the token used to query its metadata.
cid  = os.getenv("CONTAINER_ID")          # ← set by Vast.ai inside every container
key  = os.getenv("CONTAINER_API_KEY")     # ← scoped token for this one instance
assert cid and key, "Not running on a Vast.ai container!"

resp = requests.get(
    f"https://console.vast.ai/api/v0/instances/{cid}/",
    headers={"Authorization": f"Bearer {key}",
             "accept": "application/json"},
    timeout=10,
)
# Surface 4xx/5xx responses here; otherwise an error payload would only fail
# later as a confusing KeyError when the expected fields are missing.
resp.raise_for_status()

info = resp.json()
print("Effective vCPUs:", info['instances']["cpu_cores_effective"])
# print(info['instances'].keys())

Effective vCPUs: 12.0


In [3]:

# Initialize the DataModule and load the data
# NOTE(review): batch_size=32 is passed explicitly while the printed config lists
# 'batch_size': 8 — confirm which value PathDataModule actually uses.
# NOTE(review): the log captured for this cell reports 0 valid train edges after the
# pre-scan (100% missing negative paths) — check the negative-path inputs.
print("Setting up DataModule...")
dm = PathDataModule(config_path=config_path, batch_size=32)
dm.setup('fit')
print("Data loaded.")


Setting up DataModule...
Setting up data for stage: fit
Setting up data for split: train
Pre-scan enabled for train split. Running full data validation...

--- Pre-scanning and filtering train data points ---
Scanning 577654 edges in train split...


Scanning edges: 100%|██████████| 577654/577654 [00:00<00:00, 1228830.32it/s]



Pre-scan Results for train:
  Total edges scanned: 577654
  Valid edges (has pos & neg paths): 0 (0.0%)
  Missing positive paths: 177891 (30.8%)
  Missing negative paths: 577654 (100.0%)
  Empty negative paths: 0 (0.0%)

  Filtering train split to keep only 0 valid edges.
  New edge count for train: 0
--- Pre-scan complete ---

Use shallow embeddings: True at config ./embedding_config.json
Loading KGE model proxy for train split from ../data/transe_icews14_all_config.json
KGEModelProxy initialized on cuda
Loaded model state from ../data/transe_icews14_all_model.pt to cuda
Device for KGE model: cuda:0
Loaded 0 edges for train split.
Setting up data for split: valid
Pre-scan not configured for valid split. Skipping data validation.
Use shallow embeddings: True at config ./embedding_config.json
Loading KGE model proxy for valid split from ../data/transe_icews14_all_config.json
KGEModelProxy initialized on cuda
Loaded model state from ../data/transe_icews14_all_model.pt to cuda
Device for

In [37]:

# --- 2. Data Extraction and Preparation ---
# Accumulator for one metadata record per edge; filled by the extraction loop.
plot_data = []
split = 'test'  # Split to analyse (currently 'test'; change to e.g. 'valid' to inspect another one)

print(f"Processing data for '{split}' split...")
# Per-split lookups exposed by the data module: edges, positive paths, negative paths.
# NOTE(review): only dm.setup('fit') is called earlier — confirm that stage also
# populates the 'test' split.
edges_df = dm.data[split]
pos_paths = dm.pos_paths[split]
neg_paths = dm.neg_paths[split]


Processing data for 'test' split...


In [38]:
# Sanity check: number of entries in the positive and negative path lookups.
print(len(pos_paths), len(neg_paths))

74747 0


In [None]:

# Start from an empty list on every run: the accumulator was previously created
# only in an earlier cell, so re-running this cell appended a second copy of
# every edge to it.
plot_data = []

for eid, row in tqdm(edges_df.iterrows(), total=len(edges_df), desc="Extracting paths"):
    # The positive-path lookup is keyed by the string form of the edge id.
    eid_str = str(eid)
    path_entry = pos_paths[eid_str] if eid_str in pos_paths else None
    path_nodes = path_entry.get('nodes') if path_entry is not None else None

    # One record per edge. Both the "path found" and "no path" cases share the
    # same fields; only path_length differs.
    plot_data.append({
        # Number of nodes on the stored positive path; 0 marks "no path found".
        'path_length': len(path_nodes) if path_nodes else 0,
        'label': 'true_link' if row['label'] == 1 else 'false_link',
        'u': row['u'],
        'v': row['v'],
        'ts': row['ts'],
        'edge_type': row['edge_type'],
        'v_pos': row['v_pos'],
    })

    # Negative paths are not summarised here; if needed, add a field such as
    # 'num_neg' = len(neg_paths[eid_str]) for edges present in neg_paths.

Extracting paths: 100%|██████████| 81081/81081 [00:02<00:00, 27550.19it/s]


In [40]:

# Turn the list of per-edge records into a DataFrame (one row per record).
plot_df = pd.DataFrame(plot_data)
print("Data prepared for plotting.")


Data prepared for plotting.


In [41]:
# Column dtypes and non-null counts of the per-edge table.
# NOTE(review): the output saved with this cell lists six columns and no 'v_pos',
# although the extraction loop adds 'v_pos' to every record — the saved output
# looks stale; re-run to refresh it.
plot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81081 entries, 0 to 81080
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   path_length  81081 non-null  int64 
 1   label        81081 non-null  object
 2   u            81081 non-null  int64 
 3   v            81081 non-null  int64 
 4   ts           81081 non-null  int64 
 5   edge_type    81081 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 3.7+ MB


In [None]:
# Summary statistics for every column, including the non-numeric 'label'.
plot_df.describe(include="all")

In [42]:
# Display the per-edge table.
plot_df

Unnamed: 0,path_length,label,u,v,ts,edge_type
0,2,true_link,25,15,8016,41
1,2,true_link,5737,74,8016,9
2,2,true_link,88,8,8016,1
3,2,true_link,891,649,8016,9
4,2,true_link,132,115,8016,37
...,...,...,...,...,...,...
81076,5,false_link,3509,3857,8736,18
81077,4,false_link,3509,1756,8736,18
81078,6,false_link,3509,5094,8736,18
81079,5,false_link,3509,2055,8736,18


In [43]:
# Keep only edges for which a positive path was found (path_length 0 means none).
pos_df = plot_df.loc[plot_df["path_length"].gt(0)].copy()
pos_df

Unnamed: 0,path_length,label,u,v,ts,edge_type
0,2,true_link,25,15,8016,41
1,2,true_link,5737,74,8016,9
2,2,true_link,88,8,8016,1
3,2,true_link,891,649,8016,9
4,2,true_link,132,115,8016,37
...,...,...,...,...,...,...
81076,5,false_link,3509,3857,8736,18
81077,4,false_link,3509,1756,8736,18
81078,6,false_link,3509,5094,8736,18
81079,5,false_link,3509,2055,8736,18


In [44]:
# Row counts per path length, restricted to true links that have a path.
temp = pos_df.loc[pos_df["label"].eq("true_link")].groupby("path_length").count()
temp

Unnamed: 0_level_0,label,u,v,ts,edge_type
path_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,5366,5366,5366,5366,5366
3,1389,1389,1389,1389,1389
4,250,250,250,250,250
5,64,64,64,64,64
6,7,7,7,7,7
7,4,4,4,4,4


In [45]:
# Express each path-length count as a percentage of its column total.
temp.div(temp.sum()).mul(100)

Unnamed: 0_level_0,label,u,v,ts,edge_type
path_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,75.79096,75.79096,75.79096,75.79096,75.79096
3,19.618644,19.618644,19.618644,19.618644,19.618644
4,3.531073,3.531073,3.531073,3.531073,3.531073
5,0.903955,0.903955,0.903955,0.903955,0.903955
6,0.09887,0.09887,0.09887,0.09887,0.09887
7,0.056497,0.056497,0.056497,0.056497,0.056497


In [46]:
# Row counts per path length, restricted to false links that have a path.
pos_df.loc[pos_df["label"].eq("false_link")].groupby("path_length").count()

Unnamed: 0_level_0,label,u,v,ts,edge_type
path_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,12314,12314,12314,12314,12314
4,25785,25785,25785,25785,25785
5,22337,22337,22337,22337,22337
6,5418,5418,5418,5418,5418
7,1813,1813,1813,1813,1813


In [None]:
def calculate_metrics(group, ties="optimistic"):
    """
    Calculates rank, MRR and Hits@K for the predictions of a single query.

    The group is expected to contain one 'true_link' row and any number of
    'false_link' rows. A lower positive 'path_length' is a better score. A
    'path_length' of 0 (or less, or NaN) means no path was found and is
    treated as the worst possible score rather than the best.

    Args:
        group: DataFrame with at least the columns 'label' and 'path_length'.
        ties: how false links with the same score as the true link count:
            'optimistic'  - ties do not hurt the true link (default; this is
                            the strict '<' comparison the code always used),
            'pessimistic' - every tied false link ranks ahead of the true link,
            'average'     - half of the tied false links rank ahead.

    Returns:
        pd.Series with the keys 'rank', 'mrr', 'hits@1', 'hits@3', 'hits@10'.
        All values are NaN when the group has no true link, so that such
        groups drop out of averages instead of counting as perfect hits.

    Raises:
        ValueError: if `ties` is not one of the supported policies.
    """
    if ties not in ("optimistic", "pessimistic", "average"):
        raise ValueError(
            f"ties must be 'optimistic', 'pessimistic' or 'average', got {ties!r}"
        )

    is_true = group['label'] == 'true_link'
    if not is_true.any():
        # Nothing to rank: report missing values rather than a perfect score.
        nan = float('nan')
        return pd.Series({
            'rank': nan, 'mrr': nan, 'hits@1': nan, 'hits@3': nan, 'hits@10': nan
        })

    # Map the "no path" sentinel to +inf so a missing path never beats a real one.
    scores = group['path_length'].astype(float)
    scores = scores.where(scores > 0, float('inf'))

    # Lower path_length is better; use the best score among the true rows.
    true_score = scores[is_true].min()
    false_scores = scores[group['label'] == 'false_link']

    strictly_better = int((false_scores < true_score).sum())
    tied = int((false_scores == true_score).sum())

    if ties == "optimistic":
        rank = 1 + strictly_better
    elif ties == "pessimistic":
        rank = 1 + strictly_better + tied
    else:
        rank = 1 + strictly_better + tied / 2

    return pd.Series({
        'rank': rank,
        'mrr': 1.0 / rank,
        'hits@1': 1.0 if rank <= 1 else 0.0,
        'hits@3': 1.0 if rank <= 3 else 0.0,
        'hits@10': 1.0 if rank <= 10 else 0.0
    })


In [None]:

# Group by the query identifiers.
# Each group is one ranking query: all rows that share the same
# (u, v_pos, ts, edge_type). Note that the key uses 'v_pos', not 'v'.
# NOTE(review): presumably 'v_pos' holds the true target of the query, so that
# negative samples with a different 'v' still land in the same group as their
# true link — verify against the loader.
grouped = plot_df.groupby(['u', 'v_pos', 'ts', 'edge_type'])


In [None]:

# Apply the function to each group and get the results.
# Only the two columns calculate_metrics reads are handed to it. Passing the
# grouping columns into apply is deprecated in recent pandas, and the group keys
# still come back as columns through reset_index().
metrics_df = grouped[['label', 'path_length']].apply(calculate_metrics).reset_index()


In [None]:

# Show the per-query metrics table (one row per group key).
print("Metrics per query:")
metrics_df


In [None]:

# Mean of each metric over all queries, expressed in percent.
print("\nAverage metrics across all queries:")
metrics_df.loc[:, ['mrr', 'hits@1', 'hits@3', 'hits@10']].mean().mul(100)

In [None]:
# Distribution of the true link's rank over all queries. Title and axis labels
# are set so the figure can be read on its own; the trailing ';' hides the repr.
ax = metrics_df['rank'].plot.hist(bins=10, edgecolor='black', alpha=0.7)
ax.set(title='Rank of the true link per query', xlabel='Rank', ylabel='Number of queries');

In [None]:
# Number of queries at each rank value (the count is repeated in every column).
metrics_df.groupby('rank').count()

In [None]:
# edges_fp = os.path.join(config_data['storage_dir'], f"{config_data['dataset']}_edges.csv")
# df = pd.read_csv(edges_fp, index_col='edge_id')            
# split_map = {str(idx): row['split'] for idx, row in df.iterrows()}

In [None]:
# df.info()

In [None]:
# len(split_map.keys())

In [None]:
# list(split_map.values())[-1]

In [None]:
# split_code = {'pre': 0, 'train': 1, 'valid': 2, 'test': 3}

# print(f"Setting up data for split: {split}")

# pos_paths = {}
# with open(os.path.join(config_data['storage_dir'], f"{config_data['dataset']}_paths.txt")) as f:
#     n_str = f.readline()
#     n = int(n_str) if n_str and n_str.strip() else 0
#     for _ in tqdm(range(n)):
#         eid = f.readline().strip()
#         if not eid:
#             break
#         hops = int(f.readline())
#         nodes = [int(u) for u in f.readline().split()]
#         node_types = [int(t) for t in f.readline().split()]
#         edge_types_str = f.readline().strip().split()
#         edge_types = [int(et) for et in edge_types_str if et]
        
#         edge_timestamps_str = f.readline().strip().split()
#         edge_timestamps = [int(ts) for ts in edge_timestamps_str if ts]

#         if split_map.get(eid) == split_code[split]:
#             pos_paths[eid] = {
#                 "hops": hops,
#                 "nodes": nodes,
#                 "node_types": node_types,
#                 "edge_types": edge_types,
#                 "edge_timestamps": edge_timestamps
#             }


In [None]:
# len(pos_paths.keys())