# Here we use the k=12 k-mer output to build the hypergraph; to use a different k, point csv_path at that k's output file and pass the matching k.

In [None]:
!pip install torch pandas numpy

# Step 2: Import libraries
import torch
import numpy as np
import pandas as pd
from typing import Dict, List, Set, Tuple
import os



In [None]:
# Mount Google Drive at /content/drive; the input CSV and the output directory
# used later in this notebook are paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def parse_segmented_kmers(segmented_string: str) -> List[str]:
    """Split a semicolon-delimited k-mer string into a list of k-mers.

    Whitespace around each piece is stripped, and pieces that are empty after
    stripping are dropped. Order and duplicates are preserved.
    """
    pieces = (piece.strip() for piece in segmented_string.split(';'))
    return [piece for piece in pieces if piece]

In [None]:
def build_drugbank_hypergraph(
    csv_path: str,
    k: int = 3,
    save_dir: str = '/content/drive/MyDrive/MLHygnn/DB/hypergraphs/'
) -> Tuple[torch.Tensor, Dict, Dict, Dict]:
    """Build and save a substructure/drug hypergraph from pre-segmented k-mers.

    Each unique k-mer becomes a node and each drug (CSV row) becomes a
    hyperedge. The result is an incidence list with one ``[node_idx, edge_idx]``
    row per k-mer occurrence, so a k-mer repeated inside one drug yields
    repeated rows.

    Args:
        csv_path: CSV file with a ``Drug_ID`` column and a semicolon-separated
            k-mer column named one of ``segmented_smile``, ``Segmented_SMILE``,
            ``kmers`` or ``Kmers`` (first match wins).
        k: k-mer size. It is used only for labelling (log output, output file
            names, metadata); the k-mers themselves are read as-is from the CSV.
        save_dir: Directory for the output files; created if missing.

    Returns:
        ``(edge_list_tensor, node_to_idx, drug_to_idx, stats)`` where
        ``edge_list_tensor`` is a ``torch.long`` tensor of shape
        ``(num_connections, 2)``, ``node_to_idx`` maps k-mer -> node index
        (sorted order), ``drug_to_idx`` maps Drug_ID -> row position, and
        ``stats`` holds summary counts and ratios.

    Raises:
        ValueError: If none of the recognised k-mer columns is present.

    Side effects:
        Writes ``hyG_drug_{n}_kmer_{k}.pt`` (the tensor) and
        ``hyG_drug_{n}_kmer_{k}_metadata.pt`` (a dict) into ``save_dir``,
        where ``n`` is the number of CSV rows.
    """
    print(f"\n{'='*60}")
    print(f"Building Hypergraph for DrugBank with k={k}")
    print(f"{'='*60}")

    os.makedirs(save_dir, exist_ok=True)

    print(f"Loading data from: {csv_path}")
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} drugs")

    # Find k-mer column (first recognised name wins)
    candidate_cols = ('segmented_smile', 'Segmented_SMILE', 'kmers', 'Kmers')
    kmer_col = next((col for col in candidate_cols if col in df.columns), None)
    if kmer_col is None:
        raise ValueError("No k-mer column found")

    print(f"Using k-mer column: '{kmer_col}'")

    all_substructures = set()  # Vocabulary: unique k-mers only
    drug_substructures = {}    # Per drug: ALL k-mers, duplicates included
    num_missing = 0

    print("Parsing pre-segmented k-mers...")
    total = len(df)
    # Iterate by row position so the progress counter is correct even when the
    # DataFrame index is not a 0..n-1 range.
    for position, (drug_id, raw_kmers) in enumerate(zip(df['Drug_ID'], df[kmer_col])):
        if position % 100 == 0:
            print(f"  Processing drug {position+1}/{total}...", end='\r')

        if pd.isna(raw_kmers):
            # An empty CSV cell is read as NaN; str(NaN) would otherwise turn
            # into a bogus "nan" k-mer shared by every drug with missing data.
            kmers = []
            num_missing += 1
        else:
            kmers = parse_segmented_kmers(str(raw_kmers))

        # Keep duplicates for this drug (a list, deliberately not a set)
        drug_substructures[drug_id] = kmers
        all_substructures.update(kmers)

    print(f"\nParsing complete!")
    if num_missing:
        print(f"  Warning: {num_missing} drugs have a missing k-mer value and get an empty hyperedge")

    # Create mappings; sorting makes node indices deterministic across runs
    node_to_idx = {sub: idx for idx, sub in enumerate(sorted(all_substructures))}
    drug_to_idx = {drug_id: idx for idx, drug_id in enumerate(df['Drug_ID'])}

    if len(drug_to_idx) != len(df):
        # Duplicate IDs collapse onto the last occurrence (both its k-mers and
        # its row position), leaving earlier row positions without connections.
        print(f"  Warning: {len(df) - len(drug_to_idx)} duplicate Drug_ID rows; only the last occurrence of each is used")

    print(f"  Unique substructures (nodes): {len(node_to_idx)}")
    print(f"  Drugs (hyperedges): {len(drug_to_idx)}")

    # Build edge list: one row per k-mer occurrence
    print("Building hypergraph edge list...")
    edge_list = []
    for drug_id, substructures in drug_substructures.items():
        edge_idx = drug_to_idx[drug_id]
        for substructure in substructures:
            edge_list.append([node_to_idx[substructure], edge_idx])

    # reshape keeps the (N, 2) layout even when there are no connections
    # (torch.tensor([]) alone would be 1-D with shape (0,)).
    edge_list_tensor = torch.tensor(edge_list, dtype=torch.long).reshape(-1, 2)
    print(f"Total connections created: {len(edge_list)}")

    # Save files
    output_file = os.path.join(save_dir, f'hyG_drug_{len(df)}_kmer_{k}.pt')
    torch.save(edge_list_tensor, output_file)
    print(f"\nHypergraph saved to: {output_file}")

    num_nodes = len(node_to_idx)
    num_drugs = len(drug_to_idx)
    num_connections = len(edge_list)

    metadata = {
        'num_drugs': num_drugs,
        'num_substructures': num_nodes,
        'num_connections': num_connections,
        'k': k,
        'drug_to_idx': drug_to_idx,
        'node_to_idx': node_to_idx
    }

    metadata_file = os.path.join(save_dir, f'hyG_drug_{len(df)}_kmer_{k}_metadata.pt')
    torch.save(metadata, metadata_file)

    # Guard every ratio so an empty table reports 0.0 instead of raising
    # ZeroDivisionError after the files have already been written.
    incidence_cells = num_nodes * num_drugs
    stats = {
        'num_nodes': num_nodes,
        'num_edges': num_drugs,
        'num_connections': num_connections,
        'density': num_connections / incidence_cells if incidence_cells else 0.0,
        'avg_substructures_per_drug': num_connections / num_drugs if num_drugs else 0.0,
        'avg_drugs_per_substructure': num_connections / num_nodes if num_nodes else 0.0
    }

    print("\nHypergraph Statistics:")
    for key, value in stats.items():
        if 'num' in key:
            print(f"  {key}: {value:,}")
        else:
            print(f"  {key}: {value:.4f}")

    return edge_list_tensor, node_to_idx, drug_to_idx, stats

In [None]:

def verify_hypergraph(file_path: str):
    """Load a saved hypergraph edge list and print a summary of its contents.

    Args:
        file_path: Path to the ``.pt`` file holding the edge-list tensor. Each
            row is expected to be ``[node_idx, drug_idx]``.

    Returns:
        The loaded object, or ``None`` if ``file_path`` does not exist.

    If a sibling file named ``<stem>_metadata<ext>`` exists next to
    ``file_path``, its ``k`` and average k-mers per drug are printed as well.
    """
    print(f"\n{'='*60}")
    print(f"Verifying: {file_path}")
    print(f"{'='*60}")

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    data = torch.load(file_path)

    print(f"Shape: {data.shape}")
    if data.dim() != 2 or data.shape[1] != 2:
        # Column indexing below would raise on anything but an (N, 2) tensor;
        # report the mismatch instead of crashing.
        print("Unexpected shape: expected (num_connections, 2)")
        return data

    print(f"First 10 rows:\n{data[:10]}")
    print(f"\nUnique nodes: {len(torch.unique(data[:, 0]))}")
    print(f"Unique drugs: {len(torch.unique(data[:, 1]))}")
    print(f"Total connections: {data.shape[0]}")

    # Load metadata if available. Only the final extension is rewritten:
    # str.replace('.pt', ...) would also alter any '.pt' occurring earlier in
    # the path (e.g. in a directory name) and miss the metadata file.
    stem, ext = os.path.splitext(file_path)
    metadata_path = f"{stem}_metadata{ext}"
    if os.path.exists(metadata_path):
        metadata = torch.load(metadata_path)
        num_drugs = metadata.get('num_drugs', 1)
        num_connections = metadata.get('num_connections', 0)
        # An empty dataset stores num_drugs == 0; avoid dividing by it.
        avg_kmers = num_connections / num_drugs if num_drugs else 0.0
        print(f"\nMetadata:")
        print(f"  k-mer size: {metadata.get('k')}")
        print(f"  Avg k-mers/drug: {avg_kmers:.2f}")

    return data


In [None]:

if __name__ == "__main__":
    # Input: the pre-segmented, semicolon-separated k-mer table for k = 12.
    kmer_csv = '/content/drive/MyDrive/MLHygnn/DB/kmer_results_simple1709drugs/drugbank_kmers_k12.csv'
    # Output written by the build step for 1709 drugs and k = 12.
    saved_graph = '/content/drive/MyDrive/MLHygnn/DB/hypergraphs/hyG_drug_1709_kmer_12.pt'

    # Build the hypergraph and keep the results available for later cells.
    edge_list, node_vocab, drug_vocab, stats = build_drugbank_hypergraph(csv_path=kmer_csv, k=12)

    # Reload the saved tensor and print a summary as a sanity check.
    verify_hypergraph(saved_graph)


Building Hypergraph for DrugBank with k=12
Loading data from: /content/drive/MyDrive/MLHygnn/DB/kmer_results_simple1709drugs/drugbank_kmers_k12.csv
Loaded 1709 drugs
Using k-mer column: 'Segmented_SMILE'
Parsing pre-segmented k-mers...
  Processing drug 1701/1709...
Parsing complete!
  Unique substructures (nodes): 43655
  Drugs (hyperedges): 1709
Building hypergraph edge list...
Total connections created: 91615

Hypergraph saved to: /content/drive/MyDrive/MLHygnn/DB/hypergraphs/hyG_drug_1709_kmer_12.pt

Hypergraph Statistics:
  num_nodes: 43,655
  num_edges: 1,709
  num_connections: 91,615
  density: 0.0012
  avg_substructures_per_drug: 53.6074
  avg_drugs_per_substructure: 2.0986

Verifying: /content/drive/MyDrive/MLHygnn/DB/hypergraphs/hyG_drug_1709_kmer_12.pt
Shape: torch.Size([91615, 2])
First 10 rows:
tensor([[30933,     0],
        [32283,     0],
        [40042,     0],
        [27117,     0],
        [17936,     0],
        [33099,     0],
        [41421,     0],
        [ 18