In [1]:
# --- Imports ---
import pandas as pd
import igraph as ig
import pyranges as pr

In [2]:
def process_nw(fname: str) -> pd.DataFrame:
    """Read a tab-separated network table and tag every row with an enhancer ID.

    Rows that share the same (chrom, chromStart, chromEnd) receive the same
    ``EID``. IDs look like ``E000123``; the number is the dense rank of the
    ``chrom_start_end`` key string, so numbering follows the sort order of
    that string, not genomic position.

    Parameters
    ----------
    fname : str
        Path to the tab-separated file. It must provide the columns
        ``chrom``, ``chromStart`` and ``chromEnd``.

    Returns
    -------
    pd.DataFrame
        The loaded table with an added ``EID`` column.
    """
    table = pd.read_csv(fname, sep='\t')

    # One string key per row, e.g. "chr1_9823_10674"
    coord_cols = ['chrom', 'chromStart', 'chromEnd']
    region_key = table[coord_cols].astype(str).agg('_'.join, axis=1)

    # Dense ranking maps identical keys to identical, gap-free integers
    region_rank = region_key.rank(method='dense').astype(int)
    table['EID'] = region_rank.map("E{:06d}".format)

    return table

In [3]:
# Load the hiPSC enhancer-gene (E-G) table; process_nw adds the EID column
nw = process_nw('hiPSC/hiPSC.network.tsv')

# Sanity check: print (rows, columns), then display the first five rows
print(nw.shape)
nw.head(5)

(67568, 10)


Unnamed: 0,chrom,chromStart,chromEnd,name,class,targetGene,targetEnsemblID,cellType,MPRabc,EID
0,chr1,9823,10674,intergenic|chr1:9823-10674,intergenic,FAM138F,ENSG00000282591,hiPSC,0.900603,E022596
1,chr1,9823,10674,intergenic|chr1:9823-10674,intergenic,FAM138A,ENSG00000237613,hiPSC,0.900603,E022596
2,chr1,10690,11574,intergenic|chr1:10690-11574,intergenic,FAM138F,ENSG00000282591,hiPSC,0.472076,E018195
3,chr1,10690,11574,intergenic|chr1:10690-11574,intergenic,FAM138A,ENSG00000237613,hiPSC,0.472076,E018195
4,chr1,28550,29598,intergenic|chr1:28550-29598,intergenic,FAM138F,ENSG00000282591,hiPSC,0.936529,E021276


In [4]:
# Function: label substructures of E-G interactions
def label_clusters(cluster: list) -> str:
    """Classify one community of an enhancer-gene graph by its node composition.

    A node counts as an enhancer only when its whole name has the ``E<digits>``
    shape written by ``process_nw`` (``E`` followed by at least six digits and
    nothing else). Every other node counts as a gene. Checking the whole name
    matters: a looser "starts with E, ends in six digits" test would also
    match Ensembl gene IDs such as ``ENSG00000282591`` and count them as
    enhancers.

    Parameters
    ----------
    cluster : list
        Node names belonging to one community.

    Returns
    -------
    str
        ``'N/A'`` - fewer than two nodes;
        ``'1NR'`` - exactly two nodes;
        ``'1R'``  - three or more nodes, all but one are enhancers;
        ``'2NR'`` - three or more nodes, all but one are genes;
        ``'2R'``  - anything else.
    """
    size = len(cluster)
    if size < 2:
        return 'N/A'
    if size == 2:
        return '1NR'

    # Non-string nodes (e.g. a missing gene name) are treated as genes
    # rather than raising AttributeError on .startswith.
    enhancer_count = sum(
        1
        for node in cluster
        if isinstance(node, str)
        and len(node) >= 7
        and node.startswith("E")
        and node[1:].isdigit()
    )
    gene_count = size - enhancer_count

    if enhancer_count == size - 1:
        return '1R'
    if gene_count == size - 1:
        return '2NR'
    return '2R'

In [5]:
# Function: calculate the distribution of substructures in a network 
def cluster_distribution(clusters: dict):
    """Print the percentage of clusters that fall under each substructure label.

    Parameters
    ----------
    clusters : dict
        Mapping whose values are the node lists of individual clusters;
        the keys are not used.

    Returns
    -------
    None
        One ``label: xx.xx%`` line is printed per label, in the order
        1NR, 1R, 2NR, 2R, N/A. If there are no clusters, a short notice
        is printed instead.
    """
    labels = [label_clusters(members) for members in clusters.values()]

    if not labels:
        print('No clusters to summarize.')
        return

    # Start every known label at zero so absent labels still print as 0.00%
    tally = dict.fromkeys(('1NR', '1R', '2NR', '2R', 'N/A'), 0)
    for label in labels:
        tally[label] += 1

    n_clusters = len(labels)
    for label in tally:
        print(f"{label}: {tally[label] / n_clusters * 100:.2f}%")

In [6]:
# Function: cluster network
def cluster_network(df: pd.DataFrame, seed=None) -> dict:
    """Partition the enhancer-gene graph into communities.

    The unique (EID, targetGene) pairs of ``df`` become the edges of an
    undirected graph, which is then clustered with igraph's multilevel
    (Louvain) method. Multilevel is used instead of edge betweenness
    because the latter proved too slow on this network.

    Parameters
    ----------
    df : pd.DataFrame
        Table with at least the columns ``EID`` and ``targetGene``.
    seed : int or None, optional
        The multilevel method is randomised, so repeated runs can give
        different partitions. When a seed is given, Python's global
        ``random`` module (which python-igraph uses as its random source by
        default) is seeded before clustering to make the result
        reproducible. Note that this reseeds the global generator. With the
        default ``None`` nothing is seeded.

    Returns
    -------
    dict
        Community index -> list of node names (EIDs and gene names) in
        that community.
    """
    edges = df.loc[:, ['EID', 'targetGene']].drop_duplicates()

    graph = ig.Graph.TupleList(
        list(edges.itertuples(index=False, name=None)), directed=False
    )
    node_labels = graph.vs['name']

    if seed is not None:
        import random
        random.seed(seed)

    communities = graph.community_multilevel()

    # Each community is a list of vertex indices; translate them to names
    return {
        community_id: [node_labels[vertex] for vertex in members]
        for community_id, members in enumerate(communities)
    }

In [7]:
# Distribution: cluster the E-G network, then print the percentage of
# communities that fall under each substructure label
cluster_distribution(cluster_network(nw))

1NR: 44.08%
1R: 30.31%
2NR: 8.42%
2R: 17.19%
N/A: 0.00%


In [8]:
# Display the network table, including the EID column added by process_nw
nw

Unnamed: 0,chrom,chromStart,chromEnd,name,class,targetGene,targetEnsemblID,cellType,MPRabc,EID
0,chr1,9823,10674,intergenic|chr1:9823-10674,intergenic,FAM138F,ENSG00000282591,hiPSC,0.900603,E022596
1,chr1,9823,10674,intergenic|chr1:9823-10674,intergenic,FAM138A,ENSG00000237613,hiPSC,0.900603,E022596
2,chr1,10690,11574,intergenic|chr1:10690-11574,intergenic,FAM138F,ENSG00000282591,hiPSC,0.472076,E018195
3,chr1,10690,11574,intergenic|chr1:10690-11574,intergenic,FAM138A,ENSG00000237613,hiPSC,0.472076,E018195
4,chr1,28550,29598,intergenic|chr1:28550-29598,intergenic,FAM138F,ENSG00000282591,hiPSC,0.936529,E021276
...,...,...,...,...,...,...,...,...,...,...
67563,chrX,156030253,156030753,intergenic|chrX:156030253-156030753,intergenic,MPP1,ENSG00000130830,hiPSC,0.319013,E044296
67564,chrX,156030253,156030753,intergenic|chrX:156030253-156030753,intergenic,RAB39B,ENSG00000155961,hiPSC,0.457016,E044296
67565,chrX,156030253,156030753,intergenic|chrX:156030253-156030753,intergenic,TMLHE,ENSG00000185973,hiPSC,0.395534,E044296
67566,chrX,156030253,156030753,intergenic|chrX:156030253-156030753,intergenic,SPRY3,ENSG00000168939,hiPSC,0.685451,E044296


In [9]:
# Summary counts for the E-G network: unique enhancers, genes and links.
# (pandas is already imported in the first cell; no re-import is needed here.)

# NOTE(review): `df` is the same object as `nw`, not a copy, so the
# `enhancer_id` column added below also appears on `nw`.
df = nw

# Enhancer key built from its coordinates, e.g. "chr1:9823-10674"
df["enhancer_id"] = (
    df["chrom"]
    + ":" + df["chromStart"].astype(str)
    + "-" + df["chromEnd"].astype(str)
)

# Number of unique enhancers
num_unique_enhancers = df["enhancer_id"].nunique()

# Number of unique genes (use Ensembl IDs to avoid alias issues)
num_unique_genes = df["targetEnsemblID"].nunique()

# Number of unique enhancer–gene links (rows repeating the same pair count once)
num_unique_links = len(df.drop_duplicates(subset=["enhancer_id", "targetEnsemblID"]))

print("Unique enhancers:", num_unique_enhancers)
print("Unique genes:", num_unique_genes)
print("Unique enhancer–gene links:", num_unique_links)


Unique enhancers: 45294
Unique genes: 14436
Unique enhancer–gene links: 67536
