In [None]:
# Matrix Manipulation/Management Libraries
import pandas as pd
import numpy as np

# Data Visualization Libraries
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact

# Bioinformatics Libraries
import pybedtools

# Statistical Tests, Machine Learning, etc. Libraries
import igraph as ig
from sklearn.preprocessing import MinMaxScaler

# Input/Output Libraries
import os
import subprocess

# Miscellaneous Libraries 
import time
from typing import List

In [None]:
def calculate_overlaps(df1, df2):
    """Count overlapping interval pairs between two interval tables.

    Both inputs must provide 'Chrom', 'Start' and 'End' columns; any other
    columns are ignored. Each table is reduced to its distinct
    (Chrom, Start, End) rows, every df1 interval is paired with every df2
    interval on the same chromosome, and a pair counts as a hit when the two
    closed intervals intersect (touching end points count as overlapping).

    Returns:
        int: the number of distinct overlapping (df1 interval, df2 interval)
        pairs. One df1 interval hitting two df2 intervals contributes two.
    """
    coordinate_columns = ['Chrom', 'Start', 'End']
    left_intervals = df1[coordinate_columns].drop_duplicates()
    right_intervals = df2[coordinate_columns].drop_duplicates()

    # Per-chromosome cross join: one row per candidate (left, right) pair.
    pairs = left_intervals.merge(right_intervals, on='Chrom', suffixes=('_x', '_y'))

    # Coordinates may have been read as strings or floats; compare as integers.
    pairs = pairs.astype({
        'Start_x': int,
        'End_x': int,
        'Start_y': int,
        'End_y': int,
    })

    left_starts_in_time = pairs['Start_x'].le(pairs['End_y'])
    left_ends_late_enough = pairs['End_x'].ge(pairs['Start_y'])
    overlapping_pairs = pairs.loc[left_starts_in_time & left_ends_late_enough]

    # The integer cast can collapse rows that were distinct before it, so
    # deduplicate once more before counting.
    return len(overlapping_pairs.drop_duplicates())

def permutation_test_for_enrichment(unfiltered_edges_path, filtered_edges_path, mpra_dataset, permutations=1000, random_state=None):
    """Permutation test for enrichment of MPRA overlaps in the filtered network.

    The observed statistic is ``calculate_overlaps(filtered, mpra)``. The null
    distribution is built by repeatedly drawing, without replacement, as many
    unique enhancers from the unfiltered network as the filtered network
    contains and counting their overlaps with the MPRA table.

    Args:
        unfiltered_edges_path: tab-separated file with at least 'Chrom',
            'Start' and 'End' columns (background enhancer pool).
        filtered_edges_path: tab-separated file with the same columns
            (network under test).
        mpra_dataset: tab-separated file with the same columns.
        permutations: number of random draws used for the null distribution.
        random_state: optional integer seed. When ``None`` (default) sampling
            uses NumPy's global random state, exactly as before.

    Returns:
        tuple: ``(actual_overlaps, p_value, average_hits_per_permutation)``
        where ``p_value`` is the fraction of draws whose overlap count is at
        least the observed one. It can therefore be exactly 0 when no draw
        reaches the observed count.

    Raises:
        ValueError: propagated from ``DataFrame.sample`` when the unfiltered
            network has fewer unique enhancers than the filtered one.
    """
    interval_columns = ['Chrom', 'Start', 'End']

    # Load the datasets
    unfiltered_df = pd.read_csv(unfiltered_edges_path, delimiter='\t')
    filtered_df = pd.read_csv(filtered_edges_path, delimiter='\t')
    mpra_df = pd.read_csv(mpra_dataset, delimiter='\t')  # Assuming similar structure/format

    # Calculate actual overlaps with filtered network
    actual_overlaps = calculate_overlaps(filtered_df, mpra_df)

    # Get the number of unique enhancers in the filtered network
    filtered_enhancers = filtered_df[interval_columns].drop_duplicates()
    number_of_filtered_enhancers = filtered_enhancers.shape[0]
    print(number_of_filtered_enhancers)

    # Deduplicate the background BEFORE sampling. The edge table repeats an
    # enhancer once per linked gene, and calculate_overlaps deduplicates its
    # input, so sampling raw edge rows would give each permuted set fewer
    # unique enhancers than the filtered set and bias the null toward fewer hits.
    unfiltered_enhancers = unfiltered_df[interval_columns].drop_duplicates()

    # One generator shared across all draws; reusing the bare seed on every
    # iteration would return the same sample each time.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Permutation test
    permutation_overlaps = []
    for _ in range(permutations):
        sampled_enhancers = unfiltered_enhancers.sample(
            n=number_of_filtered_enhancers, replace=False, random_state=rng
        )
        permutation_overlaps.append(calculate_overlaps(sampled_enhancers, mpra_df))

    # Calculate empirical p-value
    p_value = np.mean(np.array(permutation_overlaps) >= actual_overlaps)

    # Calculate the average number of hits per permuted network
    average_hits_per_permutation = np.mean(permutation_overlaps)

    return actual_overlaps, p_value, average_hits_per_permutation

# Run the MPRA enrichment test on the ABC network edge files.
# With a single permutation the empirical p-value can only be 0.0 or 1.0.
enrichment_kwargs = dict(
    unfiltered_edges_path='Network/Components/ABC_Network_Unfiltered_Edges.tsv',
    filtered_edges_path='Network/ABC_Network_Filtered_Edges.tsv',
    mpra_dataset='MPRA/Processed_Pertubation_MPRA.tsv',
    permutations=1,  # Adjust the number of permutations as needed
)
actual_overlaps, p_value, average_hits_per_permutation = permutation_test_for_enrichment(**enrichment_kwargs)

print(
    f'Hits in the true (filtered) network: {actual_overlaps}',
    f'Empirical p-value: {p_value}',
    f'Average hits per permuted network: {average_hits_per_permutation}',
    sep='\n',
)

In [None]:
# Label Substructures of Enhancer-Promoter Interactions

def label_clusters(cluster: list):
    """Classify a cluster of node names by its enhancer/gene composition.

    Members whose name starts with "Module_" are counted as enhancer modules;
    every other member is counted as a gene.

    Returns one of four labels:
      * "Enhancer --> Gene (1:1)"     - exactly two members
      * "Enhancers --> Gene (2+:1)"   - more than two members, all but one are modules
      * "Enhancer --> Genes (1:2+)"   - more than two members, all but one are genes
      * "Enhancers --> Genes (2+:2+)" - everything else, including clusters
        with fewer than two members
    """
    size = len(cluster)

    if size == 2:
        return "Enhancer --> Gene (1:1)"

    if size > 2:
        n_modules = sum(1 for member in cluster if member.startswith("Module_"))
        n_genes = size - n_modules
        if n_modules == size - 1:
            return "Enhancers --> Gene (2+:1)"
        if n_genes == size - 1:
            return "Enhancer --> Genes (1:2+)"

    return "Enhancers --> Genes (2+:2+)"

In [None]:
# Clustering Approach 

def cluster_betweenness(file_name: str):
    """Cluster the module-gene network with edge-betweenness community detection.

    Reads a tab-separated edge table, keeps the distinct ("Module", "Gene")
    pairs, builds a directed igraph graph from them and cuts the
    ``community_edge_betweenness`` dendrogram into a clustering.

    Args:
        file_name: path to a tab-separated file with "Module" and "Gene" columns.

    Returns:
        dict: community index -> list of node names belonging to that community.
    """
    edge_table = pd.read_csv(file_name, sep='\t')
    edge_table = edge_table[["Module", "Gene"]].drop_duplicates()

    edge_list = list(map(tuple, edge_table.values))
    graph = ig.Graph.TupleList(edge_list, directed=True)
    vertex_names = graph.vs['name']

    partition = graph.community_edge_betweenness().as_clustering()

    # Group node names by the community they were assigned to.
    members_by_community = {}
    for community_id, vertex_indices in enumerate(partition):
        for vertex_index in vertex_indices:
            members_by_community.setdefault(community_id, []).append(vertex_names[vertex_index])

    return members_by_community

In [None]:
import pandas as pd

# Load the interval tables used to annotate the network with variants.
# BED files carry three mandatory columns (chrom, start, end) optionally
# followed by more. `usecols` pins the names to the first three columns:
# without it, read_csv given fewer names than columns uses the leading
# columns as the index and assigns the names to the trailing ones.
bed_columns = ['Chrom', 'Start', 'End']

enhancer_regions_bed_path = 'Network/ABC_Network_Filtered_Regions.bed'

enhancer_regions_df = pd.read_csv(enhancer_regions_bed_path, sep='\t', header=None, usecols=[0, 1, 2], names=bed_columns)

variants_bed_path = 'Stephan_QuarterMil_denovoSNPs_posAllelePheno.bed'
variants_df = pd.read_csv(variants_bed_path, sep='\t', header=None, usecols=[0, 1, 2], names=bed_columns)

# Edge table of the filtered network; read with its own header row.
network_file_path = 'Network/ABC_Network_Filtered_Edges.tsv'
network_df = pd.read_csv(network_file_path, sep='\t')

def check_variant_presence(row, variants_df):
    """Tell whether any variant lies entirely inside the interval given by `row`.

    Args:
        row: mapping or Series providing 'Chrom', 'Start' and 'End'.
        variants_df: DataFrame with 'Chrom', 'Start' and 'End' columns.

    Returns:
        bool: True when at least one variant is on the same chromosome with
        ``Start >= row['Start']`` and ``End <= row['End']``.
    """
    on_same_chrom = variants_df['Chrom'].eq(row['Chrom'])
    starts_inside = variants_df['Start'].ge(row['Start'])
    ends_inside = variants_df['End'].le(row['End'])

    contained = on_same_chrom & starts_inside & ends_inside
    return bool(contained.any())

# Flag every network row whose (Chrom, Start, End) interval fully contains at
# least one variant. The check runs once per row against the whole variants table.
network_df['Has_Variant'] = network_df.apply(check_variant_presence, variants_df=variants_df, axis=1)

# Rows can repeat the same interval, so count flagged intervals once each.
unique_enhancers_with_variants = network_df[network_df['Has_Variant']].drop_duplicates(subset=['Chrom', 'Start', 'End'])
print(f"Number of unique enhancers with variants: {len(unique_enhancers_with_variants)}")

# Display the annotated table as the cell output.
network_df

In [None]:
def cluster_distribution_with_variants(clusters: dict, network_data_with_variants: pd.DataFrame):
    """Print, per cluster label, how many clusters exist and how many carry a variant.

    Each cluster is labelled with ``label_clusters``. A cluster counts as
    "with variants" when any row of ``network_data_with_variants`` whose
    'Module' is a member of the cluster has a truthy 'Has_Variant'.

    Args:
        clusters: cluster id -> list of node names.
        network_data_with_variants: DataFrame with 'Module' and 'Has_Variant' columns.

    Returns:
        None. The summary is written to stdout.
    """
    label_names = (
        "Enhancer --> Gene (1:1)",
        "Enhancers --> Gene (2+:1)",
        "Enhancer --> Genes (1:2+)",
        "Enhancers --> Genes (2+:2+)",
    )
    tally = {name: {"count": 0, "with_variants": 0} for name in label_names}

    for members in clusters.values():
        bucket = tally[label_clusters(members)]
        bucket["count"] += 1

        in_cluster = network_data_with_variants['Module'].isin(members)
        member_rows = network_data_with_variants[in_cluster]
        if any(member_rows['Has_Variant']):
            bucket["with_variants"] += 1

    for name, bucket in tally.items():
        total = bucket["count"]
        flagged = bucket["with_variants"]
        # Guard against labels that never occurred.
        percent_with_variants = (flagged / total) * 100 if total else 0
        print(f"{name}:")
        print(f"  Total Clusters: {total}")
        print(f"  Clusters with Variants: {flagged} ({percent_with_variants:.2f}%)")

# Cluster the same edge file that `network_df` was loaded from, using the
# relative `network_file_path` rather than a machine-specific absolute path,
# so the clusters and the variant flags are guaranteed to describe one network.
clusters = cluster_betweenness(network_file_path)
cluster_distribution_with_variants(clusters, network_df)

In [None]:
# Flag network rows whose target gene appears in the ASD gene list.
asd_genes_path = 'E-P-INs/Variants/ASD_Genes.tsv'
asd_genes_df = pd.read_csv(asd_genes_path, sep='\t')

# Set membership keeps the per-row lookup constant-time.
asd_genes_set = set(asd_genes_df['Gene'])

network_df['Has_Gene'] = network_df['Gene'].apply(lambda gene: gene in asd_genes_set)

# Count each ASD gene once. Deduplicating on the enhancer coordinates would
# count enhancers linked to ASD genes rather than the genes themselves.
unique_asd_genes = network_df[network_df['Has_Gene']].drop_duplicates(subset=['Gene'])
print(f"Number of unique ASD-associated genes: {len(unique_asd_genes)}")

# Snapshot of the fully annotated table before coordinate columns are removed.
network_df_backup = network_df.copy()

In [None]:
# Collapse the annotated table to one row per distinct combination of the
# remaining columns. Deriving it from `network_df_backup` keeps this cell
# re-runnable: dropping from `network_df` itself raises KeyError on a second
# run because the columns are already gone.
# The result keeps the original row labels (the index is not reset).
network_df = (
    network_df_backup
    .drop(columns=['Chrom', 'Start', 'End', 'Sample'])
    .drop_duplicates()
)
network_df

In [None]:
import numpy as np
import pandas as pd

# Permutation test: do rows with a variant coincide with ASD-gene rows more
# often than expected when the gene flags are shuffled across rows?

# Work on plain NumPy arrays so every comparison is positional. `network_df`
# keeps a gapped index after drop_duplicates, so combining it with a shuffled
# Series whose index was reset to 0..n-1 aligns on labels, leaves most rows
# unmatched and undercounts the permuted hits.
variant_flags = (network_df['Has_Variant'] == True).to_numpy()
gene_flags = (network_df['Has_Gene'] == True).to_numpy()

actual_true_pairings = (variant_flags & gene_flags).sum()

n_permutations = 1000
count_true_pairings_permutations = []

for _ in range(n_permutations):
    # Shuffle the gene flags; the variant flags stay in place.
    shuffled_has_gene = np.random.permutation(gene_flags)

    count = (shuffled_has_gene & variant_flags).sum()
    count_true_pairings_permutations.append(count)

average_hits_permutations = np.mean(count_true_pairings_permutations)

# Fraction of permutations with at least as many joint hits as observed.
p_value = np.mean([count >= actual_true_pairings for count in count_true_pairings_permutations])

print(f"Actual number of true hits: {actual_true_pairings}")
print(f"Average number of hits in permutations: {average_hits_permutations}")
print(f"Empirical p-value: {p_value}")