In [None]:

import os
import urllib.request
from bs4 import BeautifulSoup

# Open Targets release directory (an HTML index page served over HTTP)
url = "http://ftp.ebi.ac.uk/pub/databases/opentargets/platform/25.03/output/association_overall_direct/"
save_dir = "openTargets"
os.makedirs(save_dir, exist_ok=True)

# Fetch the directory listing and parse it so its anchor tags can be scanned
with urllib.request.urlopen(url) as response:
    soup = BeautifulSoup(response.read(), "html.parser")

# Download each .parquet file linked from the listing
for link in soup.find_all("a"):
    href = link.get("href")
    # Anchors without an href attribute yield None; skip them instead of
    # crashing on None.endswith(...)
    if not href or not href.endswith(".parquet"):
        continue
    full_url = url + href
    # Keep only the final path component so a link can never write outside save_dir
    save_path = os.path.join(save_dir, os.path.basename(href))
    print(f"Downloading {href}...")
    urllib.request.urlretrieve(full_url, save_path)


### Graph plan:
### nodes = genes
### edges = the distance between the genes' disease-score vectors (originally planned as Euclidean; the cells below compute cosine distance)

In [None]:
import os
import pandas as pd

# Set the path to your openTargets folder
folder_path = "./openTargets"

# List the Parquet files in a fixed (sorted) order. os.listdir() returns entries in
# arbitrary order, which would make the row order of the combined frame -- and therefore
# any later order-dependent step such as an aggfunc='first' pivot -- vary between runs.
parquet_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))
if not parquet_files:
    # pd.concat([]) would fail with a much less helpful "No objects to concatenate"
    raise FileNotFoundError(f"No .parquet files found in {folder_path!r}")

# Initialize an empty list to hold individual DataFrames
df_list = []

# Read each Parquet file and append the resulting DataFrame to the list
for parquet_file in parquet_files:
    file_path = os.path.join(folder_path, parquet_file)
    df = pd.read_parquet(file_path)  # Read the Parquet file
    df_list.append(df)

# Concatenate all DataFrames into one, renumbering rows from 0
openTargets_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Report the number of rows, then show the first 30 rows as the cell output
print(openTargets_df.shape[0])
openTargets_df.iloc[:30]

In [30]:
# Save the combined DataFrame as a tab-separated (TSV) file, without writing the row index
openTargets_df.to_csv("openTargets.tsv", sep="\t", index=False)

In [None]:
import pronto

# Fetch the ontology from its OBO URL and parse it
ontology = pronto.Ontology("http://purl.obolibrary.org/obo/doid.obo")

# Build an id -> name lookup, restricted to terms whose id starts with "DOID"
doid_to_name = {}
for term in ontology.terms():
    if term.id.startswith("DOID"):
        doid_to_name[term.id] = term.name

# Spot-check one entry of the lookup
print(doid_to_name["DOID:0050890"])

In [None]:
import pandas as pd
# Load the gzipped GTF annotation. The file has no header row and its metadata lines
# start with '#', so columns are labelled positionally (column 8 is parsed further below).
# Column 0 is pinned to str: it presumably mixes numeric and non-numeric sequence names
# (e.g. 1..22 vs. X/MT), and letting pandas infer the type chunk by chunk would
# produce a mixed int/str column and a DtypeWarning.
df = pd.read_csv(
    "./Ensembl/Homo_sapiens.GRCh38.113.gtf.gz",
    sep='\t',
    comment='#',
    header=None,
    dtype={0: str},
)

In [None]:
import pandas as pd
from tqdm import tqdm

# Assuming df is already loaded
# Define a function to extract the gene_id and gene_name from the attribute column
def extract_gene_info(attributes):
    """Pull ``gene_id`` and ``gene_name`` out of a GTF-style attribute string.

    The string is expected to look like
    ``gene_id "ENSG..."; gene_version "5"; gene_name "ABC"; ...`` i.e.
    semicolon-separated ``key "value"`` pairs.

    Parameters
    ----------
    attributes : str
        The raw attribute field of one annotation row. Non-string input
        (e.g. NaN for a missing cell) is tolerated.

    Returns
    -------
    pandas.Series
        Two positional entries ``[gene_id, gene_name]``; an entry is ``None``
        when the corresponding key is absent.
    """
    gene_id = None
    gene_name = None

    # Missing cells arrive as NaN (a float), which has no .split(); report "nothing found"
    if not isinstance(attributes, str):
        return pd.Series([gene_id, gene_name])

    for attribute in attributes.split(';'):
        # Compare the key exactly. A substring test such as `'gene_id' in attribute`
        # also fires on any other attribute whose key or value merely contains that text.
        key, _, raw_value = attribute.strip().partition(' ')
        if key not in ('gene_id', 'gene_name'):
            continue

        value = raw_value.strip()
        if '"' in value:
            # Text after the first double quote (up to the next one, if any);
            # index 1 always exists here
            value = value.split('"')[1]
        # else: unquoted value -- keep the bare token instead of raising IndexError

        if key == 'gene_id':
            gene_id = value
        else:
            gene_name = value

    return pd.Series([gene_id, gene_name])

# Register tqdm's progress-reporting `progress_apply` on pandas objects
tqdm.pandas(desc="Extracting gene info")

# Parse column 8 row by row, label the two resulting columns, and keep a single
# row for every distinct (gene_id, gene_name) pair
gene_info_df = (
    df[8]
    .progress_apply(extract_gene_info)
    .rename(columns={0: 'gene_id', 1: 'gene_name'})
    .drop_duplicates()
)



In [None]:
# Show the first five rows of the gene lookup table, then its row count
print(gene_info_df.iloc[:5])
print(gene_info_df.shape[0])

In [None]:
import pandas as pd

# Load MONDO nodes (every row, including those without cross-references)
mondo_all_df = pd.read_csv("./mondo/mondo_nodes.tsv", sep="\t")

# Only rows that actually carry xrefs can contribute xref -> name entries
mondo_df = mondo_all_df.dropna(subset=["xref"])

xref_to_name = {}

# Each xref cell is a '|'-separated list; map every listed xref to the row's MONDO name
for name, xref_field in zip(mondo_df["name"], mondo_df["xref"]):
    for xref in xref_field.split('|'):
        xref_to_name[xref] = name

# Include the MONDO IDs themselves, taken from ALL rows. Building this from the
# xref-filtered frame would leave MONDO terms that have no xref without a name.
# Done last so a MONDO id always resolves to its own row's name.
xref_to_name.update(zip(mondo_all_df["id"], mondo_all_df["name"]))

# Disease ids use '_' where the lookup keys use ':'; convert, then look each one up
# (ids without an entry become NaN)
openTargets_df['diseaseCommonName'] = openTargets_df['diseaseId'].str.replace("_", ":", regex=False).map(xref_to_name)


In [None]:
# Lookup table from gene id to gene name, built from the two columns of gene_info_df
gene_id_to_name = {
    gene_identifier: gene_symbol
    for gene_identifier, gene_symbol in zip(gene_info_df['gene_id'], gene_info_df['gene_name'])
}

# Translate every targetId through the lookup in one vectorized call;
# ids missing from the lookup become NaN
openTargets_df['geneCommonName'] = openTargets_df['targetId'].map(gene_id_to_name)


In [None]:
# Overall look at the annotated table
print(openTargets_df.head(20))
print(len(openTargets_df))

# Distinct values, and how many there are, for each of the two mapped name columns
for name_column in ("geneCommonName", "diseaseCommonName"):
    distinct_names = openTargets_df[name_column].unique()
    print(distinct_names)
    print(distinct_names.shape)

# Rows whose disease id found no entry in the name lookup
missing_disease_rows = openTargets_df.loc[openTargets_df['diseaseCommonName'].isna()]
print(missing_disease_rows.head())
print("Number of missing disease names:", missing_disease_rows.shape[0])

# How many disease ids carry the "EFO_" prefix
print(openTargets_df['diseaseId'].str.startswith("EFO_").sum())


In [None]:
# Keep only associations for which both a disease name and a gene name were found,
# then show how many rows remain
required_name_columns = ["diseaseCommonName", "geneCommonName"]
openTargets_df = openTargets_df.dropna(subset=required_name_columns)
len(openTargets_df)

In [None]:
# Reshape the long association table into a gene x disease matrix of scores.
# When a (gene, disease) pair occurs more than once, the first score encountered is kept
# ('mean' or 'max' would be the alternatives).
pivot_options = {
    "index": 'geneCommonName',
    "columns": 'diseaseCommonName',
    "values": 'score',
    "aggfunc": 'first',
}
df_score_as_value = openTargets_df.pivot_table(**pivot_options)
print(df_score_as_value.columns)

In [None]:
# Sanity check: print the Python type of the pivoted score table
print(type(df_score_as_value))

In [None]:
# Persist the gene x disease score matrix as TSV; the index (gene names) is written
# as the first column, which is why later cells read it back with index_col=0
df_score_as_value.to_csv("./df_score_as_value.tsv", sep="\t")

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

print("🔹 Loading dataframe...")
# index_col=0 turns the first TSV column (the gene names) into the row index
df = pd.read_csv("./df_score_as_value.tsv", sep="\t", index_col=0)

print("🔹 Extracting genes and score matrix...")
# The gene names live in the index, so EVERY remaining column is a disease score column.
# Taking df.iloc[:, 0] as the gene labels would use the first disease's scores as node
# names, and df.iloc[:, 1:] would silently drop that disease from the distances.
genes = df.index.to_numpy()
score_matrix = df.fillna(0).to_numpy()  # a missing association counts as score 0

print(f"✅ Extracted {len(genes)} genes and score matrix of shape {score_matrix.shape}")

print("🔹 Computing pairwise Cosine distances...")
# pdist returns the condensed upper triangle; squareform expands it to a symmetric
# n x n matrix with zeros on the diagonal
distances = pdist(score_matrix, metric='cosine')
distance_matrix = squareform(distances)

print("✅ Distance matrix computed.")
# Cached on disk so later cells can reload it instead of recomputing
np.save("distance_matrix.npy", distance_matrix)

print("🔹 Building graph with nodes...")
G = nx.Graph()
G.add_nodes_from(genes)
print("✅ Nodes added to graph.")

print("🔹 Preparing edge list...")
# All index pairs (i, j) with i < j, i.e. every unordered gene pair exactly once.
# NOTE: this is the complete graph -- n * (n - 1) / 2 edges held in a Python list.
i_upper, j_upper = np.triu_indices(len(genes), k=1)

edges = []
for i, j in tqdm(zip(i_upper, j_upper), total=len(i_upper), desc="Adding edges"):
    edges.append((genes[i], genes[j], {'weight': distance_matrix[i, j]}))

print(f"✅ Prepared {len(edges)} edges.")

print("🔹 Adding edges to the graph...")
G.add_edges_from(edges)
print("✅ Graph construction complete.")

print(f"📊 Final graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

print("🔹 Loading dataframe...")
# Reload the saved gene x disease score matrix; index_col=0 makes the first column
# of the TSV the row index
df = pd.read_csv("./df_score_as_value.tsv", sep="\t", index_col=0)

In [None]:
# Spot-check one column of the score matrix (raises KeyError if that column is absent)
print(df["bipolar disorder"])

In [None]:
print("🔹 Extracting genes and score matrix...")
# The frame was read with index_col=0, so the gene names are the row index ...
genes = df.index.to_numpy()
print(genes)
# ... and every column is a score column. Slicing with iloc[:, 1:] would discard the
# first score column, so the full frame is used. Missing associations count as 0.
score_matrix = df.fillna(0).to_numpy()
print(score_matrix)

In [None]:

print("🔹 Computing pairwise Cosine distances...")
# The distances are not recomputed here (see the commented-out lines); the matrix saved
# earlier as distance_matrix.npy is loaded instead.
# distances = pdist(score_matrix, metric='cosine')
# sim_matrix = squareform(distances)
# NOTE: despite its name, sim_matrix holds cosine DISTANCES (it is the array written by
# np.save from pdist(..., metric='cosine')), so smaller values mean more similar genes.
sim_matrix = np.load("distance_matrix.npy")
print("🔹 Building graph with nodes...")
# Start an undirected graph containing one node per gene and no edges yet
G = nx.Graph()
G.add_nodes_from(genes)
print("✅ Nodes added to graph.")
# Quick look at the first ten node labels
print(list(G.nodes)[:10])

In [None]:
# Keep the K nearest neighbours per row
K = 50  # Adjust as needed
edges = []
print("🔹 Selecting top-K similar genes per gene...")
n_genes = sim_matrix.shape[0]
# A gene has at most n_genes - 1 possible neighbours; argpartition rejects a larger k
k_neighbors = min(K, n_genes - 1)
if k_neighbors > 0:
    for i in tqdm(range(n_genes)):
        # sim_matrix holds cosine DISTANCES (loaded from distance_matrix.npy), so the most
        # similar genes are the SMALLEST entries of the row. Taking the largest entries
        # would connect each gene to its least similar genes.
        row = sim_matrix[i].astype(float, copy=True)
        row[i] = np.inf  # never pick the gene itself (its own distance is the row minimum)
        # Partial sort: the first k_neighbors positions hold the k smallest distances
        nearest_idx = np.argpartition(row, k_neighbors - 1)[:k_neighbors]
        for j in nearest_idx:
            weight = sim_matrix[i, j]
            # NaN sorts after inf, so the self entry or an undefined distance can still
            # slip in when a row has fewer than k valid neighbours; drop those
            if j != i and np.isfinite(weight):
                edges.append((genes[i], genes[j], {'weight': weight}))
print(len(edges))
# Build the graph
print("🔹 Constructing graph...")
# Undirected: an (i, j) edge chosen from both endpoints collapses into one edge
G = nx.Graph()
G.add_nodes_from(genes)
G.add_edges_from(edges)

print("✅ Graph construction complete.")
print(f"📊 Final graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

In [None]:
# Write the graph to disk as a weighted edge list (one "node node weight" line per edge)
nx.write_weighted_edgelist(G, "50_gene_distance_graph.edgelist")

In [None]:
import math  # required by the NaN check below; no visible cell imports it

# Summary of the graph, a sample of its node labels, and a count of nodes whose label
# is NaN (either the string 'nan' or a float NaN)
print(G)
print("Sample nodes:", list(G.nodes)[:10])
print("Number of NaN nodes:", sum(1 for n in G.nodes if str(n) == 'nan' or (isinstance(n, float) and math.isnan(n))))


In [None]:
import community as community_louvain  # python-louvain
import networkx as nx
# Reload the graph from disk; the third field of each line is parsed as a float 'weight'
G = nx.read_edgelist("./50_gene_distance_graph.edgelist", data=(("weight", float),))

# Louvain visits nodes in random order, so fix the seed to make the partition reproducible
LOUVAIN_SEED = 42

# Run Louvain community detection
# NOTE(review): Louvain treats 'weight' as connection strength (larger = more tightly
# linked). The edgelist file name suggests the stored weights are distances -- confirm,
# and convert to a similarity first if so.
partition = community_louvain.best_partition(
    G, weight='weight', resolution=2.0, random_state=LOUVAIN_SEED
)  # returns: {gene: community_id}


#  Calculate centrality for nodes (fraction of the other nodes each node is connected to)
degree_centrality = nx.degree_centrality(G)

# Group nodes by community: {community_id: [gene, ...]}
communities = {}
for node, comm_id in partition.items():
    communities.setdefault(comm_id, []).append(node)

# For each community, sort by degree centrality (you can use other centrality measures)
for comm_id, nodes in communities.items():
    central_nodes = sorted(nodes, key=lambda x: degree_centrality[x], reverse=True)
    print(f"Community {comm_id} top nodes: {central_nodes[:5]}")  # Top 5 nodes
# # Invert the partition to get community → list of genes
# from collections import defaultdict

# communities = defaultdict(list)
# for gene, community_id in partition.items():
#     communities[community_id].append(gene)

# # Sort by community ID and print
# sorted_communities = dict(sorted(communities.items()))

# for cid, members in sorted_communities.items():
#     print(f"Community {cid}: {members}")
    
# with open("communities_resolution2.0.txt", "w") as f:
#     for cid, members in sorted_communities.items():
#         f.write(f"Community {cid}: {', '.join(members)}\n")


In [None]:
# Betweenness centrality of every node, with shortest paths measured using the
# 'weight' edge attribute as the edge length
betweenness_centrality = nx.betweenness_centrality(G, weight='weight')

# Within each community, rank the members from highest to lowest betweenness
# and report the first five
for comm_id, nodes in communities.items():
    ranked_members = sorted(nodes, key=betweenness_centrality.__getitem__, reverse=True)
    top_five = ranked_members[:5]
    print(f"Community {comm_id} top nodes by betweenness: {top_five}")


## Other goal: given a disease, normalize its gene scores (i.e. score = score / total number of genes), then see how our genes compare