In [None]:
import pandas as pd
import numpy as np
import helper 
import os
import glob
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy as gp
import networkx as nx

# Notebook-wide display setting: set pandas' 'display.max_columns' option to 50.
pd.set_option('display.max_columns', 50) 

In [None]:
# Switch for (re)building the raw PPI adjacency "embedding"; nothing below the
# helper runs unless this is set to True.
# NOTE(review): the name presumably means "no PPI embedding exists yet" - confirm.
no_ppi = False


def build_ppi_adjacency(ppi_edges):
    """Build a symmetric 0/1 adjacency matrix from a PPI edge list.

    Parameters
    ----------
    ppi_edges : pandas.DataFrame
        One row per interaction, with the two partners in the columns
        ``"Gene1"`` and ``"Gene2"``.

    Returns
    -------
    pandas.DataFrame
        Square integer frame whose row and column labels are the sorted unique
        identifiers (cast to ``str``). Entry ``[a, b]`` is 1 when ``a`` and
        ``b`` occur together in at least one row of ``ppi_edges`` (in either
        order) and 0 otherwise.
    """
    genes = sorted(pd.concat([ppi_edges["Gene1"], ppi_edges["Gene2"]]).unique())
    gene_to_index = {gene: idx for idx, gene in enumerate(genes)}

    # Translate both edge columns to integer positions once, then fill the
    # matrix with two fancy-indexed assignments instead of a per-row loop.
    rows = ppi_edges["Gene1"].map(gene_to_index).to_numpy(dtype=int)
    cols = ppi_edges["Gene2"].map(gene_to_index).to_numpy(dtype=int)

    matrix = np.zeros((len(genes), len(genes)), dtype=int)
    matrix[rows, cols] = 1
    matrix[cols, rows] = 1  # mirror each edge so the matrix is symmetric

    adjacency = pd.DataFrame(matrix, index=genes, columns=genes)
    adjacency.index = adjacency.index.astype(str)
    adjacency.columns = adjacency.columns.astype(str)
    return adjacency


if no_ppi:
    # NOTE(review): this is an absolute path at the filesystem root - confirm
    # that is really where the tab-separated edge list lives.
    ppi_data = pd.read_csv("/consensus.dat", sep="\t", header=None, names=["Gene1", "Gene2"])
    adj_df = build_ppi_adjacency(ppi_data)

    # Keep the names this cell has always defined (identifiers here are the
    # string labels of adj_df).
    adj_genes = adj_df.index.tolist()
    adj_matrix = adj_df.to_numpy()
    gene_to_index_adj = {gene: idx for idx, gene in enumerate(adj_genes)}

    # The target folder is not created anywhere else before this cell runs, so
    # create it here; otherwise to_csv/open fail on a fresh checkout.
    ppi_out_dir = "data/embeddings/raw/PPI-RAW_UNIPROT_HUMAN"
    os.makedirs(ppi_out_dir, exist_ok=True)

    # Matrix without header/index; the row order is given by the gene list file.
    adj_df.to_csv(
        os.path.join(ppi_out_dir, "PPI-RAW_emb.csv"),
        header=False,
        index=False,
    )

    with open(os.path.join(ppi_out_dir, "PPI-RAW_genelist.txt"), "w") as f:
        f.write("\n".join(adj_df.index))

In [None]:
# Raw embedding files to be processed. Expected layout per the original note:
# gene names as rows, embedding dimensions as columns. Adjust `scopes` in the
# call below as needed.
file_names = []

processed_dfs, entrez_sets = helper.process_embedding_files(
    file_names,
    scopes='uniprot',
    base_path='data/embeddings/original',
)

In [None]:
base_dir = 'data/embeddings/raw'
os.makedirs(base_dir, exist_ok=True)

# Save every processed embedding in a consistent on-disk layout:
#   <base_dir>/<name>/<name>genelist.txt  - one 'entrezgene' value per line
#   <base_dir>/<name>/<name>emb.csv       - the remaining columns, no header, no index
# where <name> is the source file's base name without its extension.
for file, df in processed_dfs.items():
    print(file)
    filename = os.path.splitext(os.path.basename(file))[0]

    folder_path = os.path.join(base_dir, filename)
    os.makedirs(folder_path, exist_ok=True)

    gene_id_list = df['entrezgene'].tolist()
    print(len(gene_id_list))

    gene_list_file = os.path.join(folder_path, filename + 'genelist.txt')
    with open(gene_list_file, 'w') as f:
        f.writelines(f"{gene_id}\n" for gene_id in gene_id_list)

    # Dropping the id column keeps the row order, so line i of the gene list
    # corresponds to row i of the CSV written below.
    df_no_entrez = df.drop(columns=['entrezgene'])

    emb_file = os.path.join(folder_path, filename + 'emb.csv')
    # header=False rather than None: to_csv documents `header` as a bool or a
    # list of str, and the other cells of this notebook already pass False.
    df_no_entrez.to_csv(emb_file, header=False, index=False)

    print(f"Saved gene list and embedding files for {filename} in {folder_path}")

In [None]:
folder_path = 'data/embeddings/raw'
# Sorted so the processing order does not depend on the arbitrary order in
# which os.scandir yields directory entries.
subfolders = sorted(f.path for f in os.scandir(folder_path) if f.is_dir())

gene_lists = {}
embeddings = {}

# Load one gene list (*.txt) and one embedding matrix (*.csv) per subfolder.
# A subfolder is registered only when BOTH files are present: a gene list
# without a matching embedding must not take part in the gene intersection
# computed from `gene_lists` later in the notebook.
for subfolder in subfolders:
    print(f"Processing subfolder: {subfolder}")

    # glob order is arbitrary as well; sort so the same file is picked every run.
    gene_txt_files = sorted(glob.glob(os.path.join(subfolder, '*.txt')))
    if not gene_txt_files:
        print(f"No txt file found in {subfolder}")
        continue

    csv_files = sorted(glob.glob(os.path.join(subfolder, '*.csv')))
    if not csv_files:
        print(f"No csv file found in {subfolder}")
        continue

    gene_file = gene_txt_files[0]
    with open(gene_file, 'r') as f:
        genes = [line.strip() for line in f]

    csv_file = csv_files[0]
    embedding = pd.read_csv(csv_file, header=None)

    # Assigning a wrong-length index raises ValueError anyway; raise it here
    # with a message that names the two files that disagree.
    if len(genes) != len(embedding):
        raise ValueError(
            f"{gene_file} lists {len(genes)} genes but {csv_file} "
            f"has {len(embedding)} rows"
        )
    embedding.index = genes

    gene_lists[subfolder] = genes
    embeddings[subfolder] = embedding

In [None]:
# One set of gene identifiers per loaded subfolder.
all_gene_sets = [set(genes) for genes in gene_lists.values()]

# set.intersection(*[]) raises TypeError, so treat "nothing was loaded"
# explicitly as an empty intersection instead of crashing the cell.
if all_gene_sets:
    common_genes = set.intersection(*all_gene_sets)
else:
    print("No gene lists were loaded; the set of common genes is empty.")
    common_genes = set()
print(f"Number of common genes across all sets: {len(common_genes)}")

# Fixed (sorted) order for the shared genes.
common_genes_ordered = sorted(common_genes)

In [None]:
# Restrict every loaded embedding to the genes shared by all sets and save it,
# with rows in the order of `common_genes_ordered`, next to a matching gene list.
output_root = 'data/embeddings/intersect/'

for subfolder in subfolders:
    # Skip folders for which no embedding / gene list was loaded.
    if subfolder not in embeddings or subfolder not in gene_lists:
        continue

    emb = embeddings[subfolder]

    # .loc with a label that occurs several times in the index returns every
    # matching row, which would leave the CSV with more rows than the gene list
    # written below. Keep only the first row per gene so both files line up.
    duplicated_rows = emb.index.duplicated(keep='first')
    if duplicated_rows.any():
        print(
            f"Dropping {int(duplicated_rows.sum())} duplicate gene rows in "
            f"{subfolder} (keeping the first occurrence)"
        )
        emb = emb[~duplicated_rows]

    emb_filtered = emb.loc[common_genes_ordered]
    genes_filtered = common_genes_ordered

    subfolder_name = os.path.basename(subfolder)
    output_subfolder = os.path.join(output_root, subfolder_name)
    os.makedirs(output_subfolder, exist_ok=True)

    csv_output_path = os.path.join(output_subfolder, f"{subfolder_name}emb.csv")
    emb_filtered.to_csv(csv_output_path, header=False, index=False)

    txt_output_path = os.path.join(output_subfolder, f"{subfolder_name}genelist.txt")
    with open(txt_output_path, 'w') as f:
        f.writelines(gene + "\n" for gene in genes_filtered)

    print(f"Saved filtered embedding and gene list for {subfolder_name} to {output_subfolder}")

print("All filtered embeddings and gene lists saved.")