In [10]:
import pandas as pd
import numpy as np 
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mt
import os
from find_path import find_file
import mygene
import os 
import warnings
warnings.filterwarnings('ignore')
import os

In [13]:
# Generate input for the GO term analysis in R

# Load the gene IDs to analyse ('ens_id' column of ens_dataframe.csv; available on LuckLab MySQL):
johannas_genes_table = pd.read_csv(find_file('ens_dataframe.csv'))
johannas_genes = {'CLUSTER_1': johannas_genes_table['ens_id'].tolist()}

clusters_dictionary = johannas_genes.copy()
directory = 'cluster_analysis_in_R_v25'

# np.save does not create missing folders, so make sure the output folder exists.
output_dir = os.path.join('..', directory)
os.makedirs(output_dir, exist_ok=True)


def fetch_entrez_table(mygene_client, gene_ids):
    """Query mygene for the Entrez gene IDs of ``gene_ids``.

    Parameters
    ----------
    mygene_client : mygene.MyGeneInfo
        Client used to run the ``getgenes`` query.
    gene_ids : list
        Gene identifiers to look up.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``getgenes(..., as_dataframe=True)``. An all-NaN
        ``entrezgene`` column is added when the response does not contain one,
        so callers can always read ``table['entrezgene']``.
    """
    table = mygene_client.getgenes(gene_ids, fields="entrezgene", as_dataframe=True)
    if 'entrezgene' not in table.columns:
        table['entrezgene'] = np.nan
    return table


# Save the Entrez IDs of every cluster as '<number><name>.npy', e.g. 'CLUSTER_1' -> '1CLUSTER.npy'
mg = mygene.MyGeneInfo()
for cluster_name, cluster_genes in clusters_dictionary.items():
    df = fetch_entrez_table(mg, cluster_genes)
    entrez_ids = np.array(df['entrezgene'].tolist())
    name_parts = cluster_name.split('_')
    filename = name_parts[1] + name_parts[0] + '.npy'
    np.save(os.path.join(output_dir, filename), entrez_ids, allow_pickle=True)

# set whole HuRI as the background (every gene that appears on either side of an interaction)
huri = pd.read_csv(find_file('hi_union.csv'))
huri_nodes = list(set(huri['ensembl_gene_id_a'].tolist() + huri['ensembl_gene_id_b'].tolist()))
df = fetch_entrez_table(mg, huri_nodes)
entrez_ids = np.array(df['entrezgene'].tolist())
filename = '0CLUSTER.npy'
np.save(os.path.join(output_dir, filename), entrez_ids, allow_pickle=True)

clusters_dictionary['CLUSTER_0'] = huri_nodes

# Switch to R-script for enrichment analysis

querying 1-1000...done.
querying 1001-1132...done.
querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-9094...done.


In [16]:
# Work inside the analysis folder from here on: the following cell lists and
# reads/writes its files using bare names relative to the current directory.
analysis_dir = f'../{directory}/'
os.chdir(analysis_dir)

In [17]:
# Preprocess the GO term analysis output so that it can be plotted

# Column names found in the enrichment output -> column names used for plotting.
column_renames = {'Description':'term', 'Count':'count', 'geneID':'genes', 'p.adjust':'adj_pval'}


def process_enrichment_file(file):
    """Convert one enrichment result file into its tab-separated plotting version.

    The space-separated ``file`` is read, its columns are renamed according to
    ``column_renames`` and a ``category`` column holding the first two
    characters of the file name is appended (presumably the analysis tag such
    as 'BP' -- verify against the naming used by the R script). The result is
    written to ``<file name up to the first dot>_processed_for_plotting.csv``.

    A file that contains only a header goes through exactly the same steps, so
    it ends up with the same renamed columns plus ``category`` as a file with
    rows and can be concatenated with the others without producing extra columns.

    Parameters
    ----------
    file : str
        Name of the result file, relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The processed table that was written to disk.
    """
    new_file = file.split('.')[0] + '_processed_for_plotting.csv'
    processed = (
        pd.read_csv(file, sep=' ', header=0)
        .rename(columns=column_renames)
        .reset_index(drop=True)
    )
    processed['category'] = file[0:2]
    processed.to_csv(new_file, sep='\t', header=True, index=False)
    return processed


# Load the result files for the standard background and the HuRI background.
# Sorting makes the processing order independent of the order os.listdir() returns.
empty_files = []
files = sorted(i for i in os.listdir() if 'background.csv' in i and 'CLUSTER' in i)
for file in files:
    df = process_enrichment_file(file)
    if len(df) == 0:
        # Keep track of the analyses that returned no rows.
        empty_files.append(file)

# Create one dataframe for each cluster which contains all 4 analyses for the
# general background, and another one for the HuRI background

# Tags of the four analyses whose processed files are merged.
analysis_tags = ('BP', 'CC', 'MF', 'DO')


def merge_cluster_results(cluster, background, candidates):
    """Concatenate the processed result files of one cluster and save them.

    A candidate is used when its name contains
    ``<background>_background_processed_for_plotting.csv``, contains one of
    ``analysis_tags`` and has ``cluster`` as one of its '_'-separated parts.
    The merged table is written to ``<cluster>_<background>_background_merged.csv``
    (tab-separated); nothing is written when no candidate matches.

    Parameters
    ----------
    cluster : str
        Cluster identifier as it appears in the file names.
    background : str
        Background name used in the file names ('general' or 'huri').
    candidates : list of str
        File names to choose from, relative to the current working directory.

    Returns
    -------
    pandas.DataFrame or None
        The merged table, or None when no file matched.
    """
    suffix = background + '_background_processed_for_plotting.csv'
    files_to_load = [
        name for name in candidates
        if suffix in name
        and any(tag in name for tag in analysis_tags)
        and cluster in name.split('_')
    ]
    if not files_to_load:
        return None
    merged = pd.concat([pd.read_csv(name, sep='\t', header=0) for name in files_to_load])
    merged.to_csv(cluster + '_' + background + '_background_merged.csv', sep='\t', header=True, index=False)
    return merged


# All clusters we have result files for (second '_'-separated part of each file name).
clusters = sorted(set(i.split('_')[1] for i in files))

# List the folder once; the merged files written below never match the
# '..._processed_for_plotting.csv' filter, so the listing stays valid.
# Sorting keeps the row order of the merged tables reproducible between runs.
processed_candidates = sorted(os.listdir())

# First the general background, then the HuRI background.
for background in ('general', 'huri'):
    for cluster in clusters:
        merge_cluster_results(cluster, background, processed_candidates)