In [6]:
import pandas as pd

df = pd.read_table('input.tsv') # read input file
df = df.dropna() # remove rows with empty values
df.head()

Unnamed: 0,Entry,Entry Name,Protein names,Organism,Gene Ontology (biological process),Gene Names (primary)
0,A0A087X1C5,CP2D7_HUMAN,Putative cytochrome P450 2D7 (EC 1.14.14.1),Homo sapiens (Human),arachidonic acid metabolic process [GO:0019369...,CYP2D7
1,A0A0B4J2F0,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),Homo sapiens (Human),regulation of endoplasmic reticulum unfolded p...,PIGBOS1
2,A0A0B4J2F2,SIK1B_HUMAN,Putative serine/threonine-protein kinase SIK1B...,Homo sapiens (Human),cellular response to glucose starvation [GO:00...,SIK1B
3,A0A0C5B5G6,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,Homo sapiens (Human),activation of protein kinase activity [GO:0032...,MT-RNR1
4,A0A0K2S4Q6,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,Homo sapiens (Human),neutrophil chemotaxis [GO:0030593],CD300H


In [7]:
df.describe()

Unnamed: 0,Entry,Entry Name,Protein names,Organism,Gene Ontology (biological process),Gene Names (primary)
count,17219,17219,17219,17219,17219,17219
unique,17219,17219,17219,1,13890,17163
top,A0A087X1C5,CP2D7_HUMAN,Putative cytochrome P450 2D7 (EC 1.14.14.1),Homo sapiens (Human),regulation of transcription by RNA polymerase ...,ERVK-8
freq,1,1,1,17219,368,4


In [8]:
def extract_biological_processes(df):
    '''
    Extracts the biological processes from the input dataset

    Parameters:
        df (pandas.Dataframe): The dataframe that the biological processes are extracted from

    Returns:
        biological_processes (list): The list of unique biological processes within the dataframe
    '''

    biological_processes_raw = df['Gene Ontology (biological process)'].astype(str)

    biologial_processes = []

    for process in biological_processes_raw:
        p = process.split("; ")
        biologial_processes += p

    biologial_processes = list(set(biologial_processes))

    if "nan" in biologial_processes:
        biologial_processes.remove("nan")

    return biologial_processes

In [9]:
def assign_genes_to_biological_processes(df, biological_processes):
    '''
    Assigns the corresponding list of of genes to the biological processes

    Parameters:
        df (pandas.Dataframe): The dataframe that the genes are extracted from
        biological_processes (list): The list of unique biological processes within the dataframe

    Returns:
        pairs (dict): The hashmap containing the biological processes as keys and a list of genes as values
    '''

    pairs = {}
    reduced_df = df[["Gene Ontology (biological process)","Gene Names (primary)"]].astype(str)

    for p in biological_processes:
        pairs[p] = []

        for _, row in reduced_df.iterrows():
            if p in row['Gene Ontology (biological process)']:
                gene_names = row['Gene Names (primary)'].split("; ")

                for gene_name in gene_names:
                    if gene_name not in pairs[p]:
                            pairs[p].append(gene_name)  

    return pairs


In [10]:
biological_processes = extract_biological_processes(df)
pairs = assign_genes_to_biological_processes(df, biological_processes)

In [11]:
def remove_irrelevant_biological_processes(biological_processes, pairs):
    '''
    Removes biologcal processes where the number of corresponding genes is lower than 4 or higher than 100

    Parameters:
        biological_processes (list): The list of unique biological processes within the dataframe
        pairs (dict): The hashmap containing the biological processes as keys and a list of genes as values

    Returns:
        df_reduces (pandas.Dataframe): The dataframe containing the the biological processes with genes between 4 and 100 and their corresponding genes
    '''

    for p in biological_processes:
        if p in pairs and ((len(pairs[p]) < 4) or (len(pairs[p]) > 100)):
            pairs.pop(p)

    df_reduced = pd.DataFrame.from_dict(pairs, orient='index')   
    
    return df_reduced

In [12]:
df_reduced = remove_irrelevant_biological_processes(biological_processes, pairs)
df_reduced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
cellular triglyceride homeostasis [GO:0035356],XBP1,FITM2,SIRT1,DGAT2,NR1H4,FUNDC2,C1QTNF3,,,,...,,,,,,,,,,
urea transport [GO:0015840],UPK3A,AQP8,SLC14A1,SLC14A2,,,,,,,...,,,,,,,,,,
response to tetrachloromethane [GO:1904772],IGF2R,NQO1,CPT1A,EZH2,,,,,,,...,,,,,,,,,,
positive regulation of protein kinase C signaling [GO:0090037],PLA2G6,WNT11,VEGFA,CD40,ADRA1A,FLT4,WNT5A,MC1R,ADGRV1,SPHK2,...,,,,,,,,,,
regulation of mitotic spindle assembly [GO:1901673],CHMP2A,VPS4B,HSPA1A,HSPA1B,TPR,PLK1,HNRNPU,EML3,CCSAP,CHMP1B,...,,,,,,,,,,


In [13]:
df_reduced.to_excel("input_preprocessed.xlsx")