In [1]:
import pandas as pd
import numpy as np

genage = pd.read_csv("../../../model-organisms/genage_models.csv")
gene2go = pd.read_csv("../../../model-organisms/gene2go",delimiter='\t')

print("The files have been read!")

The files have been read!


In [2]:
genage.head(10)

Unnamed: 0,GenAge ID,symbol,name,organism,entrez gene id,avg lifespan change (max obsv),lifespan effect,longevity influence
0,1,aak-2,AMP-Activated Kinase,Caenorhabditis elegans,181727.0,,Increase and Decrease,Pro-Longevity
1,2,aat-8,Amino Acid Transporter,Caenorhabditis elegans,185079.0,30.0,Increase,Anti-Longevity
2,3,abu-11,Activated in Blocked Unfolded protein response,Caenorhabditis elegans,173404.0,28.0,Increase,Pro-Longevity
3,4,acl-11,ACyLtransferase-like,Caenorhabditis elegans,185044.0,,Increase,Anti-Longevity
4,5,aco-2,ACOnitase,Caenorhabditis elegans,176121.0,20.5,Increase,Anti-Longevity
5,6,Adcy5,adenylate cyclase 5,Mus musculus,224129.0,30.0,Increase,Anti-Longevity
6,7,age-1,"AGEing alteration AGE-1, abnormal DAuer Format...",Caenorhabditis elegans,174762.0,1000.0,Increase,Anti-Longevity
7,8,AGP1,high-Affinity Glutamine Permease,Saccharomyces cerevisiae,850333.0,,Increase,Anti-Longevity
8,9,ain-1,ALG-1 INteracting protein,Caenorhabditis elegans,181719.0,10.0,Decrease,Pro-Longevity
9,10,akt-1,AKT kinase family,Caenorhabditis elegans,179424.0,,Increase,Anti-Longevity


In [3]:
genage.columns

Index(['GenAge ID', 'symbol', 'name', 'organism', 'entrez gene id',
       'avg lifespan change (max obsv)', 'lifespan effect',
       'longevity influence'],
      dtype='object')

In [4]:
genage["organism"].value_counts()

organism
Saccharomyces cerevisiae     911
Caenorhabditis elegans       889
Drosophila melanogaster      202
Mus musculus                 136
Schizosaccharomyces pombe     61
Podospora anserina             3
Mesocricetus auratus           1
Danio rerio                    1
Caenorhabditis briggsae        1
Name: count, dtype: int64

In [5]:
genage["longevity influence"].value_counts()

longevity influence
Anti-Longevity           1101
Pro-Longevity             546
Necessary for fitness     497
Unannotated                34
Unclear                    27
Name: count, dtype: int64

In [6]:
sc = genage[genage["organism"] == "Saccharomyces cerevisiae"]  
ce = genage[genage["organism"] == "Caenorhabditis elegans"] 
dm = genage[genage["organism"] == "Drosophila melanogaster"] 
mm = genage[genage["organism"] == "Mus musculus"]
organisms = [sc, ce, dm, mm]

In [7]:
gene2go['Category'].value_counts()

Category
Function     35573853
Process      33153478
Component    25868886
Name: count, dtype: int64

In [8]:
# We have 7 possibilites of data generation
# Biological Process (BP) | Molecular Function (MF) | Celullar Component (CC) | and the combination of each one.
# Execute one at time, that process use a lot of memory, because the file is large.

# gene2go = gene2go[(gene2go['Category'] == 'Process')]
# gene2go = gene2go[(gene2go['Category'] == 'Function')]
# gene2go = gene2go[(gene2go['Category'] == 'Component')]
# gene2go = gene2go[(gene2go['Category'] == 'Process') | (gene2go['Category'] == 'Function')]
# gene2go = gene2go[(gene2go['Category'] == 'Process') | (gene2go['Category'] == 'Component')]
# gene2go = gene2go[(gene2go['Category'] == 'Function') | (gene2go['Category'] == 'Component')]
gene2go = gene2go[(gene2go['Category'] == 'Process') | (gene2go['Category'] == 'Function') | (gene2go['Category'] == 'Component')]


In [9]:
df = None
for organism in organisms:
    df = organism.copy(deep=True)
    nameOrganism = df["organism"].iloc[1] 
    #Lista dos GO
    goTerms = gene2go[gene2go["GeneID"].isin(df["entrez gene id"].tolist())]["GO_ID"].drop_duplicates()

    
    # Criando as colunas dos GO termos encontrados cruzando os conjuntos de dados GO X HAGR.
    dfs_to_concat = []
    for goTerm in goTerms:
        temp_df = pd.DataFrame(0, index=df.index, columns=[goTerm])
        dfs_to_concat.append(temp_df)    
    df = pd.concat([df] + dfs_to_concat, axis=1)
    
    # Definindo como 1 todos os GO termos que estão presentes para cada index da tabela pelo GeneID
    for index in df.index: 
        GeneID = df.loc[index,"entrez gene id"]
        GOs = gene2go[gene2go["GeneID"] == GeneID]["GO_ID"]
        for GO in GOs:
            df.loc[index,GO] = 1
    
    # Removendo colunas desnecessárias e definindo valores das classes Pro-Longevity (1) e Anti-Longevity (0).
    df = df.drop(["entrez gene id","GenAge ID","symbol","organism","name","avg lifespan change (max obsv)","lifespan effect"],axis=1)
    df["longevity influence"].replace("Anti-Longevity",0, inplace=True)
    df["longevity influence"].replace("Pro-Longevity",1, inplace=True)
    
    # Removendo valores de influência de longevidade não definidas.
    df = df.drop(index=df[df["longevity influence"] == "Necessary for fitness"].index.tolist())
    df = df.drop(index=df[df["longevity influence"] == "Unannotated"].index.tolist())
    df = df.drop(index=df[df["longevity influence"] == "Unclear"].index.tolist())

    # Carregando e recuperando a hierarquia dos genes em um .csv.
    nodesName = df.columns[1:]
    DAG = pd.DataFrame(0,columns=nodesName,index=nodesName)
    f = open("../../../model-organisms/go.obo")
    lines = f.readlines()
    i = 0
    for line in lines:
        i = i+1
        if(i % 10000 == 0):
            print(i)
        if(line.startswith("id: ")):
            y = line[4:14]
        elif(line.startswith("is_a: ")):
            x = line[6:16]
            try:
                if(DAG.loc[x,y] == 0): # Procurar os descendentes de um gene é necessário filtrar a coluna do gene que estamos querendo saber, todas as linhas que tiverem 1 são descendentes.
                    DAG.loc[x,y] = 1
            except:
                pass
      
    df.to_csv(f"{nameOrganism}/BPMFCC.csv")
    DAG.to_csv(f"{nameOrganism}/DAG-BPMFCC.csv")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Anti-Longevity",0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Pro-Longevity",1, inplace=True)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Anti-Longevity",0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Pro-Longevity",1, inplace=True)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Anti-Longevity",0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Pro-Longevity",1, inplace=True)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Anti-Longevity",0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["longevity influence"].replace("Pro-Longevity",1, inplace=True)


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
