In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
import arff
import os

organisms = ['Caenorhabditis elegans', 'Drosophila melanogaster', 'Mus musculus', 'Saccharomyces cerevisiae']
organismsMap = {'Caenorhabditis elegans': 'worm', 'Drosophila melanogaster': 'fly', 'Mus musculus': 'mouse', 'Saccharomyces cerevisiae': 'yeast'}
organismsMap2 = {'Caenorhabditis elegans': 'C. elegans', 'Drosophila melanogaster': 'D. melanogaster', 'Mus musculus': 'M. musculus', 'Saccharomyces cerevisiae': 'S. cerevisiae'}

# Definindo o número de folds
num_folds = 10
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Função para salvar dados em arquivos ARFF
def salvar_arquivo_arff(nome_arquivo, dados, atributos, relation):
    with open(nome_arquivo, 'w') as f:
        arff.dump({
            'relation': relation,
            'attributes': [(col, 'REAL') for col in atributos],
            'data': dados
        }, f)

# Função para criar e salvar os folds
def create_folds_arff(X, y, relation, org):
    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        # Dividindo o fold em treino e teste
        X_train = df.iloc[train_index]
        y_train = y.iloc[train_index]
        
        train_df, val_df = train_test_split(X_train, test_size=0.1, random_state=42, shuffle=True, stratify=y_train)
        test_df = df.iloc[test_index]
        
        # Salvando os dados em arquivos ARFF           
        traName = f"{organismsMap[org]}-{path.split(".")[0]}_fold_{fold}_tra"
        valName = f"{organismsMap[org]}-{path.split(".")[0]}_fold_{fold}_val"
        tstName = f"{organismsMap[org]}-{path.split(".")[0]}_fold_{fold}_tst"
        
        salvar_arquivo_arff(f"{org}/folds/{path.split(".")[0]}/{traName}.arff", train_df.values, train_df.columns, relation=traName)
        salvar_arquivo_arff(f"{org}/folds/{path.split(".")[0]}/{valName}.arff", val_df.values, val_df.columns, relation=valName)
        salvar_arquivo_arff(f"{org}/folds/{path.split(".")[0]}/{tstName}.arff", test_df.values, test_df.columns, relation=tstName)
columns = ["#Inst", "#GO", "#ClasseP"]        
index = pd.MultiIndex.from_product([['D. melanogaster','M. musculus','C. elegans', 'S. cerevisiae'], ['BP', 'MF', 'CC', 'BP.MF', 'BP.CC', 'MF.CC', 'BP.MF.CC']])    
table_details = pd.DataFrame(index=index, columns=columns, dtype='string')
for organism in organisms:
    for path in os.listdir(f"{organism}"):
            allpath = f"{organism}/{path}"
            if os.path.isfile(allpath):
                df = pd.read_csv(allpath)  
                # Dividir os dados em conjunto de dados de treino e teste
                
                X = df.drop('longevity influence', axis=1)
                y = df['longevity influence']
                
                print("#" * 100 + "\n")
                print(allpath)
                print(f"Estatísticas gerais do conjunto de dados {organism} e {path.split(".")[0]}")
                print(f"\n Quantidade de Instâncias: {len(df.values)}  Quantidade de Atributos: {len(df.columns)}")
                proLongevity = len(df[df['longevity influence'] == 1])
                antiLongevity = len(df[df['longevity influence'] == 0])
                percentPro = proLongevity / (proLongevity + antiLongevity) * 100
                print(f"\n Quantidade Pró-longevidade: {proLongevity}  Quantidade de Anti-longevidade: {antiLongevity} \n")
                print(f"Porcentagem de Pró-longevidade: {percentPro:.4f} \n") 

                datasetTable =  path.split(".")[0]
                if datasetTable == 'BPMF':
                    datasetTable = "BP.MF"
                elif datasetTable == 'BPCC':
                    datasetTable = "BP.CC"
                elif datasetTable == 'MFCC':
                    datasetTable = "MF.CC"
                elif datasetTable == 'BPMFCC':
                    datasetTable = "BP.MF.CC"
                
                table_details.loc[(organismsMap2[organism], datasetTable), ("#Inst")] = str(len(df.values))
                table_details.loc[(organismsMap2[organism], datasetTable), ("#GO")] = str(len(df.columns))
                table_details.loc[(organismsMap2[organism], datasetTable), ("#ClasseP")] = str('{0:.1f}'.format(percentPro))
                #create_folds_arff(X, y, relation=path, org=organism)
                
    





####################################################################################################

Caenorhabditis elegans/BP.csv
Estatísticas gerais do conjunto de dados Caenorhabditis elegans e BP

 Quantidade de Instâncias: 861  Quantidade de Atributos: 1380
861

 Quantidade Pró-longevidade: 283  Quantidade de Anti-longevidade: 578 

Porcentagem de Pró-longevidade: 32.8688 

####################################################################################################

Caenorhabditis elegans/BPCC.csv
Estatísticas gerais do conjunto de dados Caenorhabditis elegans e BPCC

 Quantidade de Instâncias: 861  Quantidade de Atributos: 1771
861

 Quantidade Pró-longevidade: 283  Quantidade de Anti-longevidade: 578 

Porcentagem de Pró-longevidade: 32.8688 

####################################################################################################

Caenorhabditis elegans/BPMF.csv
Estatísticas gerais do conjunto de dados Caenorhabditis elegans e BPMF

 Quantidade de Instância

KeyboardInterrupt: 

In [17]:
table_details

Unnamed: 0,Unnamed: 1,#Inst,#GO,#ClasseP
D. melanogaster,BP,198,1067,63.1
D. melanogaster,MF,198,382,63.1
D. melanogaster,CC,198,190,63.1
D. melanogaster,BP.MF,198,1448,63.1
D. melanogaster,BP.CC,198,1256,63.1
D. melanogaster,MF.CC,198,571,63.1
D. melanogaster,BP.MF.CC,198,1637,63.1
M. musculus,BP,130,2169,65.4
M. musculus,MF,130,540,65.4
M. musculus,CC,130,327,65.4


In [18]:
print(table_details.to_latex())

\begin{tabular}{lllll}
\toprule
 &  & #Inst & #GO & #ClasseP \\
\midrule
\multirow[t]{7}{*}{D. melanogaster} & BP & 198 & 1067 & 63.1 \\
 & MF & 198 & 382 & 63.1 \\
 & CC & 198 & 190 & 63.1 \\
 & BP.MF & 198 & 1448 & 63.1 \\
 & BP.CC & 198 & 1256 & 63.1 \\
 & MF.CC & 198 & 571 & 63.1 \\
 & BP.MF.CC & 198 & 1637 & 63.1 \\
\cline{1-5}
\multirow[t]{7}{*}{M. musculus} & BP & 130 & 2169 & 65.4 \\
 & MF & 130 & 540 & 65.4 \\
 & CC & 130 & 327 & 65.4 \\
 & BP.MF & 130 & 2708 & 65.4 \\
 & BP.CC & 130 & 2495 & 65.4 \\
 & MF.CC & 130 & 866 & 65.4 \\
 & BP.MF.CC & 130 & 3034 & 65.4 \\
\cline{1-5}
\multirow[t]{7}{*}{C. elegans} & BP & 861 & 1380 & 32.9 \\
 & MF & 861 & 701 & 32.9 \\
 & CC & 861 & 392 & 32.9 \\
 & BP.MF & 861 & 2080 & 32.9 \\
 & BP.CC & 861 & 1771 & 32.9 \\
 & MF.CC & 861 & 1092 & 32.9 \\
 & BP.MF.CC & 861 & 2471 & 32.9 \\
\cline{1-5}
\multirow[t]{7}{*}{S. cerevisiae} & BP & 400 & 1397 & 12.8 \\
 & MF & 400 & 833 & 12.8 \\
 & CC & 400 & 415 & 12.8 \\
 & BP.MF & 400 & 2229 & 12.8 \\