In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
import arff
import os

# Short dataset prefix used in output file names, keyed by organism directory name.
organismsMap = {
    'Caenorhabditis elegans': 'worm',
    'Drosophila melanogaster': 'fly',
    'Mus musculus': 'mouse',
    'Saccharomyces cerevisiae': 'yeast',
}
# Organisms to process, in the insertion order of the mapping above.
organisms = list(organismsMap)

# Number of cross-validation folds.
num_folds = 10
# Shared stratified splitter; shuffling with a fixed seed keeps the fold
# assignment reproducible between runs.
kf = StratifiedKFold(num_folds, random_state=42, shuffle=True)

# Save a dataset to an ARFF file
def salvar_arquivo_arff(nome_arquivo, dados, atributos, relation):
    """Write ``dados`` to ``nome_arquivo`` in ARFF format.

    Args:
        nome_arquivo: Destination file path. Missing parent directories are
            created; an existing file is overwritten.
        dados: Iterable of rows, one value per attribute.
        atributos: Iterable of attribute (column) names. Every attribute is
            declared with the ARFF type ``REAL``.
        relation: Name written as the ARFF relation.
    """
    # open() does not create directories, so make sure the target one exists.
    parent_dir = os.path.dirname(nome_arquivo)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    document = {
        'relation': relation,
        'attributes': [(col, 'REAL') for col in atributos],
        'data': dados
    }
    with open(nome_arquivo, 'w') as f:
        arff.dump(document, f)

# Create and save the ARFF files for every fold
def create_folds_arff(X, y, relation, org, val):
    """Write one train ("tra") and one test ("tst") ARFF file per fold.

    The rows are taken from ``X`` and ``y`` themselves, with the label
    written as the last column of every file.

    Args:
        X: pandas DataFrame with the feature columns of the subset to split.
        y: pandas Series with the class label of each row of ``X``; used to
            stratify the folds.
        relation: File name of the source dataset. The part before the first
            "." names the output sub-directory and the generated files.
        org: Organism directory name; must be a key of ``organismsMap``.
        val: When true, files are written under ``folds-agmo`` instead of
            ``folds``.
    """
    # kf.split(X, y) yields row positions within X, so the rows have to be
    # selected from X/y; indexing any other frame would pick unrelated rows.
    # Indexes are reset so the two parts are joined by position.
    data = pd.concat(
        [X.reset_index(drop=True), y.reset_index(drop=True)], axis=1
    )

    dataset = relation.split(".")[0]
    out_dir = f"{org}/folds{'-agmo' if val else ''}/{dataset}"
    prefix = f"{organismsMap[org]}-{dataset}"

    for fold, (train_index, _test_index) in enumerate(kf.split(X, y)):
        # NOTE(review): the held-out indices of the fold are never written;
        # the "tst" file is the last 20% of the fold's training indices.
        # Confirm this is intended.
        cut = int(len(train_index) * 0.8)
        train_df = data.iloc[train_index[:cut]]
        test_df = data.iloc[train_index[cut:]]

        tra_name = f"{prefix}_fold_{fold}_tra"
        tst_name = f"{prefix}_fold_{fold}_tst"
        salvar_arquivo_arff(f"{out_dir}/{tra_name}.arff", train_df.values, train_df.columns, relation=tra_name)
        salvar_arquivo_arff(f"{out_dir}/{tst_name}.arff", test_df.values, test_df.columns, relation=tst_name)

# Generate the fold files for every regular file found directly inside each
# organism directory.
for organism in organisms:
    for path in os.listdir(organism):
        allpath = f"{organism}/{path}"
        # Entries that are not regular files (e.g. sub-directories) are skipped.
        if not os.path.isfile(allpath):
            continue

        print(allpath)
        df = pd.read_csv(allpath)

        # Hold out 20% of the rows as the validation subset; the remaining
        # 80% is the train/test subset.
        treino_teste_df, validacao_df = train_test_split(df, test_size=0.2, random_state=42)

        # Train/test subset: features and label, then its folds.
        X_treino_teste = treino_teste_df.drop(columns='longevity influence')
        y_teste_teste = treino_teste_df['longevity influence']
        create_folds_arff(X_treino_teste, y_teste_teste, relation=path, org=organism, val=False)

        # Validation subset: features and label, then its folds.
        X_treino_validacao = validacao_df.drop(columns='longevity influence')
        y_teste_validacao = validacao_df['longevity influence']
        create_folds_arff(X_treino_validacao, y_teste_validacao, relation=path, org=organism, val=True)






Caenorhabditis elegans/BP.csv
Caenorhabditis elegans/BPCC.csv
Caenorhabditis elegans/BPMF.csv
Caenorhabditis elegans/BPMFCC.csv
Caenorhabditis elegans/CC.csv
Caenorhabditis elegans/MF.csv
Caenorhabditis elegans/MFCC.csv
Drosophila melanogaster/BP.csv
Drosophila melanogaster/BPCC.csv
Drosophila melanogaster/BPMF.csv
Drosophila melanogaster/BPMFCC.csv
Drosophila melanogaster/CC.csv
Drosophila melanogaster/MF.csv
Drosophila melanogaster/MFCC.csv
Mus musculus/BP.csv




Mus musculus/BPCC.csv




Mus musculus/BPMF.csv




Mus musculus/BPMFCC.csv




Mus musculus/CC.csv




Mus musculus/MF.csv




Mus musculus/MFCC.csv




Saccharomyces cerevisiae/BP.csv
Saccharomyces cerevisiae/BPCC.csv
Saccharomyces cerevisiae/BPMF.csv


KeyboardInterrupt: 

In [None]:
# Split the data into a training set (80% of the rows) and a validation set
# (20% of the rows), using a fixed seed so the split is reproducible.
# `df` must already be defined by an earlier cell.
treino_df, validacao_df = train_test_split(df, test_size=0.2, random_state=42)
