In [1]:
import numpy as np
import parsl
from parsl import python_app
from parsl.config import Config
from parsl.executors.threads import ThreadPoolExecutor
import pandas as pd
import os
import re
import pandas as pd
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import time 
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config = Config(executors=[ThreadPoolExecutor()])
parsl.load(config)

<parsl.dataflow.dflow.DataFlowKernel at 0x7f6e38077d00>

In [3]:
torch.cuda.set_per_process_memory_fraction(0.5,0)

In [4]:
def extractName(text):
  return re.findall(r'"name"\s*:\s*"([^"]+)"',text)

def extractTag(text):
  matches = re.findall(r'\"tags\": \[(.*?)\]', text, re.DOTALL)
  matches = re.findall(r'\"(.*?)\"', matches[0].strip(','))
  return matches

def extractDescription(text):
  return re.findall(r'"description"\s*:\s*"([^"]+)"',text)

def extractLabel(text):
    return re.findall(r'"label"\s*:\s*"([^"]+)"', text)

def removeWords(text, words):
  for word in words:
    text = text.replace(word,"")
  return text

In [5]:
os.chdir(os.path.join(os.pardir,os.pardir, 'workflows_galaxy'))
nomes_arquivos = []
diretorio = os.getcwd()
print(diretorio)
for item in os.listdir(diretorio):
    caminho_completo = os.path.join(diretorio, item)
    if os.path.isfile(caminho_completo):
        nomes_arquivos.append(item)

print(len(nomes_arquivos))


/home/lyncoln/Git/similaridade_workflow/workflows_galaxy
1014


In [6]:
dic_workflows = {}
@python_app
def processar_arquivo(arquivo):
    # Carregar o arquivo JSON
    with open(arquivo, 'r') as file:
        fileName = file.name
        dados = file.read()
        matches_name = extractName(dados)
        matches_tag = extractTag(dados)
        matches_description = extractDescription(dados)
        matches_label = extractLabel(dados)

        combined_results = {
        'Tags': matches_tag if matches_tag else [],  
        'Descricao': ' '.join(matches_name + matches_tag + matches_description + matches_label),
        'Json': dados
        }
        dic_workflows[fileName] = combined_results

        return combined_results


futures = [processar_arquivo(arquivo) for arquivo in nomes_arquivos]
resultados = [future.result() for future in futures]

In [7]:
dic_workflows

{'0c86c39dcd9e08c6.json': {'Tags': [], 'Content': 'Project_CP'},
 '692a2b0bb818336d.json': {'Tags': [],
  'Content': "Workflow constructed from history 'K22063917 Assignment attempt FINAL - using wANNOVAR' annotation.bed Input dataset NGS0001.R1.fastq.qz Input dataset NGS0001.R2.fastq.qz Input dataset query.output.exome_summary 5th attempt.txt Input dataset FastQC html_file text_file fastqc FastQC html_file text_file fastqc Trimmomatic fastq_out_r1_paired fastq_out_r2_paired fastq_out_r1_unpaired fastq_out_r2_unpaired trimmomatic Filter out_file1 Filter out_file1 Filter out_file1 Filter out_file1 Filter out_file1 Filter out_file1 Filter out_file1 Filter out_file1 FastQC html_file text_file fastqc FastQC html_file text_file fastqc Map with BWA-MEM bam_output bwa MarkDuplicates metrics_file outFile picard Filter SAM or BAM, output SAM or BAM output1 samtool_filter2 Flagstat output1 samtools_flagstat Flagstat output1 samtools_flagstat BAM-to-SAM output1 bam_to_sam IdxStats output samtools

In [8]:
# Função para converter texto em embedding
def text_to_embedding(text, tokenizer, device, model):
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}  # Mover tensores para o dispositivo
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Pegar a média dos embeddings de todos os tokens para representar o texto
    return model_output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Mover o resultado de volta para a CPU e converter para numpy

In [9]:
df = pd.DataFrame.from_dict(dic_workflows, orient='index').reset_index()

top_x_list = list(range(3,11))  

execution_times = []

In [10]:
for top_x in top_x_list:
    # Medir o tempo de execução
    start_time = time.time()

    # Carregar o modelo e o tokenizer
    tokenizer = BertTokenizer.from_pretrained('google-bert/bert-large-uncased')
    model = BertModel.from_pretrained('google-bert/bert-large-uncased')

    # Definir o dispositivo (GPU ou CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Supondo que você já tem um DataFrame df com uma coluna 'text'
    texts = df['Content'].tolist()

    # Converter todos os textos para embeddings
    embeddings = [text_to_embedding(text, tokenizer, device, model) for text in texts]

    # Converter a lista de embeddings em um array 2D
    embeddings_array = np.vstack(embeddings)

    # Calcular a matriz de similaridade
    similarity_matrix = cosine_similarity(embeddings_array)
    
    # Identificar os índices dos textos mais similares para cada texto
    similar_indices = similarity_matrix.argsort(axis=1)[:, :-top_x-2:-1]  # Selecionar os top_x mais similares excluindo o próprio texto

    # Remover o índice do próprio texto
    corrected_similar_indices = []
    corrected_similar_tags = []
    for idx, indices in enumerate(similar_indices):
        filtered_indices = [index for index in indices if index != idx][:top_x]  # Exclui o próprio e pega os top_x mais similares
        filtered_tags = [df.iloc[index]['Tags'] for index in filtered_indices]  # Obter as tags dos textos mais similares
        corrected_similar_indices.append(filtered_indices)
        corrected_similar_tags.append(filtered_tags)

    # Criar coluna no DataFrame para os índices dos textos mais similares
    df[f'top{top_x}_description'] = corrected_similar_indices

    # Criar coluna no DataFrame para as tags dos textos mais similares
    df[f'top{top_x}_tags'] = corrected_similar_tags

    # Calcular a média das similaridades dos textos mais similares para cada texto
    mean_similarities = []
    for idx, indices in enumerate(corrected_similar_indices):
        similarities = [similarity_matrix[idx, i] for i in indices]
        mean_similarity = np.mean(similarities)
        mean_similarities.append(mean_similarity)

    df[f'mean_similarity_top{top_x}'] = mean_similarities

    # Calcular o tempo de execução
    execution_time = time.time() - start_time

    # Adicionar o tempo de execução ao DataFrame de tempos de execução
    execution_times.append({'top_x': top_x, 'execution_time': execution_time})

    torch.cuda.empty_cache()



In [11]:
def text_to_embedding(text, tokenizer, device, model):
    inputs  = tokenizer(text, return_tensors='pt', max_length=512, padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    
    return embedding

In [12]:
for top_x in top_x_list:
    # Medir o tempo de execução
    start_time = time.time()

    # Carregar o modelo e o tokenizer
    tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    model = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')

    # Definir o dispositivo (GPU ou CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Supondo que você já tem um DataFrame df com uma coluna 'text'
    texts = df['Content'].tolist()

    # Converter todos os textos para embeddings
    embeddings = [text_to_embedding(text, tokenizer, device, model) for text in texts]

    # Converter a lista de embeddings em um array 2D
    embeddings_array = np.vstack(embeddings)

    # Calcular a matriz de similaridade
    similarity_matrix = cosine_similarity(embeddings_array)
    
    # Identificar os índices dos textos mais similares para cada texto
    similar_indices = similarity_matrix.argsort(axis=1)[:, :-top_x-2:-1]  # Selecionar os top_x mais similares excluindo o próprio texto

    # Remover o índice do próprio texto
    corrected_similar_indices = []
    corrected_similar_tags = []
    for idx, indices in enumerate(similar_indices):
        filtered_indices = [index for index in indices if index != idx][:top_x]  # Exclui o próprio e pega os top_x mais similares
        filtered_tags = [df.iloc[index]['Tags'] for index in filtered_indices]  # Obter as tags dos textos mais similares
        corrected_similar_indices.append(filtered_indices)
        corrected_similar_tags.append(filtered_tags)

    # Criar coluna no DataFrame para os índices dos textos mais similares
    df[f'top{top_x}_description_scibert'] = corrected_similar_indices

    # Criar coluna no DataFrame para as tags dos textos mais similares
    df[f'top{top_x}_tags_scibert'] = corrected_similar_tags

    # Calcular a média das similaridades dos textos mais similares para cada texto
    mean_similarities = []
    for idx, indices in enumerate(corrected_similar_indices):
        similarities = [similarity_matrix[idx, i] for i in indices]
        mean_similarity = np.mean(similarities)
        mean_similarities.append(mean_similarity)

    df[f'mean_similarity_top{top_x}_scibert'] = mean_similarities

    # Calcular o tempo de execução
    execution_time = time.time() - start_time

    # Adicionar o tempo de execução ao DataFrame de tempos de execução
    execution_times.append({'top_x': top_x, 'execution_time_scibert': execution_time})

    torch.cuda.empty_cache()




In [13]:
execution_times_df = pd.DataFrame(execution_times)
execution_times_df

Unnamed: 0,top_x,execution_time,execution_time_scibert
0,3,36.142221,
1,4,34.818485,
2,5,35.413346,
3,6,36.139543,
4,7,38.507025,
5,8,43.712143,
6,9,44.876248,
7,10,44.228637,
8,3,,18.67763
9,4,,20.982416


In [15]:
df

Unnamed: 0,index,Tags,Content,top3_description,top3_tags,mean_similarity_top3,top4_description,top4_tags,mean_similarity_top4,top5_description,...,mean_similarity_top7_scibert,top8_description_scibert,top8_tags_scibert,mean_similarity_top8_scibert,top9_description_scibert,top9_tags_scibert,mean_similarity_top9_scibert,top10_description_scibert,top10_tags_scibert,mean_similarity_top10_scibert
0,0c86c39dcd9e08c6.json,[],Project_CP,"[750, 423, 772]","[[aptamer, SELEX], [], []]",0.795055,"[750, 423, 772, 350]","[[aptamer, SELEX], [], [], []]",0.790423,"[750, 423, 772, 350, 482]",...,0.796031,"[350, 476, 57, 866, 423, 443, 482, 48]","[[], [], [], [], [], [], [], []]",0.793780,"[350, 476, 57, 866, 423, 443, 482, 48, 680]","[[], [], [], [], [], [], [], [], []]",0.792029,"[350, 476, 57, 866, 423, 443, 482, 48, 680, 263]","[[], [], [], [], [], [], [], [], [], []]",0.790067
1,692a2b0bb818336d.json,[],Workflow constructed from history 'K22063917 A...,"[610, 726, 29]","[[], [], []]",0.999838,"[610, 726, 29, 399]","[[], [], [], []]",0.999693,"[610, 726, 29, 399, 719]",...,0.997155,"[610, 726, 29, 399, 719, 896, 391, 30]","[[], [], [], [], [], [], [], []]",0.996471,"[610, 726, 29, 399, 719, 896, 391, 30, 905]","[[], [], [], [], [], [], [], [], []]",0.995911,"[610, 726, 29, 399, 719, 896, 391, 30, 905, 570]","[[], [], [], [], [], [], [], [], [], []]",0.995462
2,3680984663c813e1.json,[],dhfr Bowtie2 output bowtie2 MPileup output_mpi...,"[571, 90, 238]","[[], [], []]",0.934421,"[571, 90, 238, 916]","[[], [], [], []]",0.932715,"[571, 90, 238, 916, 482]",...,0.919648,"[458, 252, 939, 321, 693, 280, 276, 99]","[[], [], [], [], [], [], [], []]",0.918999,"[458, 252, 939, 321, 693, 280, 276, 99, 818]","[[], [], [], [], [], [], [], [], []]",0.918369,"[458, 252, 939, 321, 693, 280, 276, 99, 818, 278]","[[], [], [], [], [], [], [], [], [], []]",0.917866
3,49f8b32c3206f76c.json,"[variant, snps, human]",Workflow for Genomic Data Science with Galaxy ...,"[73, 907, 328]","[[], [], []]",0.982511,"[73, 907, 328, 306]","[[], [], [], []]",0.982288,"[73, 907, 328, 306, 378]",...,0.928937,"[674, 255, 140, 519, 248, 194, 440, 630]","[[], [], [], [], [], [], [], []]",0.928418,"[674, 255, 140, 519, 248, 194, 440, 630, 165]","[[], [], [], [], [], [], [], [], []]",0.927898,"[674, 255, 140, 519, 248, 194, 440, 630, 165, ...","[[], [], [], [], [], [], [], [], [], [Polymorp...",0.927131
4,ab23c641cfdefe0c.json,[],Test workflow 1 Input dataset FASTQ Summary St...,"[61, 360, 766]","[[], [], []]",0.968137,"[61, 360, 766, 206]","[[], [], [], []]",0.966359,"[61, 360, 766, 206, 278]",...,0.951834,"[818, 278, 129, 321, 458, 67, 680, 48]","[[], [], [], [], [], [], [], []]",0.950426,"[818, 278, 129, 321, 458, 67, 680, 48, 571]","[[], [], [], [], [], [], [], [], []]",0.949165,"[818, 278, 129, 321, 458, 67, 680, 48, 571, 413]","[[], [], [], [], [], [], [], [], [], []]",0.948109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,d5cba8a5ba6880fd.json,[],BRACA2 - primer design exon Input dataset snp ...,"[264, 34, 727]","[[], [], []]",0.987313,"[264, 34, 727, 565]","[[], [], [], []]",0.985496,"[264, 34, 727, 565, 985]",...,0.965617,"[34, 264, 615, 871, 565, 985, 727, 651]","[[], [], [], [], [], [], [], []]",0.964628,"[34, 264, 615, 871, 565, 985, 727, 651, 632]","[[], [], [], [], [], [], [], [], []]",0.963386,"[34, 264, 615, 871, 565, 985, 727, 651, 632, 911]","[[], [], [], [], [], [], [], [], [], []]",0.961873
1010,e0da87cadb1e6d5f.json,[],2 peaks-result data file 2 and more peaks 16 c...,"[189, 809, 810]","[[], [], []]",0.993895,"[189, 809, 810, 96]","[[], [], [], []]",0.990989,"[189, 809, 810, 96, 643]",...,0.958131,"[189, 809, 810, 96, 919, 106, 816, 405]","[[], [], [], [], [], [], [], []]",0.954870,"[189, 809, 810, 96, 919, 106, 816, 405, 665]","[[], [], [], [], [], [], [], [], []]",0.952122,"[189, 809, 810, 96, 919, 106, 816, 405, 665, 113]","[[], [], [], [], [], [], [], [], [], []]",0.949804
1011,fbf75fbb72b488bd.json,[],'BBL735_Lab2(Olympic)_AT' olympics.tsv Input d...,"[848, 24, 934]","[[COVID-19, covid19.galaxyproject.org], [], [m...",0.973935,"[848, 24, 934, 735]","[[COVID-19, covid19.galaxyproject.org], [], [m...",0.973825,"[848, 24, 934, 735, 918]",...,0.896717,"[758, 540, 110, 621, 646, 38, 1000, 625]","[[], [], [], [], [], [], [], []]",0.895561,"[758, 540, 110, 621, 646, 38, 1000, 625, 178]","[[], [], [], [], [], [], [], [], []]",0.894402,"[758, 540, 110, 621, 646, 38, 1000, 625, 178, ...","[[], [], [], [], [], [], [], [], [], []]",0.893460
1012,f8238234db6f04c3.json,[],handson Input dataset Input dataset Join outpu...,"[182, 696, 872]","[[], [], []]",0.961614,"[182, 696, 872, 615]","[[], [], [], []]",0.960046,"[182, 696, 872, 615, 871]",...,0.950065,"[182, 696, 911, 196, 988, 921, 576, 872]","[[], [], [], [], [], [], [], []]",0.949000,"[182, 696, 911, 196, 988, 921, 576, 872, 113]","[[], [], [], [], [], [], [], [], []]",0.947935,"[182, 696, 911, 196, 988, 921, 576, 872, 113, ...","[[], [], [], [], [], [], [], [], [], []]",0.946997


In [14]:
os.chdir(os.pardir)
df.to_csv("topx_descricao.csv")
execution_times_df.to_csv("topx_times_descricao.csv")