In [1]:
%cd ../..

/home/matheus/Desktop/Itens/Projetos/llm2vec-embeddings-classification


In [2]:
import os
import numpy as np
import pandas as pd
from src.core.utils import read_json

# Configura o Pandas para exibir todas as colunas
pd.set_option('display.max_columns', None)

def load_results_to_dataframe(base_path: str) -> pd.DataFrame:
    """
    Load results from JSON files into a pandas DataFrame.
    """
    results = []
    
    # Traverse the directory structure
    for dataset_name in os.listdir(base_path):
        dataset_path = os.path.join(base_path, dataset_name)
        if os.path.isdir(dataset_path):
            for model_type in os.listdir(dataset_path):
                model_type_path = os.path.join(dataset_path, model_type)
                
                if os.path.isdir(model_type_path):
                    for model_name in os.listdir(model_type_path):
                        model_name_path = os.path.join(model_type_path, model_name)
                        
                        # Define paths based on whether prompt_name is needed
                        if model_type != "bert":
                            subdirs = [os.path.join(model_name_path, prompt) for prompt in os.listdir(model_name_path)]
                        else:
                            subdirs = [model_name_path]
                        
                        # Process results.json files from determined paths
                        for subdir in subdirs:
                            for classifier in os.listdir(subdir):
                                classifier_path = os.path.join(subdir, classifier)
                                
                                # Check for the results.json in the classifier path
                                json_file_path = os.path.join(classifier_path, 'results.json')
                                
                                if os.path.isfile(json_file_path):
                                    result_data = read_json(json_file_path)

                                    keys_to_extract = ['mean_test_accuracy', 'mean_test_precision', 'mean_test_recall', 'mean_test_f1_score', 'embedding_generation_time', 'embedding_generation_size']
    
                                    # Extrai apenas as chaves especificadas
                                    result_data= {key: result_data.get(key) for key in keys_to_extract}
                                    
                                    # Add metadata to the result data
                                    result_data['dataset_name'] = dataset_name
                                    result_data['model_type'] = model_type
                                    result_data['model_name'] = model_name
                                    result_data['classifier'] = classifier
                                    
                                    # Add prompt_name if applicable
                                    if model_type != "bert":
                                        result_data['prompt_name'] = os.path.basename(subdir)
                                    else:
                                        result_data['prompt_name'] = None
                                    
                                    results.append(result_data)

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    # Specify the order of the columns
    columns_first = ['dataset_name', 'model_type', 'model_name', 'classifier']
    if 'prompt_name' in results_df.columns:
        columns_first.append('prompt_name')
    column_order = columns_first + [col for col in results_df.columns if col not in columns_first]
    results_df = results_df[column_order]
    
    return results_df



In [3]:
# Usage
base_path = 'results' 
results_df = load_results_to_dataframe(base_path)

print("DataFrame of Results:")
display(results_df)

# Save the DataFrame to a CSV file
results_df.to_csv(f'{base_path}/resume/result_resume.csv', index=False)  # Include index=False to avoid saving the index as a column

DataFrame of Results:


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
0,Dmoz-Computers.csv,bert,sentence-transformers_all-distilroberta-v1,knn,,0.723684,0.719085,0.718167,0.708111,4.272044,29184128
1,Dmoz-Computers.csv,bert,sentence-transformers_all-MiniLM-L6-v2,knn,,0.728842,0.721016,0.723667,0.715111,3.669558,14592128
2,Dmoz-Computers.csv,bert,sentence-transformers_all-MiniLM-L12-v2,knn,,0.728947,0.722362,0.722222,0.712460,4.991892,14592128
3,Dmoz-Computers.csv,bert,sentence-transformers_all-mpnet-base-v2,knn,,0.738947,0.729568,0.733333,0.724480,9.260681,29184128
4,Dmoz-Computers.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,instruction_summary_prompt,0.779474,0.777606,0.775000,0.769049,41.051134,128
...,...,...,...,...,...,...,...,...,...,...,...
105,NSF.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_classification_prompt,0.879703,0.873133,0.878243,0.874888,239.332735,128
106,NSF.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,base_prompt,0.880369,0.880452,0.874284,0.876349,63.920336,128
107,NSF.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_summary_prompt,0.887780,0.894188,0.882074,0.886262,137.549927,128
108,NSF.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_classification_prompt,0.876093,0.882923,0.868693,0.873548,221.728992,128


In [18]:
datasets = results_df['dataset_name'].unique()

for dataset in datasets:
    print(f"Dataset: {dataset}")
    sorted_df = results_df[results_df['dataset_name'] == dataset].sort_values(by="mean_test_f1_score", ascending=False)
    sorted_df = sorted_df.drop(["dataset_name", "classifier", "embedding_generation_size"], axis=1)
    
    # Renomeia as colunas usando os nomes já modificados no DataFrame anterior
    sorted_df = sorted_df.rename(columns={
        'model_type': "Tipo de Modelo",
        "model_name": "Nome do Modelo",
        "prompt_name": "Prompt",
        'mean_test_accuracy': 'Acurácia',
        'mean_test_precision': 'Precisão',
        'mean_test_recall': 'Recall',
        'mean_test_f1_score': 'F1 Score',
        'embedding_generation_time': "Tempo de geração de embeddings"
    })
    
    # Arredonda as colunas numéricas para 3 casas decimais
    sorted_df[['Acurácia', 'Precisão', 'Recall', 'F1 Score', 'Tempo de geração de embeddings']] = sorted_df[['Acurácia', 'Precisão', 'Recall', 'F1 Score', 'Tempo de geração de embeddings']].round(3)
    sorted_df = sorted_df.fillna('-')
#     replace = {
#     'base_prompt': 'base prompt',
#     'instruction_classification_prompt': 'instruction classification prompt',
#     'instruction_summary_prompt': 'instruction summary prompt'
# }

#     # Substituir todas as ocorrências conforme o dicionário
#     sorted_df= sorted_df.replace(replace)

    sorted_df = sorted_df.replace({'sentence-transformers_': '',
                                   'McGill-NLP_LLM2Vec-': '',
                                   'base_prompt': 'BP',
                                   'instruction_classification_prompt': 'ICP',
                                   'instruction_summary_prompt': 'ISP'}, 
                                   regex=True)
    display(sorted_df)
    
    # Salva o DataFrame em CSV
    sorted_df.to_csv(f'{base_path}/resume/{dataset}', index=False)

Dataset: Dmoz-Computers.csv


Unnamed: 0,Tipo de Modelo,Nome do Modelo,Prompt,Acurácia,Precisão,Recall,F1 Score,Tempo de geração de embeddings
21,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,BP,0.809,0.809,0.804,0.797,75.441
18,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,BP,0.796,0.794,0.792,0.785,87.927
15,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,BP,0.797,0.798,0.793,0.783,131.421
17,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,ICP,0.791,0.789,0.787,0.778,258.517
16,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,ISP,0.79,0.787,0.786,0.778,153.138
12,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,BP,0.788,0.786,0.785,0.777,76.674
10,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,ISP,0.783,0.785,0.779,0.772,130.051
11,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,ICP,0.781,0.779,0.778,0.77,215.624
4,llm2vec,Sheared-LLaMA-mntp-supervised,ISP,0.779,0.778,0.775,0.769,41.051
6,llm2vec,Sheared-LLaMA-mntp-supervised,BP,0.78,0.777,0.775,0.768,47.868


Dataset: Dmoz-Science.csv


Unnamed: 0,Tipo de Modelo,Nome do Modelo,Prompt,Acurácia,Precisão,Recall,F1 Score,Tempo de geração de embeddings
43,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,BP,0.852,0.858,0.852,0.851,57.135
26,llm2vec,Sheared-LLaMA-mntp-supervised,ISP,0.834,0.84,0.834,0.831,32.557
39,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,ICP,0.833,0.837,0.833,0.83,171.946
37,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,BP,0.832,0.84,0.832,0.829,60.97
36,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,ICP,0.829,0.835,0.829,0.826,154.041
28,llm2vec,Sheared-LLaMA-mntp-supervised,BP,0.828,0.835,0.828,0.825,21.297
25,bert,all-mpnet-base-v2,-,0.824,0.828,0.824,0.822,3.978
34,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,BP,0.824,0.829,0.824,0.822,51.425
40,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,BP,0.821,0.826,0.821,0.818,59.234
38,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,ISP,0.817,0.823,0.817,0.814,117.917


Dataset: Industry Sector.csv


Unnamed: 0,Tipo de Modelo,Nome do Modelo,Prompt,Acurácia,Precisão,Recall,F1 Score,Tempo de geração de embeddings
65,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,BP,0.938,0.941,0.932,0.936,512.522
59,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,BP,0.932,0.937,0.924,0.93,565.922
63,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,ISP,0.925,0.923,0.918,0.92,540.101
58,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,ICP,0.918,0.923,0.909,0.915,636.968
64,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,ICP,0.914,0.921,0.905,0.912,587.214
48,llm2vec,Sheared-LLaMA-mntp-supervised,ISP,0.905,0.909,0.893,0.9,197.315
50,llm2vec,Sheared-LLaMA-mntp-supervised,BP,0.899,0.917,0.883,0.898,179.943
57,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,ISP,0.901,0.907,0.889,0.897,588.889
49,llm2vec,Sheared-LLaMA-mntp-supervised,ICP,0.883,0.913,0.862,0.885,188.775
46,bert,all-MiniLM-L12-v2,-,0.873,0.895,0.854,0.873,10.267


Dataset: review_polarity.csv


Unnamed: 0,Tipo de Modelo,Nome do Modelo,Prompt,Acurácia,Precisão,Recall,F1 Score,Tempo de geração de embeddings
80,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,ICP,0.85,0.868,0.85,0.849,158.961
79,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,ISP,0.844,0.854,0.844,0.843,162.43
86,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,ICP,0.844,0.853,0.844,0.842,169.862
85,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,ISP,0.817,0.822,0.816,0.816,218.696
87,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,BP,0.798,0.801,0.798,0.798,192.309
81,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,BP,0.796,0.802,0.796,0.794,146.74
83,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,ICP,0.777,0.79,0.777,0.774,158.887
84,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,BP,0.772,0.787,0.772,0.769,147.026
76,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,ISP,0.764,0.782,0.764,0.76,160.104
82,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,ISP,0.764,0.786,0.764,0.76,162.444


Dataset: NSF.csv


Unnamed: 0,Tipo de Modelo,Nome do Modelo,Prompt,Acurácia,Precisão,Recall,F1 Score,Tempo de geração de embeddings
109,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,BP,0.898,0.905,0.892,0.896,55.15
100,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,BP,0.894,0.892,0.89,0.89,60.675
98,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,ISP,0.889,0.891,0.886,0.887,117.35
107,llm2vec,Meta-Llama-3-8B-Instruct-mntp-supervised,ISP,0.888,0.894,0.882,0.886,137.55
91,bert,all-mpnet-base-v2,-,0.885,0.889,0.883,0.885,3.712
104,llm2vec,Mistral-7B-Instruct-v2-mntp-unsup-simcse,ISP,0.884,0.886,0.878,0.881,131.128
90,bert,all-MiniLM-L12-v2,-,0.881,0.886,0.876,0.879,2.771
103,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,BP,0.882,0.887,0.875,0.879,79.218
102,llm2vec,Mistral-7B-Instruct-v2-mntp-supervised,ICP,0.881,0.885,0.876,0.878,237.751
99,llm2vec,Meta-Llama-3-8B-Instruct-mntp-unsup-simcse,ICP,0.88,0.885,0.874,0.877,215.68


In [None]:
results_df.columns

Index(['dataset_name', 'model_type', 'model_name', 'classifier', 'prompt_name',
       'mean_test_accuracy', 'mean_test_precision', 'mean_test_recall',
       'mean_test_f1_score', 'embedding_generation_time',
       'embedding_generation_size'],
      dtype='object')

In [None]:
import pdfkit

# Lista de datasets únicos
datasets = results_df['dataset_name'].unique()

columns = ['model_type', 'model_name', 'fit_time', 'score_time', 'test_accuracy',  'test_precision', 'test_recall', 'test_f1_score','embedding_generation_time', 'embeddings_size']

# HTML para o PDF consolidado
html_content = ""

for dataset in datasets:
    # Filtra o DataFrame por dataset
    filtered_df = results_df[results_df['dataset_name'] == dataset]
    filtered_df = filtered_df[columns]
    filtered_df = filtered_df.sort_values(by = "test_f1_score", ascending = False)
    
    # Adiciona título e conteúdo do DataFrame ao HTML
    html_content += f"<h2>Dataset: {dataset}</h2>"
    html_content += filtered_df.to_html(index=False)
    html_content += "<br><br>"  # Adiciona um espaço entre os datasets

# Gera o PDF consolidado
pdf_filename = "consolidated_report.pdf"
pdfkit.from_string(html_content, pdf_filename)
print(f"PDF gerado: {pdf_filename}")

KeyError: "['fit_time', 'score_time', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1_score', 'embeddings_size'] not in index"