In [1]:
%cd ../..

/home/matheus/Desktop/Itens/Projetos/paper - llm embeddings to classification - marcacini - matheus


In [2]:
import os
import numpy as np
import pandas as pd
import json

# Configura o Pandas para exibir todas as colunas
pd.set_option('display.max_columns', None)

def load_results_to_dataframe(base_path: str) -> pd.DataFrame:
    """
    Load results from JSON files into a pandas DataFrame.

    This function traverses the directory structure under the given base path, 
    finds the results.json files, calculates the average for list values, 
    and compiles the results into a DataFrame with additional metadata 
    such as dataset name, model type, and model name.

    Parameters:
    - base_path (str): The base directory path containing the results.

    Returns:
    - pd.DataFrame: A DataFrame containing the consolidated results.
    """
    results = []
    
    # Traverse the directory structure
    for dataset_name in os.listdir(base_path):
        dataset_path = os.path.join(base_path, dataset_name)
        if os.path.isdir(dataset_path):
            for model_type in os.listdir(dataset_path):
                model_type_path = os.path.join(dataset_path, model_type)
                
                if os.path.isdir(model_type_path):
                    for model_name in os.listdir(model_type_path):
                        model_name_path = os.path.join(model_type_path, model_name)
                        json_file_path = os.path.join(model_name_path, 'results.json')
                        
                        embeddings_path = os.path.join(model_name_path, 'embeddings.npy')
                        if(os.path.exists(embeddings_path)):
                            embeddings_size = os.path.getsize(embeddings_path)
                        else:
                            embeddings_size = None
                        # Check if results.json file exists
                        if os.path.isfile(json_file_path):
                            with open(json_file_path, 'r') as json_file:
                                result_data = json.load(json_file)
                                
                                # Calculate the mean for each list
                                for key in result_data.keys():
                                    if isinstance(result_data[key], list):
                                        result_data[key] = sum(result_data[key]) / len(result_data[key])
                                
                                # Add metadata to the result data
                                result_data['dataset_name'] = dataset_name
                                result_data['model_type'] = model_type
                                result_data['model_name'] = model_name
                                result_data['embeddings_size'] = embeddings_size
                                results.append(result_data)

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    # Specify the order of the columns, putting 'dataset_name', 'model_type', and 'model_name' first
    columns_first = ['dataset_name', 'model_type', 'model_name']
    column_order = columns_first + [col for col in results_df.columns if col not in columns_first]
    results_df = results_df[column_order]
    
    return results_df

In [4]:
# Usage
base_path = 'results' 
results_df = load_results_to_dataframe(base_path)

print("DataFrame of Results:")
display(results_df)

# Save the DataFrame to a CSV file
results_df.to_csv(f'{base_path}/result_resume.csv', index=False)  # Include index=False to avoid saving the index as a column

DataFrame of Results:


Unnamed: 0,dataset_name,model_type,model_name,fit_time,score_time,test_accuracy,train_accuracy,test_precision,train_precision,test_recall,train_recall,test_f1_score,train_f1_score,embedding_generation_time,embeddings_size
0,SyskillWebert.csv,bert,sentence-transformers_all-MiniLM-L6-v2,0.000506,0.017229,0.913116,0.929647,0.910773,0.930965,0.896079,0.915246,0.898112,0.921168,2.715869,513152
1,SyskillWebert.csv,bert,sentence-transformers_all-MiniLM-L12-v2,0.000531,0.016526,0.910176,0.929641,0.915087,0.931200,0.896763,0.917989,0.901097,0.923040,2.643802,513152
2,SyskillWebert.csv,bert,sentence-transformers_all-mpnet-base-v2,0.000780,0.019538,0.901085,0.913170,0.904716,0.914158,0.884785,0.896956,0.888635,0.902096,3.382855,1026176
3,Dmoz-Computers.csv,bert,sentence-transformers_all-MiniLM-L6-v2,0.008594,0.376299,0.721474,0.773211,0.717329,0.775273,0.715111,0.767667,0.705807,0.760638,6.331421,14592128
4,Dmoz-Computers.csv,bert,sentence-transformers_all-MiniLM-L12-v2,0.009866,0.428401,0.719474,0.771974,0.714759,0.774625,0.711611,0.765375,0.699790,0.757391,3.932108,14592128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-unsup-si...,0.044932,0.394923,0.737021,0.782058,0.603201,0.734023,0.653469,0.698658,0.599817,0.652680,152.077099,67846272
98,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,0.175954,0.741229,0.724947,0.778858,0.587702,0.735252,0.658053,0.707258,0.586608,0.652269,479.219994,135692416
99,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,0.097515,0.716436,0.698620,0.760565,0.591510,0.729517,0.623665,0.681909,0.554618,0.628967,518.274609,135692416
100,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,0.108003,0.595790,0.738227,0.788880,0.611827,0.744922,0.660979,0.711210,0.598923,0.667003,518.035170,135692416


In [None]:
datasets = results_df['dataset_name'].unique()

for dataset in datasets:
    print(f"Dataset: {dataset}")
    display(results_df[results_df['dataset_name'] == dataset])

In [8]:
results_df.columns

Index(['dataset_name', 'model_type', 'model_name', 'fit_time', 'score_time',
       'test_accuracy', 'train_accuracy', 'test_precision', 'train_precision',
       'test_recall', 'train_recall', 'test_f1_score', 'train_f1_score',
       'embedding_generation_time', 'embeddings_size'],
      dtype='object')

In [10]:
import pdfkit

# Lista de datasets únicos
datasets = results_df['dataset_name'].unique()

columns = ['model_type', 'model_name', 'fit_time', 'score_time', 'test_accuracy',  'test_precision', 'test_recall', 'test_f1_score','embedding_generation_time', 'embeddings_size']

# HTML para o PDF consolidado
html_content = ""

for dataset in datasets:
    # Filtra o DataFrame por dataset
    filtered_df = results_df[results_df['dataset_name'] == dataset]
    filtered_df = filtered_df[columns]
    filtered_df = filtered_df.sort_values(by = "test_f1_score", ascending = False)
    
    # Adiciona título e conteúdo do DataFrame ao HTML
    html_content += f"<h2>Dataset: {dataset}</h2>"
    html_content += filtered_df.to_html(index=False)
    html_content += "<br><br>"  # Adiciona um espaço entre os datasets

# Gera o PDF consolidado
pdf_filename = "consolidated_report.pdf"
pdfkit.from_string(html_content, pdf_filename)
print(f"PDF gerado: {pdf_filename}")

PDF gerado: consolidated_report.pdf
