In [1]:
%cd ../..

/home/matheus/Desktop/Itens/Projetos/llm2vec-embeddings-classification


In [13]:
import os
import numpy as np
import pandas as pd
from src.core.utils import read_json

# Configura o Pandas para exibir todas as colunas
pd.set_option('display.max_columns', None)

def load_results_to_dataframe(base_path: str) -> pd.DataFrame:
    """
    Load results from JSON files into a pandas DataFrame.
    """
    results = []
    
    # Traverse the directory structure
    for dataset_name in os.listdir(base_path):
        dataset_path = os.path.join(base_path, dataset_name)
        if os.path.isdir(dataset_path):
            for model_type in os.listdir(dataset_path):
                model_type_path = os.path.join(dataset_path, model_type)
                
                if os.path.isdir(model_type_path):
                    for model_name in os.listdir(model_type_path):
                        model_name_path = os.path.join(model_type_path, model_name)
                        
                        # Define paths based on whether prompt_name is needed
                        if model_type != "bert":
                            subdirs = [os.path.join(model_name_path, prompt) for prompt in os.listdir(model_name_path)]
                        else:
                            subdirs = [model_name_path]
                        
                        # Process results.json files from determined paths
                        for subdir in subdirs:
                            for classifier in os.listdir(subdir):
                                classifier_path = os.path.join(subdir, classifier)
                                
                                # Check for the results.json in the classifier path
                                json_file_path = os.path.join(classifier_path, 'results.json')
                                
                                if os.path.isfile(json_file_path):
                                    result_data = read_json(json_file_path)

                                    keys_to_extract = ['mean_test_accuracy', 'mean_test_precision', 'mean_test_recall', 'mean_test_f1_score', 'embedding_generation_time', 'embedding_generation_size']
    
                                    # Extrai apenas as chaves especificadas
                                    result_data= {key: result_data.get(key) for key in keys_to_extract}
                                    
                                    # Add metadata to the result data
                                    result_data['dataset_name'] = dataset_name
                                    result_data['model_type'] = model_type
                                    result_data['model_name'] = model_name
                                    result_data['classifier'] = classifier
                                    
                                    # Add prompt_name if applicable
                                    if model_type != "bert":
                                        result_data['prompt_name'] = os.path.basename(subdir)
                                    else:
                                        result_data['prompt_name'] = None
                                    
                                    results.append(result_data)

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)
    # Specify the order of the columns
    columns_first = ['dataset_name', 'model_type', 'model_name', 'classifier']
    if 'prompt_name' in results_df.columns:
        columns_first.append('prompt_name')
    column_order = columns_first + [col for col in results_df.columns if col not in columns_first]
    results_df = results_df[column_order]
    
    return results_df



In [14]:
# Usage
base_path = 'results' 
results_df = load_results_to_dataframe(base_path)

print("DataFrame of Results:")
display(results_df)

# Save the DataFrame to a CSV file
results_df.to_csv(f'{base_path}/result_resume.csv', index=False)  # Include index=False to avoid saving the index as a column

DataFrame of Results:


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
0,SyskillWebert.csv,llama_cpp,lmstudio-community_Yi-1.5-9B-Chat-GGUF,knn,instruction_summary_prompt,0.757440,0.708869,0.715215,0.699426,19.917713,2872
1,SyskillWebert.csv,llama_cpp,lmstudio-community_Yi-1.5-9B-Chat-GGUF,knn,instruction_classification_prompt,0.739484,0.686043,0.687298,0.671230,20.202653,2872
2,SyskillWebert.csv,llama_cpp,lmstudio-community_Yi-1.5-9B-Chat-GGUF,knn,base_prompt,0.763501,0.725899,0.720255,0.706419,37.360954,2872
3,SyskillWebert.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,instruction_summary_prompt,0.811443,0.782970,0.776402,0.767784,18.502734,2872
4,SyskillWebert.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,instruction_classification_prompt,0.802488,0.766386,0.764351,0.757539,18.591208,2872
...,...,...,...,...,...,...,...,...,...,...,...
169,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_classification_prompt,0.706833,0.592763,0.641611,0.591427,580.431798,128
170,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,base_prompt,0.696933,0.574906,0.627988,0.578781,510.945833,128
171,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_summary_prompt,0.720598,0.603699,0.606670,0.567119,511.821210,128
172,webkb-parsed.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_classification_prompt,0.712991,0.675711,0.582438,0.558684,536.509338,128


In [17]:
datasets = results_df['dataset_name'].unique()

for dataset in datasets:
    print(f"Dataset: {dataset}")
    display(results_df[results_df['dataset_name'] == dataset].sort_values(by = "mean_test_f1_score", ascending = False))

Dataset: SyskillWebert.csv


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
14,SyskillWebert.csv,bert,sentence-transformers_all-MiniLM-L12-v2,knn,,0.922071,0.931067,0.908643,0.912636,3.686479,513152
13,SyskillWebert.csv,bert,sentence-transformers_all-MiniLM-L6-v2,knn,,0.918905,0.915803,0.905289,0.906802,3.242647,513152
15,SyskillWebert.csv,bert,sentence-transformers_all-mpnet-base-v2,knn,,0.910086,0.909914,0.895729,0.898432,6.093098,1026176
12,SyskillWebert.csv,bert,sentence-transformers_all-distilroberta-v1,knn,,0.901131,0.898916,0.88128,0.884424,4.694741,1026176
3,SyskillWebert.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,instruction_summary_prompt,0.811443,0.78297,0.776402,0.767784,18.502734,2872
6,SyskillWebert.csv,llama_cpp,lmstudio-community_Llama-3.2-1B-Instruct-GGUF,knn,instruction_summary_prompt,0.808548,0.779025,0.772318,0.761905,12.975115,2872
10,SyskillWebert.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,instruction_classification_prompt,0.805382,0.764847,0.772149,0.761321,23.050879,2872
4,SyskillWebert.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,instruction_classification_prompt,0.802488,0.766386,0.764351,0.757539,18.591208,2872
7,SyskillWebert.csv,llama_cpp,lmstudio-community_Llama-3.2-1B-Instruct-GGUF,knn,instruction_classification_prompt,0.802533,0.768764,0.764076,0.756371,7.863297,2872
8,SyskillWebert.csv,llama_cpp,lmstudio-community_Llama-3.2-1B-Instruct-GGUF,knn,base_prompt,0.802578,0.770141,0.764626,0.753824,7.890748,2872


Dataset: Dmoz-Science.csv


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
49,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,base_prompt,0.845333,0.851068,0.845333,0.842068,100.084914,128
32,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,instruction_summary_prompt,0.826333,0.835726,0.826333,0.821697,30.165472,128
43,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,base_prompt,0.819,0.829449,0.819,0.813296,141.51906,128
45,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_classification_prompt,0.818333,0.824975,0.818333,0.813207,148.700603,128
31,Dmoz-Science.csv,bert,sentence-transformers_all-mpnet-base-v2,knn,,0.815833,0.821017,0.815833,0.812649,8.183289,18432128
34,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,base_prompt,0.816333,0.825889,0.816333,0.810823,44.37809,128
42,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_classification_prompt,0.813,0.824832,0.813,0.807733,149.389468,128
47,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_summary_prompt,0.801333,0.810236,0.801333,0.794887,88.715101,128
40,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,base_prompt,0.799167,0.80329,0.799167,0.794855,53.488042,128
38,Dmoz-Science.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_summary_prompt,0.7955,0.800411,0.7955,0.791738,88.544759,128


Dataset: Industry Sector.csv


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
83,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,base_prompt,0.927074,0.932435,0.920967,0.925592,566.160272,128
77,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,base_prompt,0.918454,0.927758,0.910506,0.918253,600.084488,128
76,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_classification_prompt,0.905296,0.910395,0.896863,0.902073,670.947782,128
81,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_summary_prompt,0.899286,0.908949,0.88878,0.897444,577.014513,128
82,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_classification_prompt,0.890213,0.90775,0.87436,0.889181,623.375018,128
66,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,instruction_summary_prompt,0.887831,0.902754,0.869027,0.883949,171.938547,128
68,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,base_prompt,0.884768,0.898902,0.869224,0.882456,179.534644,128
75,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_summary_prompt,0.887149,0.883309,0.871519,0.875129,618.219903,128
67,Industry Sector.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,instruction_classification_prompt,0.87422,0.887417,0.854807,0.867747,188.215892,128
64,Industry Sector.csv,bert,sentence-transformers_all-MiniLM-L12-v2,knn,,0.857775,0.877562,0.841172,0.856885,11.493757,13543040


Dataset: review_polarity.csv


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
116,review_polarity.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_classification_prompt,0.811,0.819453,0.811,0.809688,159.833298,128
110,review_polarity.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_classification_prompt,0.8075,0.834309,0.8075,0.80333,160.81822,128
109,review_polarity.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,instruction_summary_prompt,0.8045,0.824316,0.8045,0.801344,165.681597,128
115,review_polarity.csv,llm2vec,McGill-NLP_LLM2Vec-Meta-Llama-3-8B-Instruct-mn...,knn,instruction_summary_prompt,0.758,0.763702,0.758,0.75665,224.535201,128
88,review_polarity.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,instruction_classification_prompt,0.758,0.775615,0.758,0.754089,120.253291,16184
94,review_polarity.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,instruction_classification_prompt,0.758,0.77775,0.758,0.753518,152.858995,16184
95,review_polarity.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,base_prompt,0.757,0.774736,0.757,0.752927,199.012429,16184
89,review_polarity.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,base_prompt,0.758,0.786394,0.758,0.751731,141.925606,16184
111,review_polarity.csv,llm2vec,McGill-NLP_LLM2Vec-Mistral-7B-Instruct-v2-mntp...,knn,base_prompt,0.7505,0.762974,0.7505,0.74741,188.478727,128
93,review_polarity.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,instruction_summary_prompt,0.7505,0.769331,0.7505,0.745904,152.876014,16184


Dataset: re8.csv


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
121,re8.csv,bert,sentence-transformers_all-mpnet-base-v2,knn,,0.966771,0.93418,0.912921,0.921816,17.896263,23574656
120,re8.csv,bert,sentence-transformers_all-MiniLM-L12-v2,knn,,0.958301,0.919397,0.901162,0.908951,7.901442,11787392
122,re8.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,instruction_summary_prompt,0.964296,0.922724,0.894898,0.906457,74.475271,128
119,re8.csv,bert,sentence-transformers_all-MiniLM-L6-v2,knn,,0.951394,0.91986,0.892406,0.903772,7.290929,11787392
124,re8.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,base_prompt,0.961038,0.920731,0.891082,0.902881,83.066637,128
118,re8.csv,bert,sentence-transformers_all-distilroberta-v1,knn,,0.956607,0.917567,0.874305,0.892011,13.268178,23574656
123,re8.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-supervised,knn,instruction_classification_prompt,0.963644,0.922482,0.860897,0.883722,89.451818,128
125,re8.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-unsup-si...,knn,instruction_summary_prompt,0.955955,0.909238,0.863052,0.88033,73.774339,128
127,re8.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-unsup-si...,knn,base_prompt,0.956607,0.916774,0.856198,0.877776,60.174077,128
126,re8.csv,llm2vec,McGill-NLP_LLM2Vec-Sheared-LLaMA-mntp-unsup-si...,knn,instruction_classification_prompt,0.956216,0.914719,0.856806,0.877712,82.459512,128


Dataset: NSF.csv


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
139,NSF.csv,bert,sentence-transformers_all-mpnet-base-v2,knn,,0.876093,0.881889,0.873431,0.876192,4.746066,32329856
138,NSF.csv,bert,sentence-transformers_all-MiniLM-L12-v2,knn,,0.868016,0.877737,0.86132,0.867218,3.737107,16164992
136,NSF.csv,bert,sentence-transformers_all-distilroberta-v1,knn,,0.86279,0.873874,0.855803,0.862538,3.65145,32329856
137,NSF.csv,bert,sentence-transformers_all-MiniLM-L6-v2,knn,,0.863075,0.869609,0.856151,0.86052,3.254418,16164992
135,NSF.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,base_prompt,0.830293,0.854868,0.81253,0.828544,332.487096,85176
134,NSF.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,instruction_classification_prompt,0.83552,0.848063,0.818194,0.82794,413.409625,85176
133,NSF.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,instruction_summary_prompt,0.830673,0.847927,0.809577,0.824066,384.820848,85176
132,NSF.csv,llama_cpp,lmstudio-community_Llama-3.2-1B-Instruct-GGUF,knn,base_prompt,0.764159,0.786573,0.728902,0.749448,173.259291,85176
130,NSF.csv,llama_cpp,lmstudio-community_Llama-3.2-1B-Instruct-GGUF,knn,instruction_summary_prompt,0.761688,0.780333,0.726843,0.74579,167.699419,85176
129,NSF.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,base_prompt,0.760357,0.778469,0.725451,0.743237,298.389728,85176


Dataset: webkb-parsed.csv


Unnamed: 0,dataset_name,model_type,model_name,classifier,prompt_name,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,embedding_generation_time,embedding_generation_size
144,webkb-parsed.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,instruction_classification_prompt,0.769502,0.69509,0.68451,0.668342,438.750308,67224
143,webkb-parsed.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,instruction_summary_prompt,0.768294,0.692435,0.686981,0.664842,446.703874,67224
150,webkb-parsed.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,instruction_classification_prompt,0.77264,0.694771,0.671986,0.662053,551.262365,67224
145,webkb-parsed.csv,llama_cpp,bartowski_Mistral-7B-Instruct-v0.3-GGUF,knn,base_prompt,0.755497,0.675268,0.681675,0.656334,421.376754,67224
149,webkb-parsed.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,instruction_summary_prompt,0.763826,0.690522,0.672353,0.656296,543.63926,67224
151,webkb-parsed.csv,llama_cpp,SanctumAI_gemma-2-9b-it-GGUF,knn,base_prompt,0.75779,0.685572,0.665394,0.65403,573.430826,67224
148,webkb-parsed.csv,llama_cpp,lmstudio-community_Llama-3.2-1B-Instruct-GGUF,knn,base_prompt,0.740523,0.665755,0.652875,0.63383,174.230829,67224
141,webkb-parsed.csv,llama_cpp,lmstudio-community_Yi-1.5-9B-Chat-GGUF,knn,instruction_classification_prompt,0.746198,0.672038,0.633611,0.619185,473.747154,67224
142,webkb-parsed.csv,llama_cpp,lmstudio-community_Yi-1.5-9B-Chat-GGUF,knn,base_prompt,0.73219,0.625857,0.640304,0.618607,468.696706,67224
146,webkb-parsed.csv,llama_cpp,lmstudio-community_Llama-3.2-1B-Instruct-GGUF,knn,instruction_summary_prompt,0.73183,0.639256,0.639962,0.617037,173.794273,67224


In [8]:
results_df.columns

Index(['dataset_name', 'model_type', 'model_name', 'fit_time', 'score_time',
       'test_accuracy', 'train_accuracy', 'test_precision', 'train_precision',
       'test_recall', 'train_recall', 'test_f1_score', 'train_f1_score',
       'embedding_generation_time', 'embeddings_size'],
      dtype='object')

In [10]:
import pdfkit

# Lista de datasets únicos
datasets = results_df['dataset_name'].unique()

columns = ['model_type', 'model_name', 'fit_time', 'score_time', 'test_accuracy',  'test_precision', 'test_recall', 'test_f1_score','embedding_generation_time', 'embeddings_size']

# HTML para o PDF consolidado
html_content = ""

for dataset in datasets:
    # Filtra o DataFrame por dataset
    filtered_df = results_df[results_df['dataset_name'] == dataset]
    filtered_df = filtered_df[columns]
    filtered_df = filtered_df.sort_values(by = "test_f1_score", ascending = False)
    
    # Adiciona título e conteúdo do DataFrame ao HTML
    html_content += f"<h2>Dataset: {dataset}</h2>"
    html_content += filtered_df.to_html(index=False)
    html_content += "<br><br>"  # Adiciona um espaço entre os datasets

# Gera o PDF consolidado
pdf_filename = "consolidated_report.pdf"
pdfkit.from_string(html_content, pdf_filename)
print(f"PDF gerado: {pdf_filename}")

PDF gerado: consolidated_report.pdf
