In [1]:
# `config` is imported first: importing it emits the "Loading environment variables"
# log record shown in this cell's output.
# NOTE(review): it presumably also configures the logging format/handlers used by
# every cell below — confirm in config.py.
import config

import logging
import numpy as np
import pandas as pd

# Notebook-level logger; its records show up tagged `__main__` in the cell outputs.
logger = logging.getLogger(__name__)

2023-05-30 17:45:18,560 - INFO     | config     | Loading environment variables


In [2]:
import glob
import pickle
import re

# Load every pickled test-embeddings file under embeddings/ into `embeddings_data`,
# keyed by the part of the file name that precedes the first underscore
# (e.g. 'embeddings/glove_embeddings_test.pkl' -> 'glove').
files = glob.glob('embeddings/*_test.pkl')
embeddings_data = {}

for file in files:
    logger.info(f'Loading {file}')
    # Accept both '/' and '\' as the directory separator so the name is also
    # extracted from the paths glob returns on Windows.
    match_obj = re.search(r'[\\/](?P<output>[^_/\\]+)_', file)
    if match_obj is None:
        # Previously this case crashed with an AttributeError on `.group`.
        logger.warning(f'Skipping {file}: cannot derive an embeddings-model name from the path')
        continue
    filename = match_obj.group('output')
    with open(file, 'rb') as fp:
        # SECURITY: pickle.load can execute arbitrary code while deserialising;
        # only point this at files produced by this project.
        # The result is stored directly instead of being bound to a variable named
        # `dict`, which shadowed the builtin for every later cell in the kernel.
        embeddings_data[filename] = pickle.load(fp)

2023-05-30 17:45:18,906 - INFO     | __main__   | Loading embeddings/glove_embeddings_test.pkl
2023-05-30 17:45:18,909 - INFO     | __main__   | Loading embeddings/mpnet_embeddings_test.pkl
2023-05-30 17:45:18,911 - INFO     | __main__   | Loading embeddings/distil_embeddings_test.pkl
2023-05-30 17:45:18,914 - INFO     | __main__   | Loading embeddings/wiki_embeddings_test.pkl
2023-05-30 17:45:18,916 - INFO     | __main__   | Loading embeddings/w2v_embeddings_test.pkl


# Clustering with PCA

In [3]:
from helpers.clustering_helpers import clustering_scores, dbscan_loop, kmeans_loop
from helpers.data_viz import plot_clusters

In [4]:
%%time
# Run the DBSCAN search for every (embeddings model, LLM) pair and collect one
# score per pair. Each result key records the embeddings model, the LLM, and the
# two settings returned by dbscan_loop (stringified).
# NOTE(review): dbscan_loop presumably returns the best score together with the
# n_components / min_samples that produced it — confirm in helpers.clustering_helpers.
dbscan_overall_results = {}

for embeddings_model in embeddings_data:
    logger.info(f'{embeddings_model.upper()}')
    embeddings_list = embeddings_data[embeddings_model]

    for llm in embeddings_list:
        logger.info(f'---{llm.upper()}')
        embeddings = embeddings_list[llm]

        dbscan_best = dbscan_loop(
            data=embeddings,
            n_components_space=[16, 32, 64, 100],
            min_samples_space=[5, 10, 15, 20],
        )
        score, components, samples = dbscan_best

        result_key = (embeddings_model, llm, str(components), str(samples))
        dbscan_overall_results[result_key] = score

2023-05-30 17:45:19,537 - INFO     | __main__   | GLOVE
2023-05-30 17:45:19,538 - INFO     | __main__   | ---BLOOM
2023-05-30 17:45:21,253 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:45:21,799 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:45:22,031 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:45:22,272 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:45:22,524 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:45:23,883 - INFO     | __main__   | MPNET
2023-05-30 17:45:23,884 - INFO     | __main__   | ---BLOOM
2023-05-30 17:45:28,904 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:45:33,868 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:45:38,687 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:45:42,755 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:45:46,985 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:45:51,654 - INFO     | __main__   | DISTIL
2023-05-30 17:45:51,655 - INFO     | __main__   | ---BLOOM
2023-05-30 17:45:55,820 - 

CPU times: user 7min 28s, sys: 2min 15s, total: 9min 44s
Wall time: 1min 15s


In [8]:
%%time
# Run the KMeans search for every (embeddings model, LLM) pair and collect one
# score per pair. Each result key records the embeddings model, the LLM, and the
# four settings returned by kmeans_loop (stringified).
# NOTE(review): kmeans_loop presumably returns the best score together with the
# settings that produced it — confirm in helpers.clustering_helpers.
# NOTE(review): the saved output of the results table shows TOL=0.00505, which is
# not in the grid below, so that output looks stale — re-run before trusting it.
kmeans_overall_results = {}
for embeddings_model, embeddings_list in embeddings_data.items():
    logger.info(f'{embeddings_model.upper()}')
    for llm, embeddings in embeddings_list.items():
        logger.info(f'---{llm.upper()}')
        # The call was previously missing its closing parenthesis (SyntaxError).
        score, components, iters, n_cluster, tols = kmeans_loop(
            data=embeddings,
            n_components_space=[16, 32, 64, 100],
            n_clusters_space=[2, 3, 4],
            max_iter_space=[100, 250],
            # 1e-2 was listed twice, which only re-evaluated the same setting.
            # NOTE(review): confirm no other value (e.g. 1e-1) was intended.
            tol_space=[1e-4, 1e-3, 1e-2],
        )
        kmeans_overall_results[(embeddings_model, llm, str(components), str(iters), str(n_cluster), str(tols))] = score

2023-05-30 17:52:37,430 - INFO     | __main__   | GLOVE
2023-05-30 17:52:37,434 - INFO     | __main__   | ---BLOOM
2023-05-30 17:52:50,181 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:53:06,176 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:53:19,649 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:53:34,335 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:53:41,421 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:53:55,922 - INFO     | __main__   | MPNET
2023-05-30 17:53:55,923 - INFO     | __main__   | ---BLOOM
2023-05-30 17:54:32,120 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:55:06,694 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:55:43,173 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:56:16,264 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:56:48,685 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:57:24,198 - INFO     | __main__   | DISTIL
2023-05-30 17:57:24,199 - INFO     | __main__   | ---BLOOM
2023-05-30 17:58:05,188 - 

CPU times: user 1h 17min 57s, sys: 12min 21s, total: 1h 30min 18s
Wall time: 11min 39s


In [9]:
# Tabulate the DBSCAN results: one row per result key, with the score as the only
# column. The keys are 4-tuples, so they become a four-level row index.
dbscan_index = pd.MultiIndex.from_tuples(
    list(dbscan_overall_results.keys()),
    names=['EmbeddingsModel', 'LLM', 'PCA', 'MIN_SAMPLES'],
)
dbscan_df = pd.DataFrame(
    {'Score': list(dbscan_overall_results.values())},
    index=dbscan_index,
)

# Render the table with the notebook's rich display
display(dbscan_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Score
EmbeddingsModel,LLM,PCA,MIN_SAMPLES,Unnamed: 4_level_1
glove,bloom,16,5,0.705346
glove,alpaca_3b,16,5,-0.095829
glove,alpaca_770m,16,5,-inf
glove,llama_13b,16,5,-inf
glove,gpt4all,16,5,-inf
glove,llama_7b,16,5,0.252066
mpnet,bloom,16,5,0.752365
mpnet,alpaca_3b,16,5,0.353684
mpnet,alpaca_770m,16,5,0.272236
mpnet,llama_13b,16,5,0.305975


In [10]:
# Tabulate the KMeans results: one row per result key, with the score as the only
# column. The keys are 6-tuples, so they become a six-level row index.
kmeans_level_names = ['EmbeddingsModel', 'LLM', 'PCA', 'MAX_ITERS', 'CLUSTERS', 'TOL']
kmeans_index = pd.MultiIndex.from_tuples(
    list(kmeans_overall_results.keys()),
    names=kmeans_level_names,
)
kmeans_df = pd.DataFrame(
    {'Score': list(kmeans_overall_results.values())},
    index=kmeans_index,
)

# Render the table with the notebook's rich display
display(kmeans_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Score
EmbeddingsModel,LLM,PCA,MAX_ITERS,CLUSTERS,TOL,Unnamed: 6_level_1
glove,bloom,16,100,3,0.0001,0.757382
glove,alpaca_3b,16,100,2,0.0001,0.144175
glove,alpaca_770m,64,100,2,0.0001,0.463143
glove,llama_13b,64,100,2,0.0001,0.131586
glove,gpt4all,32,100,2,0.0001,0.130505
glove,llama_7b,16,100,2,0.0001,0.362397
mpnet,bloom,16,100,2,0.0001,0.779275
mpnet,alpaca_3b,16,100,2,0.0001,0.206025
mpnet,alpaca_770m,32,100,3,0.00505,0.170061
mpnet,llama_13b,64,100,3,0.0001,0.115189
