In [1]:
import config

import logging
import numpy as np
import pandas as pd

# Logger for this notebook's own messages; they appear under the name
# '__main__' in the log output of the cells that use it.
logger = logging.getLogger(__name__)

2023-10-03 08:34:51,020 - INFO     | config     | Loading environment variables


In [2]:
import glob
import pickle
import re

# Load every pickled test-embedding file under embeddings/ into
# `embeddings_data`, keyed by the part of the file name before the first
# underscore (e.g. 'embeddings/glove_embeddings_test.pkl' -> 'glove').
#
# NOTE(review): pickle.load can execute arbitrary code; only run this on
# embedding files produced by this project.
files = glob.glob('embeddings/*_test.pkl')
embeddings_data = {}

for file in files:
    logger.info(f'Loading {file}')
    # Accept both '/' and '\\' as the directory separator so paths returned by
    # glob on Windows match as well.
    match_obj = re.search(r'[\\/](?P<output>[^_\\/]+)_', file)
    if match_obj is None:
        # Without this guard an unexpected file name would raise
        # AttributeError on `.group` and abort the whole load.
        logger.warning(f'Skipping {file}: cannot derive an embeddings name from the path')
        continue
    filename = match_obj.group('output')
    with open(file, 'rb') as fp:
        # Store the payload directly; the original bound it to the name
        # `dict`, shadowing the builtin for the rest of the kernel session.
        embeddings_data[filename] = pickle.load(fp)

2023-05-30 17:45:18,906 - INFO     | __main__   | Loading embeddings/glove_embeddings_test.pkl
2023-05-30 17:45:18,909 - INFO     | __main__   | Loading embeddings/mpnet_embeddings_test.pkl
2023-05-30 17:45:18,911 - INFO     | __main__   | Loading embeddings/distil_embeddings_test.pkl
2023-05-30 17:45:18,914 - INFO     | __main__   | Loading embeddings/wiki_embeddings_test.pkl
2023-05-30 17:45:18,916 - INFO     | __main__   | Loading embeddings/w2v_embeddings_test.pkl


# Clustering with PCA

In [4]:
from helpers.clustering_helpers import dbscan_loop, kmeans_loop

In [4]:
%%time
dbscan_overall_results = {}
for embeddings_model, embeddings_list in embeddings_data.items():
    logger.info(f'{embeddings_model.upper()}')
    for llm, embeddings in embeddings_list.items():
        logger.info(f'---{llm.upper()}')
        score, components, samples = dbscan_loop(data=embeddings,
                                                 n_components_space=[2, 4, 8, 16, 32, 64, 100],
                                                 min_samples_space = [5, 10, 15, 20])
        
        dbscan_overall_results[(embeddings_model, llm, str(components), str(samples))] = score

2023-05-30 17:45:19,537 - INFO     | __main__   | GLOVE
2023-05-30 17:45:19,538 - INFO     | __main__   | ---BLOOM
2023-05-30 17:45:21,253 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:45:21,799 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:45:22,031 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:45:22,272 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:45:22,524 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:45:23,883 - INFO     | __main__   | MPNET
2023-05-30 17:45:23,884 - INFO     | __main__   | ---BLOOM
2023-05-30 17:45:28,904 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:45:33,868 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:45:38,687 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:45:42,755 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:45:46,985 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:45:51,654 - INFO     | __main__   | DISTIL
2023-05-30 17:45:51,655 - INFO     | __main__   | ---BLOOM
2023-05-30 17:45:55,820 - 

CPU times: user 7min 28s, sys: 2min 15s, total: 9min 44s
Wall time: 1min 15s


In [8]:
%%time
kmeans_overall_results = {}
for embeddings_model, embeddings_list in embeddings_data.items():
    logger.info(f'{embeddings_model.upper()}')
    for llm, embeddings in embeddings_list.items():
        logger.info(f'---{llm.upper()}')
        score, components, iters, n_cluster, tols = kmeans_loop(data=embeddings,
                                                                n_components_space=[2, 4, 8, 16, 32, 64, 100],
                                                                n_clusters_space = [2, 3, 4, 5],
                                                                max_iter_space = [100, 250, 500],
                                                                tol_space = [1e-4, 1e-3, 1e-2, 1e-2]
        kmeans_overall_results[(embeddings_model, llm, str(components), str(iters), str(n_cluster), str(tols))] = score

2023-05-30 17:52:37,430 - INFO     | __main__   | GLOVE
2023-05-30 17:52:37,434 - INFO     | __main__   | ---BLOOM
2023-05-30 17:52:50,181 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:53:06,176 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:53:19,649 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:53:34,335 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:53:41,421 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:53:55,922 - INFO     | __main__   | MPNET
2023-05-30 17:53:55,923 - INFO     | __main__   | ---BLOOM
2023-05-30 17:54:32,120 - INFO     | __main__   | ---ALPACA_3B
2023-05-30 17:55:06,694 - INFO     | __main__   | ---ALPACA_770M
2023-05-30 17:55:43,173 - INFO     | __main__   | ---LLAMA_13B
2023-05-30 17:56:16,264 - INFO     | __main__   | ---GPT4ALL
2023-05-30 17:56:48,685 - INFO     | __main__   | ---LLAMA_7B
2023-05-30 17:57:24,198 - INFO     | __main__   | DISTIL
2023-05-30 17:57:24,199 - INFO     | __main__   | ---BLOOM
2023-05-30 17:58:05,188 - 

CPU times: user 1h 17min 57s, sys: 12min 21s, total: 1h 30min 18s
Wall time: 11min 39s


In [9]:
# Turn the {(model, llm, pca, min_samples): score} mapping into a one-column
# frame whose tuple keys become a named MultiIndex.
dbscan_index_names = ['EmbeddingsModel', 'LLM', 'PCA', 'MIN_SAMPLES']
dbscan_df = pd.DataFrame.from_dict(
    dbscan_overall_results,
    orient='index',
    columns=['Score'],
)
dbscan_df.index = pd.MultiIndex.from_tuples(dbscan_df.index, names=dbscan_index_names)

# Render the table in the notebook
display(dbscan_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Score
EmbeddingsModel,LLM,PCA,MIN_SAMPLES,Unnamed: 4_level_1
glove,bloom,16,5,0.705346
glove,alpaca_3b,16,5,-0.095829
glove,alpaca_770m,16,5,-inf
glove,llama_13b,16,5,-inf
glove,gpt4all,16,5,-inf
glove,llama_7b,16,5,0.252066
mpnet,bloom,16,5,0.752365
mpnet,alpaca_3b,16,5,0.353684
mpnet,alpaca_770m,16,5,0.272236
mpnet,llama_13b,16,5,0.305975


In [10]:
# Turn the {(model, llm, pca, max_iters, clusters, tol): score} mapping into a
# one-column frame whose tuple keys become a named MultiIndex.
kmeans_index_names = ['EmbeddingsModel', 'LLM', 'PCA', 'MAX_ITERS', 'CLUSTERS', 'TOL']
kmeans_df = pd.DataFrame.from_dict(
    kmeans_overall_results,
    orient='index',
    columns=['Score'],
)
kmeans_df.index = pd.MultiIndex.from_tuples(kmeans_df.index, names=kmeans_index_names)

# Render the table in the notebook
display(kmeans_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Score
EmbeddingsModel,LLM,PCA,MAX_ITERS,CLUSTERS,TOL,Unnamed: 6_level_1
glove,bloom,16,100,3,0.0001,0.757382
glove,alpaca_3b,16,100,2,0.0001,0.144175
glove,alpaca_770m,64,100,2,0.0001,0.463143
glove,llama_13b,64,100,2,0.0001,0.131586
glove,gpt4all,32,100,2,0.0001,0.130505
glove,llama_7b,16,100,2,0.0001,0.362397
mpnet,bloom,16,100,2,0.0001,0.779275
mpnet,alpaca_3b,16,100,2,0.0001,0.206025
mpnet,alpaca_770m,32,100,3,0.00505,0.170061
mpnet,llama_13b,64,100,3,0.0001,0.115189


## New Testing

In [1]:
import config

import logging
import numpy as np
import pandas as pd


# Logger for this notebook's own messages; they appear under the name
# '__main__' in the log output of the cells that use it.
logger = logging.getLogger(__name__)

2023-10-03 23:06:07,702 - INFO     | config     | Loading environment variables


In [2]:
from models.clustering import ClusteringModel
from helpers.clustering_helpers import dbscan_loop, kmeans_loop, get_best_scores

import pickle
# Load the pickled embeddings mapping (iterated later as llm -> embeddings).
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
with open('embeddings/2000T_embeddings_202375.pkl', 'rb') as f:
    embeddings_dict = pickle.load(f)

# Read the full dataset and drop the cluster-label columns that the cells
# below assign again.
df = pd.read_parquet('full_data_202375.parquet')
stale_cluster_columns = [
    'gpt-3.5-turbo_kmeans',
    'alpaca_kmeans',
    'gpt4all_kmeans',
    'gpt-3.5-turbo_dbscan',
    'alpaca_dbscan',
]
df_new = df.drop(columns=stale_cluster_columns)

# KMeans

In [3]:
# kmeans
# For each LLM, run the KMeans search, reduce its results with get_best_scores,
# and keep the untouched embeddings next to the chosen settings under the
# 'original_emb' key.

best_results_kmeans = {}
for llm, embeddings in embeddings_dict.items():
    logger.info(llm.upper())
    results = kmeans_loop(
        data=embeddings,
        n_components_space=[None, 0.80, 0.90, 0.95, 0.99],
        n_clusters_space=list(np.linspace(1, 10, 10, dtype='int')),
        max_iter_space=[250, 500],
    )

    best_for_llm = get_best_scores(results=results, model_name='kmeans')
    best_results_kmeans[llm] = best_for_llm
    best_for_llm['original_emb'] = embeddings

2023-10-03 23:06:08,455 - INFO     | __main__   | GPT-3.5-TURBO


KeyboardInterrupt: 

In [None]:
# Show the best KMeans settings as a table, one row per LLM key.
pd.DataFrame.from_dict(best_results_kmeans, orient='index')

Unnamed: 0,score,n_components,actual_components,max_iter,n_clusters,original_emb
gpt-3.5-turbo,0.1199,0.9,142,250,2,"[[-0.03887094, -0.11050342, -0.031814046, 0.02..."
alpaca,0.114074,0.99,375,250,3,"[[0.011134546, -0.08676946, -0.03891137, 0.060..."
gpt4all,0.155368,,768,250,2,"[[-0.02282622, -0.08779901, -0.021268817, -0.0..."


In [None]:
# Fit KMeans once more per LLM using the stored best settings, collecting the
# cluster assignments (in iteration order) and the pair
# (original embeddings, embeddings exposed by the fitted model).
final_kmeans_emb, kmeans_clusters = {}, []
for llm, params in best_results_kmeans.items():
    logger.info(llm.upper())
    model_kwargs = {
        'model_name': 'kmeans',
        'n_init': 'auto',
        'max_iter': params['max_iter'],
        'n_clusters': params['n_clusters'],
    }
    kmeans = ClusteringModel(**model_kwargs)
    kmeans.fit_predict(
        embeddings=params['original_emb'],
        pca_flag=True,
        n_components=params['n_components'],
    )
    kmeans_clusters.append(kmeans.clusters)

    final_kmeans_emb[llm] = (params['original_emb'], kmeans.embeddings)

2023-10-03 22:36:09,430 - INFO     | __main__   | GPT-3.5-TURBO
2023-10-03 22:36:10,054 - INFO     | __main__   | ALPACA
2023-10-03 22:36:11,121 - INFO     | __main__   | GPT4ALL


In [None]:
# Attach each LLM's KMeans labels to df_new as '<llm>_kmeans'.
# kmeans_clusters was appended to while iterating best_results_kmeans, so the
# two are aligned position by position. Pairing them by key avoids hard-coded
# indices that would silently mislabel columns if the set or order of LLMs
# changed.
assert len(kmeans_clusters) == len(best_results_kmeans), \
    'kmeans_clusters is out of sync with best_results_kmeans; re-run the fitting cell'
for llm, clusters in zip(best_results_kmeans, kmeans_clusters):
    df_new[f'{llm}_kmeans'] = clusters

# DBSCAN

In [None]:
# dbscan
# For each LLM, run the DBSCAN search, reduce its results with get_best_scores,
# and keep the untouched embeddings next to the chosen settings under the
# 'original_emb' key.

best_results_dbscan = {}
for llm, embeddings in embeddings_dict.items():
    logger.info(llm.upper())
    results = dbscan_loop(
        data=embeddings,
        n_components_space=[None, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99],
        eps_space=[0.5, 0.6, 0.7, 0.8, 0.9],
        min_samples_space=[5, 25, 50, 75, 100],
    )

    best_for_llm = get_best_scores(results=results, model_name='dbscan')
    best_results_dbscan[llm] = best_for_llm
    best_for_llm['original_emb'] = embeddings

2023-10-03 22:53:34,964 - INFO     | __main__   | GPT-3.5-TURBO
2023-10-03 22:53:35,967 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:54:59,996 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:55:21,684 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:55:45,456 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:56:08,768 - INFO     | helpers.clustering_helpers | Best Score: 0.09637953341007233
2023-10-03 22:56:08,779 - INFO     | __main__   | ALPACA
2023-10-03 22:56:09,773 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:56:10,766 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:57:26,684 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:57:53,780 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:58:19,847 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 22:58:20,875 - INFO     | helpers.clustering_helpers | Found them
2023-10-03 

In [None]:
# Show the best DBSCAN settings as a table, one row per LLM key.
pd.DataFrame.from_dict(best_results_dbscan, orient='index')

Unnamed: 0,score,n_components,actual_components,min_samples,eps,original_emb
gpt-3.5-turbo,0.09638,0.9,142.0,5.0,0.5,"[[-0.03887094, -0.11050342, -0.031814046, 0.02..."
alpaca,0.149651,0.95,203.0,5.0,0.6,"[[0.011134546, -0.08676946, -0.03891137, 0.060..."
gpt4all,,,,,,"[[-0.02282622, -0.08779901, -0.021268817, -0.0..."


In [None]:
# Fit DBSCAN once more per LLM using the stored best settings.
# Best-effort: an LLM whose fit fails (for example because the search stored
# no usable settings for it) is skipped with a warning instead of aborting.
# dbscan_clusters and final_dbscan_emb only receive entries for LLMs that
# succeeded, and always both together, so they stay aligned.
final_dbscan_emb = {}
dbscan_clusters = []
for llm, params in best_results_dbscan.items():
    logger.info(llm.upper())
    try:
        dbscan = ClusteringModel(model_name='dbscan',
                                 eps=params['eps'],
                                 min_samples=params['min_samples'],
                                 metric='euclidean')
        dbscan.fit_predict(embeddings=params['original_emb'],
                           pca_flag=True,
                           n_components=params['n_components'])
        # Read both results before storing either, so a failure here cannot
        # leave the list one entry ahead of the dict.
        clusters, reduced_emb = dbscan.clusters, dbscan.embeddings
    except Exception as e:
        # Include the exception so the reason for the skip is visible; the
        # original dropped it and logged only a generic (misspelled) message.
        logger.warning(f'No Clusters Found for {llm.upper()}: {e!r}')
        continue

    dbscan_clusters.append(clusters)
    final_dbscan_emb[llm] = (params['original_emb'], reduced_emb)

2023-10-03 23:05:08,142 - INFO     | __main__   | GPT-3.5-TURBO
2023-10-03 23:05:08,708 - INFO     | __main__   | ALPACA
2023-10-03 23:05:09,507 - INFO     | __main__   | GPT4ALL


In [None]:
# Attach each LLM's DBSCAN labels to df_new as '<llm>_dbscan'.
# The fitting cell skips LLMs whose fit fails, so dbscan_clusters may hold
# fewer entries than there are LLMs. Pairing it with the keys of
# final_dbscan_emb (filled in the same loop, only for successful fits) keeps
# every label array under the right column name, where fixed indices would
# shift as soon as an earlier LLM was skipped.
assert len(dbscan_clusters) == len(final_dbscan_emb), \
    'dbscan_clusters is out of sync with final_dbscan_emb; re-run the fitting cell'
for llm, clusters in zip(final_dbscan_emb, dbscan_clusters):
    df_new[f'{llm}_dbscan'] = clusters

# Analysis

{'gpt-3.5-turbo': None}