In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan import HDBSCAN
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.embeddings import HuggingFaceEmbeddings

from finance.mongo.extraction import query_mongodb
from finance.params import *
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
EXCHANGE_LS = ['NASDAQ', 'NYSE', 'PNK']

exchange_ls = EXCHANGE_LS
table_name = 'company_profile'

In [6]:
result = query_mongodb(exchange_ls=exchange_ls, table=table_name)

exchange {} - 45600


In [7]:
full_df = pd.DataFrame(result)

In [8]:
full_df = full_df.drop_duplicates(subset='companyName')
full_df.reset_index(inplace=True)
full_df.description

0                                                     None
1        LAVA Medtech Acquisition Corp. does not have s...
2        Capital Product Partners L.P., a shipping comp...
3        Comtech Telecommunications Corp., together wit...
4        Vallon Pharmaceuticals, Inc., a biopharmaceuti...
                               ...                        
18210    Advanced Oxygen Technologies, Inc., through it...
18211    OPTiM Corporation provides various internet-ba...
18212    Hollywood Intermediate, Inc. engages in the de...
18213    OSAKA Titanium technologies Co.,Ltd. manufactu...
18214                                                 None
Name: description, Length: 18215, dtype: object

In [9]:
full_df.fillna(np.nan, inplace=True)
full_df.dropna(subset='description', inplace=True)

In [10]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [11]:
embedded_docs = embeddings.embed_documents(full_df.description.to_list())

In [12]:
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.00, metric='cosine')
reduced_text_embeddings = umap_model.fit_transform(embedded_docs)

### Cross validation

- ``` min_cluster_size ``` - smalles size grouping to be considered as a cluster
- ``` min_samples ``` - larger - more points considered as noise as clusters more restriced to progressively more dense areas
- ```cluster_selection_epsilon ``` - ensures clusters below given treshold are not split up further
- ``` alpha ``` - conservative clustering - best to leave

In [13]:
%env

TOKENIZERS_PARALLELISM = False

In [14]:
from itertools import product
from tqdm import tqdm

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def dbscan_gridsearch(data, min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls, n_neighbors_ls, n_components_ls, min_dist_ls, metric_ls, return_model = False):
    
    experiment_ls = []
    
    for n_neighbors, n_components, min_dist, metric in tqdm(list(product(n_neighbors_ls, n_components_ls, min_dist_ls, metric_ls))):
        
        umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist, metric=metric)
        
        reduced_text_embeddings = umap_model.fit_transform(data)
    
        for min_clust, min_sample, epsilon in tqdm(list(product(min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls))):
            
            umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist)
            
            reduced_text_embeddings = umap_model.fit_transform(data)
            
            hdbscan_model = HDBSCAN(
                min_cluster_size=min_clust,
                metric='euclidean',
                min_samples=min_sample,
                cluster_selection_epsilon=epsilon,
                prediction_data=False)
            
            text_cluster = hdbscan_model.fit(reduced_text_embeddings)
            
            unique, counts = np.unique(text_cluster.labels_, return_counts=True)
                    
            silhouette = silhouette_score(reduced_text_embeddings, text_cluster.labels_, metric='euclidean')
            calinksi = calinski_harabasz_score(reduced_text_embeddings, text_cluster.labels_)
            davies = davies_bouldin_score(reduced_text_embeddings, text_cluster.labels_)
            percentage_outlier = counts[0]/np.sum(counts)
            
            run_dict = {
                'min_cluster_size':min_clust,
                'min_samples':min_sample,
                'epsilon':epsilon,
                'n_neighbors':n_neighbors,
                'n_components':n_components,
                'min_dist':min_dist,
                'silhouette_score': silhouette,
                'calinski': calinksi,
                'davies': davies,
                'percentage_outliers': percentage_outlier,
                'count_classes' : len(unique)
            }
            
            experiment_ls.append(run_dict)
    
    if return_model:
        return pd.DataFrame(experiment_ls), hdbscan_model, umap_model
            
    return pd.DataFrame(experiment_ls)
        
        
        
    


In [15]:
min_cluster_size = [15, 20, 40, 60, 80, 100]
min_samples = [10, 20, 30, 40]
cluster_selection_epsilon = [0.1, 0.25, 0.5]
n_neighbors = [10,20,30]
n_components = [3, 7, 10, 12]
min_dist = [0.00, 0.01, 0.05]
metric = ['cosine']

In [16]:
df = dbscan_gridsearch(embedded_docs, min_cluster_size, min_samples, cluster_selection_epsilon, n_neighbors, n_components, min_dist, metric)

df

  0%|          | 0/36 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes
0,15,10,0.10,10,3,0.00,0.049491,400.630412,1.545988,0.332570,127
1,15,10,0.25,10,3,0.00,-0.504820,182.819172,2.449719,0.051828,21
2,15,10,0.50,10,3,0.00,-0.298727,573.054370,1.356806,0.001937,7
3,15,20,0.10,10,3,0.00,0.138067,954.538029,1.259733,0.312144,96
4,15,20,0.25,10,3,0.00,0.040537,4525.504904,1.569193,0.106063,29
...,...,...,...,...,...,...,...,...,...,...,...
2587,100,30,0.25,30,12,0.05,0.177740,2646.341305,0.633235,0.004109,3
2588,100,30,0.50,30,12,0.05,0.028471,586.020196,3.920333,0.086459,3
2589,100,40,0.10,30,12,0.05,0.147643,1708.390654,1.198839,0.354053,19
2590,100,40,0.25,30,12,0.05,0.095906,1566.145735,1.129621,0.398427,21


In [17]:
df.to_csv('experiments.csv')

In [18]:
df = df.sort_values('silhouette_score')
df

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes
1,15,10,0.25,10,3,0.00,-0.504820,182.819172,2.449719,0.051828,21
145,15,10,0.25,10,3,0.05,-0.503764,222.363750,2.363410,0.060574,17
157,20,10,0.25,10,3,0.05,-0.498931,231.132854,2.681743,0.056994,15
865,15,10,0.25,20,3,0.00,-0.410324,497.244592,3.334561,0.023478,10
937,15,10,0.25,20,3,0.01,-0.409550,495.006256,4.870614,0.042026,10
...,...,...,...,...,...,...,...,...,...,...,...
2126,60,10,0.50,30,7,0.05,0.835428,4247.866360,0.122074,0.004109,2
2045,40,20,0.50,30,7,0.01,0.835533,4209.106294,0.121006,0.004109,2
1918,60,40,0.25,30,3,0.05,0.835634,4196.085377,0.121429,0.004109,2
2099,15,40,0.50,30,7,0.05,0.838740,4406.504671,0.119961,0.004109,2


In [32]:
df['rank_silhouette'] = df.silhouette_score.rank(ascending=False, method='dense')
df['rank_calinksi'] = df.calinski.rank(ascending=False, method='dense')
df['rank_davies'] = df.davies.rank(ascending=True, method='dense')
df['average_rank'] = (df.rank_silhouette + df.rank_calinksi + df.rank_davies)/3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank_silhouette'] = df.silhouette_score.rank(ascending=False, method='dense')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank_calinksi'] = df.calinski.rank(ascending=False, method='dense')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rank_davies'] = df.davies.rank(ascending=True, 

In [33]:
df = df[df['count_classes']>2]

In [38]:
df.sort_values('average_rank', ascending=True).head(30)

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes,rank_silhouette,rank_calinksi,rank_davies,average_rank
1814,20,10,0.5,30,3,0.01,0.412481,6304.876207,0.416856,0.004109,3,3.0,22.0,1.0,8.666667
1269,60,40,0.1,20,7,0.05,0.426157,5578.654323,0.428131,0.004109,3,1.0,52.0,2.0,18.333333
899,40,40,0.5,20,3,0.0,0.268742,5045.068652,0.515214,0.004109,4,27.0,78.0,3.0,36.0
1831,40,30,0.25,30,3,0.01,0.282166,8164.769441,1.065806,0.201679,16,18.0,4.0,117.0,46.333333
1798,100,40,0.25,30,3,0.0,0.296254,4562.14521,0.749879,0.006398,4,9.0,106.0,53.0,56.0
1846,60,40,0.25,30,3,0.01,0.245747,7181.037926,1.067895,0.242296,18,65.0,9.0,119.0,64.333333
1774,60,40,0.25,30,3,0.0,0.259937,9242.398431,1.099462,0.213535,15,40.0,2.0,191.0,77.666667
1843,60,30,0.25,30,3,0.01,0.245843,8760.603991,1.104887,0.186829,13,64.0,3.0,212.0,93.0
1120,60,20,0.25,20,7,0.0,0.210151,6806.402409,1.074001,0.242648,20,176.0,13.0,132.0,107.0
256,60,20,0.25,10,7,0.0,0.255531,6109.740871,1.114665,0.200857,25,48.0,29.0,256.0,111.0


In [134]:
min_cluster_size = [60]
min_samples = [40]
cluster_selection_epsilon = [0.25]
n_neighbors = [30]
n_components = [3]
min_dist = [0.01]
metric = ['cosine']

In [135]:
df2, hbd_scan2, umap2 = dbscan_gridsearch(embedded_docs, min_cluster_size, min_samples, cluster_selection_epsilon, n_neighbors, n_components, min_dist, metric, return_model=True)


  0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling pa

In [136]:
df2

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes
0,60,40,0.25,30,3,0.01,0.260498,8186.122646,1.121321,0.217996,16


In [137]:
test_df = full_df.copy()

test_df['cluster'] = hbd_scan2.labels_

In [138]:
test_df['cluster'].value_counts()

cluster
-1     3714
 7     2327
 14    2034
 3     1844
 6     1429
 10    1371
 13    1228
 11    1161
 1      919
 5      253
 12     216
 4      161
 9      158
 2       84
 0       70
 8       68
Name: count, dtype: int64

In [171]:
test_df[test_df['companyName'].str.contains('Zoetis')]

Unnamed: 0,index,_id,symbol,price,beta,volAvg,mktCap,lastDiv,range,changes,...,dcfDiff,dcf,image,ipoDate,defaultImage,isEtf,isActivelyTrading,isAdr,isFund,cluster
5083,7969,657df3b8b2cffe5b07e27573,ZTS,196.29,0.844,1867709,90119490000.0,1.73,140.76-201.92,-3.8,...,68.69877,127.591228,https://financialmodelingprep.com/image-stock/...,2013-02-01,False,False,True,False,False,14


In [166]:
test_df[test_df['cluster'] == 14][['companyName', 'symbol', 'cluster', 'description']]

Unnamed: 0,companyName,symbol,cluster,description
4,"Vallon Pharmaceuticals, Inc.",VLON,14,"Vallon Pharmaceuticals, Inc., a biopharmaceuti..."
10,Allurion Technologies Inc.,ALUR,14,Allurion Technologies Inc. focuses on ending o...
13,POINT Biopharma Global Inc.,PNT,14,"POINT Biopharma Global Inc., a radiopharmaceut..."
14,Aptorum Group Limited,APM,14,"Aptorum Group Limited, a biopharmaceutical com..."
18,"Reviva Pharmaceuticals Holdings, Inc.",RVPH,14,"Reviva Pharmaceuticals Holdings, Inc., a clini..."
...,...,...,...,...
18180,FluroTech Ltd.,FLURF,14,"FluroTech Ltd., a technology and marketing com..."
18183,"Springs Rejuvenation, Inc.",SPRJ,14,"Springs Rejuvenation, LLC offers aging and ste..."
18187,ImmuPharma plc,IMMPF,14,"ImmuPharma plc, a biopharmaceutical company, d..."
18204,Venus Medtech (Hangzhou) Inc.,VMTHF,14,Venus Medtech (Hangzhou) Inc. develops and com...


In [173]:
test_df.columns

Index(['index', '_id', 'symbol', 'price', 'beta', 'volAvg', 'mktCap',
       'lastDiv', 'range', 'changes', 'companyName', 'currency', 'cik', 'isin',
       'cusip', 'exchange', 'exchangeShortName', 'industry', 'website',
       'description', 'ceo', 'sector', 'country', 'fullTimeEmployees', 'phone',
       'address', 'city', 'state', 'zip', 'dcfDiff', 'dcf', 'image', 'ipoDate',
       'defaultImage', 'isEtf', 'isActivelyTrading', 'isAdr', 'isFund',
       'cluster'],
      dtype='object')

In [176]:
df_ag = test_df[['symbol', 'companyName','description', 'cluster']]
df_ag['embeddings'] = embedded_docs
df_ag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ag['embeddings'] = embedded_docs


Unnamed: 0,symbol,companyName,description,cluster,embeddings
1,LVAC,LAVA Medtech Acquisition Corp.,LAVA Medtech Acquisition Corp. does not have s...,1,"[0.059244390577077866, -0.03414931520819664, 0..."
2,CPLP,Capital Product Partners L.P.,"Capital Product Partners L.P., a shipping comp...",9,"[0.0026629117783159018, -0.09011717885732651, ..."
3,CMTL,Comtech Telecommunications Corp.,"Comtech Telecommunications Corp., together wit...",7,"[-0.08909572660923004, -0.00865918304771185, -..."
4,VLON,"Vallon Pharmaceuticals, Inc.","Vallon Pharmaceuticals, Inc., a biopharmaceuti...",14,"[-0.0034944824874401093, -0.0338095985352993, ..."
5,SOAC-UN,Sustainable Opportunities Acquisition Corp.,Sustainable Opportunities Acquisition Corp. do...,1,"[0.042223840951919556, 0.04522120952606201, 0...."
...,...,...,...,...,...
18209,NXTH,"NXT Nutritionals Holdings, Inc.","NXT Nutritionals Holdings, Inc., through its s...",13,"[-0.04242871701717377, -0.07263299822807312, 0..."
18210,AOXY,"Advanced Oxygen Technologies, Inc.","Advanced Oxygen Technologies, Inc., through it...",-1,"[-0.07740608602762222, 0.0631069764494896, -0...."
18211,OTPMF,OPTiM Corporation,OPTiM Corporation provides various internet-ba...,7,"[-0.0986807718873024, -0.00940666999667883, 0...."
18212,HYWI,"Hollywood Intermediate, Inc.","Hollywood Intermediate, Inc. engages in the de...",7,"[0.00044148784945718944, -0.1268903911113739, ..."


In [179]:
top_n = 40
diversity = 0.5
cluster_dict = {}

for cluster, df in df_ag.groupby('cluster'):
    if cluster == -1:
        continue

    # find the most representative documents
    candidate_d = cosine_similarity(df['embeddings'].tolist(), df['embeddings'].tolist())
    candidate_d_sum = candidate_d.sum(axis=1)
    doc_list = [np.argmax(candidate_d.sum(axis=1))]
    candidates_idx = [i for i in range(len(df)) if i != doc_list[0]]

    # filter based on maximal marginal relevance
    for _ in range(top_n - 1):
        candidate_similarities = candidate_d.sum(axis=1)[candidates_idx]
        target_similarities = np.max(candidate_d[candidates_idx][:, doc_list], axis=1)

        # Calculate MMR
        mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities
        # Update keywords & candidates
        mmr_idx = candidates_idx[np.argmax(mmr)]
        doc_list.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    cluster_dict[cluster] = {'doc': [df['description'].tolist()[idx] for idx in doc_list]}

In [180]:
cluster_dict

{0: {'doc': ['',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '']},
 1: {'doc': ['5:01 Acquisition Corp. does not have significant operations. It intends to effect a merger, share exchange, asset acquisition, stock purchase, recapitalization, reorganization, or other similar business combination with one or more businesses. The company was incorporated in 2020 and is based in San Francisco, California.',
   'EG Acquisition Corp., a blank check company, intends to effect a merger, capital stock exchange, asset acquisition, stock purchase, reorganization, or related business combination with one or more businesses. The company was incorporated in 2021 and is based in New York, New York.',
   'one intends to effect a merger, share exchange, asset acquisition, share pur

In [182]:
local_path = (
    "../llm_models/gpt4all-falcon-q4_0.gguf" 
)

# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

# Verbose is required to pass to the callback manager
llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)

In [186]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# delimiter = "####"
template = """\
    You will be provided with multiple documents that form the same cluster. \
    The documents will be delimited with ### characters. \
    Your task is to define a topic title that is a good representation of all the listed documents, give a small summary in 20 words. \
    Documents: {documents}
    
    """

# template = """\
#     You will be provided with multiple documents that form the same cluster. \
#     The documents will be delimited with ### characters. \
#     Your task is to define a topic title that is well representing all the listed documents.\
#     Output starts with 'Title: '
#     ###{documents}###
#     """

prompt = PromptTemplate(
input_variables=["documents"], template=template)

In [184]:
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [187]:
documents = "\n".join([f"### {text}" for c, text in enumerate(cluster_dict[cluster]['doc'])])

In [188]:
output = llm_chain.run(documents)
output

ERROR: The prompt size exceeds the context window size and cannot be processed.

LLaMA ERROR: The prompt is 10271 tokens and the context window is 2048!


'ERROR: The prompt size exceeds the context window size and cannot be processed.'