In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan import HDBSCAN
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.embeddings import HuggingFaceEmbeddings

from finance.mongo.extraction import query_mongodb
from finance.params import *
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
EXCHANGE_LS = ['NASDAQ', 'NYSE', 'PNK']

exchange_ls = EXCHANGE_LS
table_name = 'company_profile'

In [4]:
result = query_mongodb(exchange_ls=exchange_ls, table=table_name)

exchange {} - 45600


In [5]:
df = pd.DataFrame(result)

In [6]:
df = df.drop_duplicates(subset='companyName')
df.reset_index(inplace=True)
df.description

0                                                     None
1        LAVA Medtech Acquisition Corp. does not have s...
2        Capital Product Partners L.P., a shipping comp...
3        Comtech Telecommunications Corp., together wit...
4        Vallon Pharmaceuticals, Inc., a biopharmaceuti...
                               ...                        
18210    Advanced Oxygen Technologies, Inc., through it...
18211    OPTiM Corporation provides various internet-ba...
18212    Hollywood Intermediate, Inc. engages in the de...
18213    OSAKA Titanium technologies Co.,Ltd. manufactu...
18214                                                 None
Name: description, Length: 18215, dtype: object

In [7]:
df.fillna(np.nan, inplace=True)
df.dropna(subset='description', inplace=True)

In [8]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [9]:
embedded_docs = embeddings.embed_documents(df.description.to_list())

In [64]:
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.00, metric='cosine')
reduced_text_embeddings = umap_model.fit_transform(embedded_docs)

In [65]:

hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric='euclidean',
    min_samples=5,
    prediction_data=False)

text_cluster = hdbscan_model.fit(reduced_text_embeddings)
df['cluster'] = text_cluster.labels_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [66]:
text_cluster.labels_

array([ 3,  8, 37, ..., 37, 40, 32])

In [67]:
df.cluster.value_counts()

cluster
-1     5432
 38    1593
 33    1016
 23     956
 3      927
 37     614
 32     557
 16     445
 43     381
 11     353
 24     288
 21     282
 18     274
 42     238
 35     237
 29     210
 5      198
 34     198
 12     186
 40     185
 8      181
 28     171
 39     132
 19     131
 9      126
 30     123
 31     110
 27     108
 22     102
 26      97
 4       93
 44      87
 1       86
 17      83
 36      83
 15      82
 41      77
 25      77
 6       74
 2       71
 0       70
 20      63
 10      62
 13      62
 14      60
 7       56
Name: count, dtype: int64

In [79]:
df[df.companyName.str.contains('Tesla')]

Unnamed: 0,index,_id,symbol,price,beta,volAvg,mktCap,lastDiv,range,changes,...,dcfDiff,dcf,image,ipoDate,defaultImage,isEtf,isActivelyTrading,isAdr,isFund,cluster
1710,1949,657dd450b2cffe5b07e23403,TSLA,253.5,2.262,120709817,805856200000.0,0.0,101.81-299.29,2.45,...,3.73173,181.632,https://financialmodelingprep.com/image-stock/...,2010-06-29,False,False,True,False,False,-1
12280,25922,658310ce570a04d5526807e1,TXLZF,1e-06,44.126,0,248129.0,0.0,1.0E-6-1.0E-4,-0.0001,...,,0.0,https://financialmodelingprep.com/image-stock/...,2010-06-08,True,False,True,False,False,-1


In [53]:
df[df.cluster==42].companyName

4          Vallon Pharmaceuticals, Inc.
13          POINT Biopharma Global Inc.
14                Aptorum Group Limited
37       Bellicum Pharmaceuticals, Inc.
48                       HilleVax, Inc.
                      ...              
18107     JCR Pharmaceuticals Co., Ltd.
18118             Helix BioPharma Corp.
18165     Biostar Pharmaceuticals, Inc.
18180                    FluroTech Ltd.
18187                    ImmuPharma plc
Name: companyName, Length: 987, dtype: object

### Metrics

### Cross validation

- ``` min_cluster_size ``` - smalles size grouping to be considered as a cluster
- ``` min_samples ``` - larger - more points considered as noise as clusters more restriced to progressively more dense areas
- ```cluster_selection_epsilon ``` - ensures clusters below given treshold are not split up further
- ``` alpha ``` - conservative clustering - best to leave

In [110]:
%env

TOKENIZERS_PARALLELISM = False

In [111]:
from itertools import product
from tqdm import tqdm

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def dbscan_gridsearch(data, min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls):
    
    experiment_ls = []
    
    for min_clust, min_sample, epsilon in tqdm(list(product(min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls))):
        
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_clust,
            metric='euclidean',
            min_samples=min_sample,
            cluster_selection_epsilon=epsilon,
            prediction_data=False)
        
        text_cluster = hdbscan_model.fit(data)
        
        silhouette = silhouette_score(reduced_text_embeddings, text_cluster.labels_, metric='euclidean')
        calinksi = calinski_harabasz_score(reduced_text_embeddings, text_cluster.labels_)
        davies = davies_bouldin_score(reduced_text_embeddings, text_cluster.labels_)
        
        run_dict = {
            'min_cluster_size':min_clust,
            'min_samples':min_sample,
            'epsilon':epsilon,
            'silhouette_score': silhouette,
            'calinski': calinksi,
            'davies': davies
        }
        
        experiment_ls.append(run_dict)
        
    return pd.DataFrame(experiment_ls)
        
        
        
    


In [112]:
min_cluster_size = [20, 40, 60, 80, 100, 120]
min_samples = [5, 10, 20, 30]
cluster_selection_epsilon = [0.25, 0.5, 1, 2]

In [113]:
df = dbscan_gridsearch(reduced_text_embeddings, min_cluster_size, min_samples, cluster_selection_epsilon)

df

  0%|          | 0/96 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

Unnamed: 0,min_cluster_size,min_samples,epsilon,silhouette_score,calinski,davies
0,20,5,0.25,-0.380258,408.556347,2.456342
1,20,5,0.50,-0.266571,819.926711,3.046763
2,20,5,1.00,0.810630,3183.544670,0.135424
3,20,5,2.00,0.810630,3183.544670,0.135424
4,20,10,0.25,-0.460873,384.617946,2.338334
...,...,...,...,...,...,...
91,120,20,2.00,0.132954,682.677066,2.447728
92,120,30,0.25,0.196480,2314.313522,1.293146
93,120,30,0.50,0.119044,663.261651,2.568888
94,120,30,1.00,0.119044,663.261651,2.568888


In [117]:
df = df.sort_values('silhouette_score')
df

Unnamed: 0,min_cluster_size,min_samples,epsilon,silhouette_score,calinski,davies
4,20,10,0.25,-0.460873,384.617946,2.338334
0,20,5,0.25,-0.380258,408.556347,2.456342
16,40,5,0.25,-0.364953,526.235703,2.608840
20,40,10,0.25,-0.362908,527.926471,2.627801
48,80,5,0.25,-0.325171,209.853830,2.810212
...,...,...,...,...,...,...
19,40,5,2.00,0.810630,3183.544670,0.135424
18,40,5,1.00,0.810630,3183.544670,0.135424
15,20,30,2.00,0.810630,3183.544670,0.135424
46,60,30,1.00,0.810630,3183.544670,0.135424


In [151]:
df['rank_silhouette'] = df.silhouette_score.rank(ascending=False, method='dense')
df['rank_calinksi'] = df.calinski.rank(ascending=False, method='dense')
df['rank_davies'] = df.davies.rank(ascending=True, method='dense')
df['average_rank'] = (df.rank_silhouette + df.rank_calinksi + df.rank_davies)/3

In [158]:
df[df['average_rank'] == df['average_rank'].min()]

Unnamed: 0,min_cluster_size,min_samples,epsilon,silhouette_score,calinski,davies,rank_silhouette,rank_calinksi,rank_davies,average_rank
44,60,30,0.25,0.198197,4212.416461,1.198699,3.0,3.0,3.0,3.0
14,20,30,1.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
2,20,5,1.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
43,60,20,2.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
42,60,20,1.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
39,60,10,2.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
35,60,5,2.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
34,60,5,1.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
31,40,30,2.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0
30,40,30,1.0,0.81063,3183.54467,0.135424,1.0,7.0,1.0,3.0


In [153]:
df.davies

4     2.338334
0     2.456342
16    2.608840
20    2.627801
48    2.810212
        ...   
19    0.135424
18    0.135424
15    0.135424
46    0.135424
47    0.135424
Name: davies, Length: 96, dtype: float64