In [198]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan import HDBSCAN
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.embeddings import HuggingFaceEmbeddings

from finance.mongo.extraction import query_mongodb
from finance.params import *
import numpy as np
import pandas as pd

In [199]:
EXCHANGE_LS = ['NASDAQ', 'NYSE', 'PNK']

exchange_ls = EXCHANGE_LS
table_name = 'company_profile'

In [200]:
result = query_mongodb(exchange_ls=exchange_ls, table=table_name)

exchange {} - 45600


In [202]:
full_df = pd.DataFrame(result)

In [203]:
full_df = full_df.drop_duplicates(subset='companyName')
full_df.reset_index(inplace=True)
full_df.description

0                                                     None
1        LAVA Medtech Acquisition Corp. does not have s...
2        Capital Product Partners L.P., a shipping comp...
3        Comtech Telecommunications Corp., together wit...
4        Vallon Pharmaceuticals, Inc., a biopharmaceuti...
                               ...                        
18210    Advanced Oxygen Technologies, Inc., through it...
18211    OPTiM Corporation provides various internet-ba...
18212    Hollywood Intermediate, Inc. engages in the de...
18213    OSAKA Titanium technologies Co.,Ltd. manufactu...
18214                                                 None
Name: description, Length: 18215, dtype: object

In [204]:
full_df.fillna(np.nan, inplace=True)
full_df.dropna(subset='description', inplace=True)

In [205]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [206]:
embedded_docs = embeddings.embed_documents(full_df.description.to_list())

In [207]:
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.00, metric='cosine')
reduced_text_embeddings = umap_model.fit_transform(embedded_docs)

### Cross validation

- ``` min_cluster_size ``` - smalles size grouping to be considered as a cluster
- ``` min_samples ``` - larger - more points considered as noise as clusters more restriced to progressively more dense areas
- ```cluster_selection_epsilon ``` - ensures clusters below given treshold are not split up further
- ``` alpha ``` - conservative clustering - best to leave

In [110]:
%env

TOKENIZERS_PARALLELISM = False

In [262]:
from itertools import product
from tqdm import tqdm

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def dbscan_gridsearch(data, min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls, n_neighbors_ls, n_components_ls, min_dist_ls, return_model = False):
    
    experiment_ls = []
    
    for n_neighbors, n_components, min_dist in tqdm(list(product(n_neighbors_ls, n_components_ls, min_dist_ls))):
        
        umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist)
        
        reduced_text_embeddings = umap_model.fit_transform(data)
    
        for min_clust, min_sample, epsilon in tqdm(list(product(min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls))):
            
            umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist)
            
            reduced_text_embeddings = umap_model.fit_transform(data)
            
            hdbscan_model = HDBSCAN(
                min_cluster_size=min_clust,
                metric='euclidean',
                min_samples=min_sample,
                cluster_selection_epsilon=epsilon,
                prediction_data=False)
            
            text_cluster = hdbscan_model.fit(reduced_text_embeddings)
            
            unique, counts = np.unique(text_cluster.labels_, return_counts=True)
                    
            silhouette = silhouette_score(reduced_text_embeddings, text_cluster.labels_, metric='euclidean')
            calinksi = calinski_harabasz_score(reduced_text_embeddings, text_cluster.labels_)
            davies = davies_bouldin_score(reduced_text_embeddings, text_cluster.labels_)
            percentage_outlier = counts[0]/np.sum(counts)
            
            run_dict = {
                'min_cluster_size':min_clust,
                'min_samples':min_sample,
                'epsilon':epsilon,
                'n_neighbors':n_neighbors,
                'n_components':n_components,
                'min_dist':min_dist,
                'silhouette_score': silhouette,
                'calinski': calinksi,
                'davies': davies,
                'percentage_outliers': percentage_outlier,
                'count_classes' : len(unique)
            }
            
            experiment_ls.append(run_dict)
    
    if return_model:
        return pd.DataFrame(experiment_ls), hdbscan_model
            
    return pd.DataFrame(experiment_ls)
        
        
        
    


In [263]:
min_cluster_size = [5, 10, 15, 20]
min_samples = [5, 10, 20, 30]
cluster_selection_epsilon = [0.25]
n_neighbors = [5,10,15,20,30]
n_components = [3, 5, 7, 10, 12]
min_dist = [0.00, 0.01, 0.05, 0.1]

In [265]:
df = dbscan_gridsearch(reduced_text_embeddings, min_cluster_size, min_samples, cluster_selection_epsilon, n_neighbors, n_components, min_dist)

df

100%|██████████| 16/16 [02:10<00:00,  8.16s/it]
100%|██████████| 16/16 [02:14<00:00,  8.40s/it]it]
100%|██████████| 16/16 [02:15<00:00,  8.44s/it]it]
100%|██████████| 16/16 [02:08<00:00,  8.01s/it]it]
100%|██████████| 16/16 [02:04<00:00,  7.75s/it]it]
100%|██████████| 16/16 [02:01<00:00,  7.58s/it]it]
100%|██████████| 16/16 [02:09<00:00,  8.08s/it]it]
100%|██████████| 16/16 [01:56<00:00,  7.25s/it]it]
100%|██████████| 16/16 [01:47<00:00,  6.69s/it]it]
100%|██████████| 16/16 [01:57<00:00,  7.34s/it]it]
100%|██████████| 16/16 [02:14<00:00,  8.39s/it]/it]
100%|██████████| 16/16 [01:54<00:00,  7.16s/it]/it]
100%|██████████| 16/16 [01:59<00:00,  7.45s/it]/it]
100%|██████████| 16/16 [01:58<00:00,  7.39s/it]/it]
100%|██████████| 16/16 [02:26<00:00,  9.15s/it]/it]
100%|██████████| 16/16 [03:05<00:00, 11.61s/it]/it]
100%|██████████| 16/16 [03:02<00:00, 11.38s/it]/it]
100%|██████████| 16/16 [02:41<00:00, 10.06s/it]/it]
100%|██████████| 16/16 [02:28<00:00,  9.27s/it]/it]
100%|██████████| 16/16 [0

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes
0,5,5,0.25,5,3,0.0,0.490172,5111.191200,0.905254,0.004872,288
1,5,10,0.25,5,3,0.0,0.528314,1813.256948,1.080259,0.032752,247
2,5,20,0.25,5,3,0.0,0.465434,533.136685,1.170179,0.124435,216
3,5,30,0.25,5,3,0.0,0.405478,681.168183,1.287599,0.138111,143
4,10,5,0.25,5,3,0.0,0.544184,2624.642764,1.258665,0.014733,232
...,...,...,...,...,...,...,...,...,...,...,...
1595,15,30,0.25,30,12,0.1,0.332087,981.245436,1.061740,0.185185,107
1596,20,5,0.25,30,12,0.1,0.274712,4799.253497,1.132082,0.033867,62
1597,20,10,0.25,30,12,0.1,0.364589,3736.226741,1.092751,0.059107,81
1598,20,20,0.25,30,12,0.1,0.326781,1189.315025,1.059243,0.145213,110


In [266]:
df = df.sort_values('silhouette_score')
df

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes
1392,5,5,0.25,30,5,0.10,0.063069,2853.577915,1.224884,0.018020,91
1072,5,5,0.25,20,5,0.10,0.075692,3743.952719,1.130424,0.021189,110
1456,5,5,0.25,30,7,0.10,0.099041,4639.074018,1.095757,0.021013,87
1076,10,5,0.25,20,5,0.10,0.106659,3821.511776,1.220198,0.024417,96
992,5,5,0.25,20,3,0.05,0.106736,6836.667659,1.089330,0.009920,99
...,...,...,...,...,...,...,...,...,...,...,...
13,20,10,0.25,5,3,0.00,0.566101,1695.469576,1.441878,0.044491,183
269,20,10,0.25,5,12,0.00,0.568994,1009.340620,1.166116,0.069731,164
275,5,30,0.25,5,12,0.01,0.570750,903.847141,1.207784,0.099196,108
206,20,20,0.25,5,10,0.00,0.587658,1021.073634,1.160792,0.070787,141


In [276]:
df.sort_values('average_rank')

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes,rank_silhouette,rank_calinksi,rank_davies,average_rank
336,5,5,0.25,10,3,0.01,0.495191,6485.753720,0.915073,0.002641,139,53.0,170.0,7.0,76.666667
384,5,5,0.25,10,5,0.00,0.452243,6896.698138,0.983678,0.003639,138,141.0,142.0,19.0,100.666667
0,5,5,0.25,5,3,0.00,0.490172,5111.191200,0.905254,0.004872,288,59.0,270.0,5.0,111.333333
80,5,5,0.25,5,5,0.01,0.498380,3710.607419,0.910126,0.006222,272,46.0,441.0,6.0,164.333333
334,20,20,0.25,10,3,0.00,0.534935,3420.580475,0.993634,0.058285,118,17.0,477.0,23.0,172.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,15,30,0.25,5,3,0.10,0.275587,861.236205,1.288049,0.147092,102,1265.0,1377.0,1474.0,1372.000000
439,10,30,0.25,10,5,0.10,0.286159,736.538465,1.294522,0.198157,111,1213.0,1454.0,1481.0,1382.666667
690,5,20,0.25,15,3,0.10,0.256100,856.983742,1.253033,0.160357,136,1362.0,1381.0,1416.0,1386.333333
55,10,30,0.25,5,3,0.10,0.265043,850.410942,1.293567,0.157070,93,1322.0,1387.0,1478.0,1395.666667


In [267]:
df['rank_silhouette'] = df.silhouette_score.rank(ascending=False, method='dense')
df['rank_calinksi'] = df.calinski.rank(ascending=False, method='dense')
df['rank_davies'] = df.davies.rank(ascending=True, method='dense')
df['average_rank'] = (df.rank_silhouette + df.rank_calinksi + df.rank_davies)/3

In [268]:
df = df[df['count_classes']>2]

In [269]:
df.sort_values('average_rank', ascending=True).head(30)

Unnamed: 0,min_cluster_size,min_samples,epsilon,n_neighbors,n_components,min_dist,silhouette_score,calinski,davies,percentage_outliers,count_classes,rank_silhouette,rank_calinksi,rank_davies,average_rank
336,5,5,0.25,10,3,0.01,0.495191,6485.75372,0.915073,0.002641,139,53.0,170.0,7.0,76.666667
384,5,5,0.25,10,5,0.0,0.452243,6896.698138,0.983678,0.003639,138,141.0,142.0,19.0,100.666667
0,5,5,0.25,5,3,0.0,0.490172,5111.1912,0.905254,0.004872,288,59.0,270.0,5.0,111.333333
80,5,5,0.25,5,5,0.01,0.49838,3710.607419,0.910126,0.006222,272,46.0,441.0,6.0,164.333333
334,20,20,0.25,10,3,0.0,0.534935,3420.580475,0.993634,0.058285,118,17.0,477.0,23.0,172.333333
224,5,5,0.25,5,10,0.05,0.543779,3000.119879,0.888725,0.010448,291,12.0,543.0,3.0,186.0
320,5,5,0.25,10,3,0.0,0.387299,7661.874602,0.886637,0.002583,140,503.0,95.0,2.0,200.0
321,5,10,0.25,10,3,0.0,0.391875,5849.244315,0.954104,0.012972,121,477.0,213.0,12.0,234.0
288,5,5,0.25,5,12,0.05,0.457713,2370.215872,0.890187,0.011152,285,129.0,674.0,4.0,269.0
1538,5,20,0.25,30,12,0.0,0.436353,4160.407445,1.048336,0.053354,85,222.0,383.0,242.0,282.333333


In [279]:
df.iloc[336]

min_cluster_size         15.000000
min_samples               5.000000
epsilon                   0.250000
n_neighbors              30.000000
n_components             10.000000
min_dist                  0.000000
silhouette_score          0.275686
calinski               9615.409420
davies                    1.231428
percentage_outliers       0.008335
count_classes            67.000000
rank_silhouette        1264.000000
rank_calinksi            27.000000
rank_davies            1364.000000
average_rank            885.000000
Name: 1480, dtype: float64

### Full model

In [208]:
hdbscan_model_first = HDBSCAN(
    min_cluster_size=60,
    metric='euclidean',
    min_samples=30,
    cluster_selection_epsilon=0.25,
    prediction_data=False)

text_cluster = hdbscan_model_first.fit(reduced_text_embeddings)
full_df['cluster'] = text_cluster.labels_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [253]:
full_df[full_df.companyName.str.contains('Broadcom')]

Unnamed: 0,index,_id,symbol,price,beta,volAvg,mktCap,lastDiv,range,changes,...,dcfDiff,dcf,image,ipoDate,defaultImage,isEtf,isActivelyTrading,isAdr,isFund,cluster
958,1039,657dcf9ab2cffe5b07e22a27,AVGOP,1406.7,1.109868,0,0.0,180.0,1387.76-2099.51,-37.3201,...,30.3316,1859.56,https://financialmodelingprep.com/image-stock/...,2019-09-25,True,False,False,False,False,-1


In [228]:
full_df.cluster.value_counts()

cluster
-1     4324
 15    2070
 2     1896
 14    1443
 11    1399
 13    1121
 1      942
 17     840
 19     699
 20     407
 18     406
 12     314
 10     219
 6      170
 8      149
 5      139
 9       97
 4       90
 7       90
 3       86
 0       70
 16      66
Name: count, dtype: int64

In [230]:
full_df[full_df['cluster']==16][['symbol', 'companyName', 'description']]

Unnamed: 0,symbol,companyName,description
529,NEPT,Neptune Wellness Solutions Inc.,Neptune Wellness Solutions Inc. operates as an...
1034,NUS,"Nu Skin Enterprises, Inc.","Nu Skin Enterprises, Inc. develops and distrib..."
1755,CYAN,Cyanotech Corporation,Cyanotech Corporation engages in the cultivati...
2214,VITL,"Vital Farms, Inc.","Vital Farms, Inc., an ethical food company, pr..."
2410,BON,Bon Natural Life Limited,"Bon Natural Life Limited, together with its su..."
...,...,...,...
17750,WLCOF,The Well Told Company Inc.,"The Well Told Company Inc., a wellness company..."
17754,USBC,"US BioTec, Inc.","US BioTec, Inc. engages in the development, ma..."
17868,FITX,"Creative Edge Nutrition, Inc.","Creative Edge Nutrition, Inc. develops and sel..."
17916,VYND,"Vynleads, Inc.","Vynleads, Inc. provides health and wellness in..."
