In [198]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan import HDBSCAN
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.embeddings import HuggingFaceEmbeddings

from finance.mongo.extraction import query_mongodb
from finance.params import *
import numpy as np
import pandas as pd

In [199]:
EXCHANGE_LS = ['NASDAQ', 'NYSE', 'PNK']

exchange_ls = EXCHANGE_LS
table_name = 'company_profile'

In [200]:
result = query_mongodb(exchange_ls=exchange_ls, table=table_name)

exchange {} - 45600


In [202]:
full_df = pd.DataFrame(result)

In [203]:
full_df = full_df.drop_duplicates(subset='companyName')
full_df.reset_index(inplace=True)
full_df.description

0                                                     None
1        LAVA Medtech Acquisition Corp. does not have s...
2        Capital Product Partners L.P., a shipping comp...
3        Comtech Telecommunications Corp., together wit...
4        Vallon Pharmaceuticals, Inc., a biopharmaceuti...
                               ...                        
18210    Advanced Oxygen Technologies, Inc., through it...
18211    OPTiM Corporation provides various internet-ba...
18212    Hollywood Intermediate, Inc. engages in the de...
18213    OSAKA Titanium technologies Co.,Ltd. manufactu...
18214                                                 None
Name: description, Length: 18215, dtype: object

In [204]:
full_df.fillna(np.nan, inplace=True)
full_df.dropna(subset='description', inplace=True)

In [205]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [206]:
embedded_docs = embeddings.embed_documents(full_df.description.to_list())

In [207]:
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.00, metric='cosine')
reduced_text_embeddings = umap_model.fit_transform(embedded_docs)

### Cross validation

- ``` min_cluster_size ``` - smalles size grouping to be considered as a cluster
- ``` min_samples ``` - larger - more points considered as noise as clusters more restriced to progressively more dense areas
- ```cluster_selection_epsilon ``` - ensures clusters below given treshold are not split up further
- ``` alpha ``` - conservative clustering - best to leave

In [110]:
%env

TOKENIZERS_PARALLELISM = False

In [262]:
from itertools import product
from tqdm import tqdm

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def dbscan_gridsearch(data, min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls, n_neighbors_ls, n_components_ls, min_dist_ls):
    
    experiment_ls = []
    
    for n_neighbors, n_components, min_dist in tqdm(list(product(n_neighbors_ls, n_components_ls, min_dist_ls))):
        
        umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist)
        
        reduced_text_embeddings = umap_model.fit_transform(data)
    
        for min_clust, min_sample, epsilon in tqdm(list(product(min_cluster_size_ls, min_samples_ls, cluster_selection_epsilon_ls))):
            
            umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist)
            
            reduced_text_embeddings = umap_model.fit_transform(data)
            
            hdbscan_model = HDBSCAN(
                min_cluster_size=min_clust,
                metric='euclidean',
                min_samples=min_sample,
                cluster_selection_epsilon=epsilon,
                prediction_data=False)
            
            text_cluster = hdbscan_model.fit(reduced_text_embeddings)
            
            unique, counts = np.unique(text_cluster.labels_, return_counts=True)
                    
            silhouette = silhouette_score(reduced_text_embeddings, text_cluster.labels_, metric='euclidean')
            calinksi = calinski_harabasz_score(reduced_text_embeddings, text_cluster.labels_)
            davies = davies_bouldin_score(reduced_text_embeddings, text_cluster.labels_)
            percentage_outlier = counts[0]/np.sum(counts)
            
            run_dict = {
                'min_cluster_size':min_clust,
                'min_samples':min_sample,
                'epsilon':epsilon,
                'n_neighbors':n_neighbors,
                'n_components':n_components,
                'min_dist':min_dist,
                'silhouette_score': silhouette,
                'calinski': calinksi,
                'davies': davies,
                'percentage_outliers': percentage_outlier,
                'count_classes' : len(unique)
            }
            
            experiment_ls.append(run_dict)
            
    return pd.DataFrame(experiment_ls)
        
        
        
    


In [263]:
min_cluster_size = [5, 10, 15, 20]
min_samples = [5, 10, 20, 30]
cluster_selection_epsilon = [0.25]
n_neighbors = [5,10,15,20,30]
n_components = [3, 5, 7, 10, 12]
min_dist = [0.00, 0.01, 0.05, 0.1]

In [265]:
df = dbscan_gridsearch(reduced_text_embeddings, min_cluster_size, min_samples, cluster_selection_epsilon, n_neighbors, n_components, min_dist)

df

100%|██████████| 16/16 [02:10<00:00,  8.16s/it]
100%|██████████| 16/16 [02:14<00:00,  8.40s/it]it]
100%|██████████| 16/16 [02:15<00:00,  8.44s/it]it]
100%|██████████| 16/16 [02:08<00:00,  8.01s/it]it]
100%|██████████| 16/16 [02:04<00:00,  7.75s/it]it]
100%|██████████| 16/16 [02:01<00:00,  7.58s/it]it]
100%|██████████| 16/16 [02:09<00:00,  8.08s/it]it]
100%|██████████| 16/16 [01:56<00:00,  7.25s/it]it]
100%|██████████| 16/16 [01:47<00:00,  6.69s/it]it]
100%|██████████| 16/16 [01:57<00:00,  7.34s/it]it]
100%|██████████| 16/16 [02:14<00:00,  8.39s/it]/it]
100%|██████████| 16/16 [01:54<00:00,  7.16s/it]/it]
100%|██████████| 16/16 [01:59<00:00,  7.45s/it]/it]
100%|██████████| 16/16 [01:58<00:00,  7.39s/it]/it]
100%|██████████| 16/16 [02:26<00:00,  9.15s/it]/it]
100%|██████████| 16/16 [03:05<00:00, 11.61s/it]/it]
100%|██████████| 16/16 [03:02<00:00, 11.38s/it]/it]
100%|██████████| 16/16 [02:41<00:00, 10.06s/it]/it]
100%|██████████| 16/16 [02:28<00:00,  9.27s/it]/it]
100%|██████████| 16/16 [0

In [189]:
df = df.sort_values('silhouette_score')
df

Unnamed: 0,min_cluster_size,min_samples,epsilon,silhouette_score,calinski,davies,percentage_outliers,count_classes
4,20,10,0.25,-0.460873,384.617946,2.338334,0.138581,15
0,20,5,0.25,-0.380258,408.556347,2.456342,0.082115,14
16,40,5,0.25,-0.364953,526.235703,2.608840,0.086811,11
20,40,10,0.25,-0.362908,527.926471,2.627801,0.144509,11
48,80,5,0.25,-0.325171,209.853830,2.810212,0.097494,8
...,...,...,...,...,...,...,...,...
19,40,5,2.00,0.810630,3183.544670,0.135424,0.004109,2
18,40,5,1.00,0.810630,3183.544670,0.135424,0.004109,2
15,20,30,2.00,0.810630,3183.544670,0.135424,0.004109,2
46,60,30,1.00,0.810630,3183.544670,0.135424,0.004109,2


In [190]:
df['rank_silhouette'] = df.silhouette_score.rank(ascending=False, method='dense')
df['rank_calinksi'] = df.calinski.rank(ascending=False, method='dense')
df['rank_davies'] = df.davies.rank(ascending=True, method='dense')
df['average_rank'] = (df.rank_silhouette + df.rank_calinksi + df.rank_davies)/3

In [192]:
df = df[df['count_classes']>2]

In [196]:
df.sort_values('average_rank', ascending=True).head(30)

Unnamed: 0,min_cluster_size,min_samples,epsilon,silhouette_score,calinski,davies,percentage_outliers,count_classes,rank_silhouette,rank_calinksi,rank_davies,average_rank
44,60,30,0.25,0.198197,4212.416461,1.198699,0.29882,26,3.0,3.0,3.0,3.0
40,60,20,0.25,0.166484,5226.344223,1.237925,0.210248,19,9.0,1.0,5.0,5.0
28,40,30,0.25,0.195911,3544.984624,1.261038,0.30017,31,5.0,5.0,6.0,5.333333
12,20,30,0.25,0.173874,3090.256082,1.232927,0.290075,37,7.0,8.0,4.0,6.333333
88,120,20,0.25,0.199907,3387.969008,1.385598,0.213594,12,2.0,6.0,13.0,7.0
92,120,30,0.25,0.19648,2314.313522,1.293146,0.308916,18,4.0,11.0,9.0,8.0
60,80,30,0.25,0.187296,1909.846459,1.265308,0.310442,22,6.0,13.0,7.0,8.666667
76,100,30,0.25,0.173615,2070.380639,1.283232,0.320831,20,8.0,12.0,8.0,9.333333
56,80,20,0.25,0.153,2517.428791,1.368596,0.216998,16,10.0,10.0,10.0,10.0
24,40,20,0.25,0.068143,4400.61485,1.376778,0.202383,23,16.0,2.0,12.0,10.0


### Full model

In [208]:
hdbscan_model = HDBSCAN(
    min_cluster_size=60,
    metric='euclidean',
    min_samples=30,
    cluster_selection_epsilon=0.25,
    prediction_data=False)

text_cluster = hdbscan_model.fit(reduced_text_embeddings)
full_df['cluster'] = text_cluster.labels_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [253]:
full_df[full_df.companyName.str.contains('Broadcom')]

Unnamed: 0,index,_id,symbol,price,beta,volAvg,mktCap,lastDiv,range,changes,...,dcfDiff,dcf,image,ipoDate,defaultImage,isEtf,isActivelyTrading,isAdr,isFund,cluster
958,1039,657dcf9ab2cffe5b07e22a27,AVGOP,1406.7,1.109868,0,0.0,180.0,1387.76-2099.51,-37.3201,...,30.3316,1859.56,https://financialmodelingprep.com/image-stock/...,2019-09-25,True,False,False,False,False,-1


In [228]:
full_df.cluster.value_counts()

cluster
-1     4324
 15    2070
 2     1896
 14    1443
 11    1399
 13    1121
 1      942
 17     840
 19     699
 20     407
 18     406
 12     314
 10     219
 6      170
 8      149
 5      139
 9       97
 4       90
 7       90
 3       86
 0       70
 16      66
Name: count, dtype: int64

In [230]:
full_df[full_df['cluster']==16][['symbol', 'companyName', 'description']]

Unnamed: 0,symbol,companyName,description
529,NEPT,Neptune Wellness Solutions Inc.,Neptune Wellness Solutions Inc. operates as an...
1034,NUS,"Nu Skin Enterprises, Inc.","Nu Skin Enterprises, Inc. develops and distrib..."
1755,CYAN,Cyanotech Corporation,Cyanotech Corporation engages in the cultivati...
2214,VITL,"Vital Farms, Inc.","Vital Farms, Inc., an ethical food company, pr..."
2410,BON,Bon Natural Life Limited,"Bon Natural Life Limited, together with its su..."
...,...,...,...
17750,WLCOF,The Well Told Company Inc.,"The Well Told Company Inc., a wellness company..."
17754,USBC,"US BioTec, Inc.","US BioTec, Inc. engages in the development, ma..."
17868,FITX,"Creative Edge Nutrition, Inc.","Creative Edge Nutrition, Inc. develops and sel..."
17916,VYND,"Vynleads, Inc.","Vynleads, Inc. provides health and wellness in..."
