In [1]:
from neo4j_utility import *
from llm_extraction import *
from firecrawl_scraping import *
from utility import *
import os
from tqdm import tqdm
from dotenv import load_dotenv
from umap import UMAP
from hdbscan import HDBSCAN

import plotly.express as px
import plotly.graph_objects as go


  from .autonotebook import tqdm as notebook_tqdm


### Introduction

This notebook will conduct clustering analysis based on product description's embeddings.

### Collect embedding data from all extraction files

In [2]:
# Collect embedding data from targeted FinTech companies
description_lst = []
description_embedding_lst = []
product_name_lst = []
company_name_lst = []
processed_name_lst = []
url_lst = []

fintech_extraction_folder = os.getenv("fintech_extraction_folder")
doc_list = os.listdir(fintech_extraction_folder)
for doc in doc_list:
    
    try:        
        processed_name = doc.replace("_extraction.json", "")
        data = read_json_file(f'{fintech_extraction_folder}/{doc}')
        product_name_lst.append(data['summary_product_description']['name'])
        description_lst.append(data['summary_product_description']['description'])
        description_embedding_lst.append(data['summary_product_description']['description_embedding'])
        company_name_lst.append(data['name'])
        processed_name_lst.append(data['processed_company'])
        url_lst.append(data['url'])
    except Exception as e:
        print(f'Error at company {processed_name}: {e}')
    

Error at company .DS_Store: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte


In [4]:
# Collect embedding data from client's companies
client_extraction_folder = os.getenv("client_extraction_folder")
doc_list = os.listdir(client_extraction_folder)
for doc in doc_list:
    
    try:
        if doc in ['.DS_Store']:
            continue
        processed_name = doc.replace("_extraction.json", "")
        data = read_json_file(f'{client_extraction_folder}/{doc}')
        
        if data['summary_product_description']:
            product_name_lst.append(data['summary_product_description']['name'])
            description_lst.append(data['summary_product_description']['description'])
            description_embedding_lst.append(data['summary_product_description']['description_embedding'])
            company_name_lst.append(data['name'])
            processed_name_lst.append(data['processed_company'])
            url_lst.append(data['url'])
            
        else:
            print(f'Company {processed_name} does not have summary product description.')
    except Exception as e:
        print(f'Error at company {processed_name}: {e}')

Company d_l__henricksen does not have summary product description.
Company ska_street_brewstillery does not have summary product description.
Company the_nature_conservancy does not have summary product description.
Company nfm_lending does not have summary product description.
Company pinnacle_cu does not have summary product description.
Company chevron does not have summary product description.
Company cru_group does not have summary product description.
Company zer0es_tv does not have summary product description.
Company giant_eagle does not have summary product description.
Company cbibanks does not have summary product description.
Company vodafone_com_tr does not have summary product description.
Company texas_pace_authority does not have summary product description.
Company fathom_realty_llc does not have summary product description.
Company verra does not have summary product description.
Company kraus_anderson does not have summary product description.
Company rubenstein_part

In [6]:
# Create dataframe for the embedding data
df = pd.DataFrame(data = {
    'description': description_lst,
    'description_embedding': description_embedding_lst,
    'product_name': product_name_lst,
    'company_name': company_name_lst,
    'processed_name': processed_name_lst,
    'url': url_lst
})

df

Unnamed: 0,description,description_embedding,product_name,company_name,processed_name,url
0,A platform utilizing blockchain technology to ...,"[0.01961234211921692, -0.04968778416514397, -0...",Blockchain-Powered Phone System,HearRo,hearro,https://www.hearro.com
1,The software solutions provide tools for real-...,"[-0.010440151207149029, 0.0512225478887558, 0....",Construction Management and Production Trackin...,Raken,raken,https://www.rakenapp.com
2,Provides a range of accounting services and so...,"[-0.01257468294352293, 0.010148903355002403, 0...",Online Accounting Services,Crunch.,crunch_,https://www.crunch.co.uk
3,Provides an index capturing the peer-to-peer o...,"[-0.006143645849078894, -0.032648079097270966,...",Index and Finance Technology Platform,CrowdBureau,crowdbureau,https://www.crowdbureau.com
4,Provides a community for global citizens throu...,"[-0.007640076335519552, -0.008047102019190788,...",Global Community and Travel Platform,Yayem,yayem,https://www.yayem.co
...,...,...,...,...,...,...
2050,A range of personal and business banking produ...,"[-0.0005363529198803008, 0.007281074300408363,...","Bank Accounts, Savings, Loans, and Banking Ser...",Starling Bank,starling_bank,https://www.starlingbank.com
2051,Artificial arms capable of near-full human ran...,"[0.031287435442209244, 0.04793138802051544, -0...",AI-powered bionic arm,Atom Limbs,atom_limbs,https://www.atomlimbs.com
2052,Provides comprehensive HR and payroll solution...,"[-0.019249044358730316, 0.000286013848381117, ...",HR and Payroll Software,Paycom,paycom,https://www.paycom.com
2053,Comprehensive financial and business services ...,"[-0.015923814848065376, 0.02055956795811653, 0...",Financial and Business Services,Cleveland Group,cleveland_group,https://www.clevelandgroup.net


## HDBSCAN with Umap
Prior to clustering, dimensionality reduction is required. In a high-dimensional space, such as one with 1,536 dimensions, clustering becomes challenging due to the curse of dimensionality. 

For this analysis, the dimensionality of the embedding vectors was reduced to 5 dimensions using UMAP. 

Once the dimensionality was reduced, the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm was employed to identify clusters within the 5-dimensional space. 

In [137]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Load your embeddings
embeddings = df['description_embedding']
product_names = df['product_name']

if isinstance(embeddings, list):
    embeddings = np.array([np.array(embed) for embed in embeddings])
elif isinstance(embeddings, pd.Series):
    embeddings = np.stack(embeddings.values)
    
# Check the shape to ensure it's 2D
print('Embeddings shape:', embeddings.shape)

# Reduce dimensionality for visualization
umap_2d = UMAP(n_components=2, random_state=42)
umap_3d = UMAP(n_components=3, random_state=42)

embeddings_2d = umap_2d.fit_transform(embeddings)
embeddings_3d = umap_3d.fit_transform(embeddings)
embedding_umap = umap_model.fit_transform(embeddings)

# Perform HDBSCAN clustering
clusterer = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom')

cluster_labels = clusterer.fit_predict(embedding_umap)

df_2d = pd.DataFrame(embeddings_2d, columns=['Dim1', 'Dim2'])
df_2d['Cluster'] = [str(i) for i in cluster_labels]
df_2d['Product Name'] = product_names

df_3d = pd.DataFrame(embeddings_3d, columns=['Dim1', 'Dim2', 'Dim3'])
df_3d['Cluster'] = [str(i) for i in cluster_labels]
df_3d['Product Name'] = product_names

Embeddings shape: (2055, 1536)


In [138]:
# 2D Visualization
fig_2d = px.scatter(df_2d, x='Dim1', y='Dim2', color='Cluster', hover_data=['Product Name'],
                    title="2D UMAP Visualization of HDBSCAN Clusters",
                    labels={"Cluster": "Cluster ID"})
fig_2d.show()

# 3D Visualization
fig_3d = px.scatter_3d(df_3d, x='Dim1', y='Dim2', z='Dim3', color='Cluster', hover_data=['Product Name'],
                       title="3D UMAP Visualization of HDBSCAN Clusters",
                       labels={"Cluster": "Cluster ID"})
fig_3d.show()


In [111]:
df_3d.head()

Unnamed: 0,Dim1,Dim2,Dim3,Cluster,Product Name
0,14.487803,-0.166915,6.745245,25,Blockchain-Powered Phone System
1,14.358218,-1.370452,8.004601,26,Construction Management and Production Trackin...
2,14.600207,1.467352,5.540948,23,Online Accounting Services
3,12.786939,1.186063,8.983631,0,Index and Finance Technology Platform
4,12.517757,1.49098,7.734725,-1,Global Community and Travel Platform


## BERTopic (Final Method)
Use BERTopic to apply UMAP for dimensionality reduction, HDBSCAN for clustering and GPT-4o for generating cluster names.

In [112]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
import openai
from bertopic.representation import OpenAI

# Step 1 - Load embeddings
embeddings = df['description_embedding']
product_names = df['product_name']
documents = df['description']

if isinstance(embeddings, list):
    embeddings = np.array([np.array(embed) for embed in embeddings])
elif isinstance(embeddings, pd.Series):
    embeddings = np.stack(embeddings.values)

if isinstance(documents, list):
    documents = np.array([np.array(doc) for doc in documents])
elif isinstance(documents, pd.Series):
    documents = np.stack(documents.values)


# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# # Step 6 - (Optional) Fine-tune topic representations with a `bertopic.representation` model

# Fine-tune topic representations with GPT
client = openai.OpenAI(api_key=os.getenv('OPENAI_KEY'))
representation_model = OpenAI(client, model="gpt-4o", chat=True)
topic_model = BERTopic(representation_model=representation_model)

# Initialize BERTopic without the embedding model
topic_model = BERTopic(
    umap_model=umap_model,                    # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
    representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
    
)

# Fit the topic model with the precomputed embeddings
topics, probabilities = topic_model.fit_transform(documents, embeddings)


In [113]:
df_topic = topic_model.get_topic_info()

# df_topic.to_csv('data/topic_extraction_detailed.csv', index = False)
# df_topic = pd.read_csv('data/topic_extraction_detailed.csv')

In [114]:
df_topic.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,536,-1_Integrated Digital and Business Management ...,[Integrated Digital and Business Management So...,[Provides comprehensive business verification ...
1,0,110,0_Real Estate Management and Marketing Platforms,[Real Estate Management and Marketing Platforms],[A comprehensive platform offering tools for b...
2,1,109,1_Comprehensive Banking and Financial Services,[Comprehensive Banking and Financial Services],[Offering a range of banking services and loan...
3,2,106,2_Comprehensive Real Estate Services and Prope...,[Comprehensive Real Estate Services and Proper...,[Provides a variety of commercial real estate ...
4,3,91,3_Comprehensive Payment Processing and Busines...,[Comprehensive Payment Processing and Business...,[Offers a comprehensive range of payment and f...
5,4,90,4_Comprehensive Construction and Building Serv...,[Comprehensive Construction and Building Servi...,[Provides a wide range of construction and pro...
6,5,90,5_Comprehensive Financial Data and Trading Ana...,[Comprehensive Financial Data and Trading Anal...,[A modern data analytics platform offering com...
7,6,74,6_AI-Powered Data and Risk Management Solution...,[AI-Powered Data and Risk Management Solutions...,[AI and analytics platforms designed for finan...
8,7,68,7_Decentralized Platforms for Content Creation...,[Decentralized Platforms for Content Creation ...,[Provides a decentralized platform allowing cr...
9,8,62,8_Tailored Investment and Capital Solutions fo...,[Tailored Investment and Capital Solutions for...,"[Provide capital raising and management, wealt..."


In [116]:
# Get cluster names
cluster_labels = list(df_topic['Name'])
cluster_label_map = {}

for label in cluster_labels:
    key = label.split('_')[0]
    cluster_label_map[key] = label

topics_detected = [cluster_label_map[str(i)] for i in topics]

df['Cluster'] = topics_detected
# df.to_csv('data/company_clustering_result_detailed.csv', index = False)

In [118]:
# Dimensionality reduction for visualisation
umap_2d = UMAP(n_components=2, random_state=42)
umap_3d = UMAP(n_components=3, random_state=42)

embeddings_2d = umap_2d.fit_transform(embeddings)
embeddings_3d = umap_3d.fit_transform(embeddings)

df_2d = pd.DataFrame(embeddings_2d, columns=['Dim1', 'Dim2'])
df_2d['Cluster'] = topics_detected
df_2d['Product Name'] = product_names

df_3d = pd.DataFrame(embeddings_3d, columns=['Dim1', 'Dim2', 'Dim3'])
df_3d['Cluster'] = topics_detected
df_3d['Product Name'] = product_names

In [119]:
# 2D Visualization
fig_2d = px.scatter(df_2d, x='Dim1', y='Dim2', color='Cluster', hover_data=['Product Name'],
                    title="2D UMAP Visualization of HDBSCAN Clusters",
                    labels={"Cluster": "Cluster Name"})

fig_2d.show()

# 3D Visualization
fig_3d = px.scatter_3d(df_3d, x='Dim1', y='Dim2', z='Dim3', color='Cluster', hover_data=['Product Name'],
                       title="3D UMAP Visualization of HDBSCAN Clusters",
                       labels={"Cluster": "Cluster Name"})
fig_3d.show()


### Analytics

Note: -1 label means unclassified due to the noise-aware nature of HDBSCAN algorithm.

In [139]:
df = pd.read_csv('data/company_clustering_result_detailed.csv')
df['Cluster'] = df['Cluster'].replace('-1_Comprehensive Financial and Business Management Platforms', '-1_Unclassified')
df.head()

Unnamed: 0,description,description_embedding,product_name,company_name,processed_name,url,Cluster
0,A platform utilizing blockchain technology to ...,"[0.01961234211921692, -0.04968778416514397, -0...",Blockchain-Powered Phone System,HearRo,hearro,https://www.hearro.com,7_Blockchain-Based Platforms for Digital Conte...
1,The software solutions provide tools for real-...,"[-0.010440151207149029, 0.0512225478887558, 0....",Construction Management and Production Trackin...,Raken,raken,https://www.rakenapp.com,21_Construction Project Management Software So...
2,Provides a range of accounting services and so...,"[-0.01257468294352293, 0.010148903355002403, 0...",Online Accounting Services,Crunch.,crunch_,https://www.crunch.co.uk,"0_Comprehensive Tax, Accounting, and Advisory ..."
3,Provides an index capturing the peer-to-peer o...,"[-0.006143645849078894, -0.032648079097270966,...",Index and Finance Technology Platform,CrowdBureau,crowdbureau,https://www.crowdbureau.com,10_Mortgage and Lending Solutions
4,Provides a community for global citizens throu...,"[-0.007640076335519552, -0.008047102019190788,...",Global Community and Travel Platform,Yayem,yayem,https://www.yayem.co,-1_Unclassified


In [153]:

def pie_chart_analysis(df):
    # Step 1: Group by the 'cluster' column and count the number of companies in each cluster
    cluster_counts = df['Cluster'].value_counts().reset_index()
    cluster_counts.columns = ['Cluster', 'count']

    # Step 2: Sort clusters by count and select the top 10 clusters
    top_clusters = cluster_counts.nlargest(11, 'count')

    # Step 3: Calculate the remaining clusters and group them under 'Other'
    remaining_clusters_count = cluster_counts['count'].sum() - top_clusters['count'].sum()
    other_cluster = pd.DataFrame({'Cluster': ['Other'], 'count': [remaining_clusters_count]})

    # Combine the top clusters and "Other" category using pd.concat
    top_clusters = pd.concat([top_clusters, other_cluster], ignore_index=True)

    # Step 4: Generate the pie chart
    fig = px.pie(top_clusters, values='count', names='Cluster', title='Top Clusters with Largest Proportion')

    # Show the plot
    fig.show()

In [154]:
pie_chart_analysis(df)

In [155]:
# Merge with the FinTech dataset to focus on the targeted FinTech companies
df_target = pd.read_csv('data/merge_url_companies.csv')
df_target['target'] = True
df_target_shorten = df_target[['processed_name','target']]
df_merge = df.merge(df_target_shorten, on = 'processed_name', how = 'left')
df_merge.head()

Unnamed: 0,description,description_embedding,product_name,company_name,processed_name,url,Cluster,target
0,A platform utilizing blockchain technology to ...,"[0.01961234211921692, -0.04968778416514397, -0...",Blockchain-Powered Phone System,HearRo,hearro,https://www.hearro.com,7_Blockchain-Based Platforms for Digital Conte...,True
1,The software solutions provide tools for real-...,"[-0.010440151207149029, 0.0512225478887558, 0....",Construction Management and Production Trackin...,Raken,raken,https://www.rakenapp.com,21_Construction Project Management Software So...,True
2,Provides a range of accounting services and so...,"[-0.01257468294352293, 0.010148903355002403, 0...",Online Accounting Services,Crunch.,crunch_,https://www.crunch.co.uk,"0_Comprehensive Tax, Accounting, and Advisory ...",True
3,Provides an index capturing the peer-to-peer o...,"[-0.006143645849078894, -0.032648079097270966,...",Index and Finance Technology Platform,CrowdBureau,crowdbureau,https://www.crowdbureau.com,10_Mortgage and Lending Solutions,True
4,Provides a community for global citizens throu...,"[-0.007640076335519552, -0.008047102019190788,...",Global Community and Travel Platform,Yayem,yayem,https://www.yayem.co,-1_Unclassified,True


In [156]:
df_merge_target = df_merge[df_merge['target'] == True]
df_merge_client = df_merge[df_merge['target'] != True]
pie_chart_analysis(df_merge_target)
pie_chart_analysis(df_merge_client)

In [157]:
df_merge_target[df_merge_target['Cluster'] == '2_Real Estate Management and Marketing Platforms'].head()

Unnamed: 0,description,description_embedding,product_name,company_name,processed_name,url,Cluster,target
7,"Provides a wide range of real estate data, val...","[-0.003999155014753342, 0.012150825001299381, ...",AI-powered Real Estate Data and Analytics Plat...,HouseCanary,housecanary,https://www.housecanary.com,2_Real Estate Management and Marketing Platforms,True
8,Provides a platform powered by data science th...,"[-0.009919610805809498, 0.000763335672672838, ...",Real estate investment platform,Manor Straits,manor_straits,https://www.manorstraits.com,2_Real Estate Management and Marketing Platforms,True
18,The company provides comprehensive property da...,"[0.02346922643482685, 0.012462479062378407, 0....",Complete Property & Owner Database and Prospec...,ProspectNow,prospectnow,https://www.prospectnow.com,2_Real Estate Management and Marketing Platforms,True
27,Provides a suite of tools for real estate prof...,"[-0.029642626643180847, 0.02281319908797741, 0...",Real Estate Analytics and Management Platforms,Sisu),sisu,https://www.sisu.co,2_Real Estate Management and Marketing Platforms,True
31,A cloud-based data platform designed for comme...,"[0.004274421371519566, 0.03482777997851372, 0....",Data Curation & Compliance Platform,Mango REIX,mango_reix,https://www.mangoreix.com,2_Real Estate Management and Marketing Platforms,True


In [158]:
# # Merge with the FinTech dataset to focus on the targeted FinTech companies with valuation > 100 million
df_pitch = pd.read_csv('data/merge_url_companies.csv')
df_high_valuation = df_pitch[df_pitch['last_known_valuation'] >= 100]
df_high_valuation = df_high_valuation.merge(df_merge_target, on = 'processed_name', how = 'left')

# Step 1: Group by the 'cluster' column and count the number of companies in each cluster
cluster_counts = df_high_valuation['Cluster'].value_counts().reset_index()
cluster_counts.columns = ['Cluster', 'count']

# Step 2: Sort clusters by count and select the top 15 clusters
top_clusters = cluster_counts.nlargest(15, 'count')

# Step 4: Generate the pie chart
fig = px.pie(top_clusters, values='count', names='Cluster', title='Top Clusters with Largest Proportion')

# Show the plot
fig.show()

In [159]:
# Clusters found in the biggest community
community_discover = ["2_Real Estate Management and Marketing Platforms",
                      "12_Comprehensive Commercial Real Estate Services and Investments",
                      "-1_Unclassified",
                      "9_Comprehensive Property Management Solutions for Landlords and Real Estate Markets",
                      "14_Comprehensive IT and Security Solutions for Business Operations",
                      "15_Comprehensive ESG and Carbon Management Solutions",
                      "16_Insurance Automation and Analytics Platforms",
                      "1_Comprehensive Banking and Financial Services",
                      "8_Private Investment and Capital Management Solutions for Growth and Startups",
                      "3_Comprehensive Construction and Building Services",
                      "25_Comprehensive Community and Social Services Development"]
cluster_counts_community = pd.DataFrame({'Cluster':community_discover, 'count':[10, 9, 9, 4, 1, 1, 1, 1, 1, 1, 1]})


# Step 4: Generate the pie chart
fig = px.pie(cluster_counts_community, values='count', names='Cluster', title='Top Clusters with Largest Proportion')

# Show the plot
fig.show()