<a href="https://colab.research.google.com/github/p-koenig/msg-datathon/blob/main/bertopic_msg_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install bertopic

Collecting bertopic
  Using cached bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Using cached hdbscan-0.8.33.tar.gz (5.2 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Using cached Cython-0.29.36-cp310-c

In [2]:
%cd drive/MyDrive/msg-challenge

/content/drive/MyDrive/msg-challenge


In [4]:
from bertopic import BERTopic
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np

model = BERTopic(calculate_probabilities=True)
df = pd.read_csv("preprocessed_data.csv", index_col=0).reset_index()

nltk.download('stopwords')
nltk.download('punkt')
stoplist = set(stopwords.words("english"))

def remove_stopwords(document):
    word_lst =[]
    # split the text by whitespace
    for word in document.split():
        if word.lower() not in stoplist:
            word_lst.append(word)
    # return the document as a string
    return ' '.join(word_lst)

no_stopwords_df = df.copy()
no_stopwords_df["content"] = no_stopwords_df["content"].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
category_lst = ["political instability",
                "geopolitical factors",
                "currency fluctuations",
                "investment demand",
                "supply and demand",
                "industrial demand",
                "natural disasters"]

# function which creates an array for a word based on the similarities scores returned by the find_topics method
def create_category_vector(category, num_topics):
    topics = model.find_topics(category, top_n=num_topics)
    topic_lst = topics[0]
    prob_lst = topics[1]
    array = np.zeros(num_topics)
    for lst_idx, topic_idx in enumerate(topic_lst):
        # ignore -1 which means outlier (topic)
        if topic_idx != -1:
            array[topic_idx] = prob_lst[lst_idx]
    return array

# function to iterate over all documents probabilities
def create_doc_embeddings(similarities):
    """
    # create an embedding for every document in every category, e.g. political stability and return 7 dimensional array

    # every document has a similarity score for every topic created by BERTopic
    # compute the cosine similarity of the similarity score vector of the document and the category
    """
    num_categories = len(category_lst)
    embedding_lst = []
    for doc in similarities:
        array = np.zeros(num_categories)
        for array_idx, category_embedding in enumerate(category_embeddings):
            doc_embedding = np.array(doc)
            dot_product = np.dot(category_embedding, doc_embedding)
            norm_doc = np.linalg.norm(doc_embedding)
            norm_category = np.linalg.norm(category_embedding)
            cosine_similarity = dot_product / (norm_doc * norm_category)
            array[array_idx] = cosine_similarity
        embedding_lst.append(array)
    return embedding_lst


In [7]:
topics, similarities = model.fit_transform(no_stopwords_df["content"])

# creating an embedding for each element in the category_lst
num_topics = len(np.unique(topics)) - 1
category_embeddings = [create_category_vector(category, num_topics) for category in category_lst]

# creating embedding and saving as a df
embedding_lst = create_doc_embeddings(similarities)
embedding_df = pd.DataFrame(embedding_lst)
embedding_df.columns = category_lst
embedding_df['date'] = df['date']

In [None]:
from umap import UMAP
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN

# hyperparameters
embedding_model_lst = ["all-MiniLM-L12-v2", "all-mpnet-base-v2", "all-distilroberta-v1"]

umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
empty_dimensionality_model = BaseDimensionalityReduction()
umap_model_lst = [umap, empty_dimensionality_model]
umap_model_names = ["umap", "empty"]

# grid search
for embedding_model in embedding_model_lst:
    for umap_model, umap_model_name in zip(umap_model_lst, umap_model_names):
        model = BERTopic(embedding_model=embedding_model,
                          umap_model=umap_model,
                          calculate_probabilities=True)
        topics, similarities = model.fit_transform(no_stopwords_df["content"])

        num_topics = len(np.unique(topics)) - 1
        category_embeddings = [create_category_vector(category, num_topics) for category in category_lst]

        embedding_lst = create_doc_embeddings(similarities)

        # embeddings without normalization
        embedding_df = pd.DataFrame(embedding_lst)
        embedding_df.columns = category_lst
        embedding_df['date'] = df['date']
        embedding_df.to_csv(f"{embedding_model}_{umap_model_name}.csv")

        # embeddings with normalization
        norm_embedding_df = embedding_df.copy()
        norm_embedding_df.iloc[:,0:-1] = norm_embedding_df.iloc[:,0:-1].apply(lambda x: (x-x.mean())/ x.std(), axis=0)
        norm_embedding_df.to_csv(f"{embedding_model}_{umap_model_name}_normalized.csv")

  in_cluster_probs = all_points_prob_in_some_cluster(
  cosine_similarity = dot_product / (norm_doc * norm_category)


Downloading (…)99753/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0cdb299753/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)db299753/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)753/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)99753/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)9753/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)0cdb299753/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b299753/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  in_cluster_probs = all_points_prob_in_some_cluster(
  cosine_similarity = dot_product / (norm_doc * norm_category)


Downloading (…)87e68/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)5afc487e68/README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading (…)fc487e68/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e68/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading (…)afc487e68/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)87e68/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)7e68/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)afc487e68/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)c487e68/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]