# Training

## Topic modeling with BERTopic

In [1]:
from bertopic import BERTopic
from umap import UMAP
import json
import os
from sklearn.cluster import KMeans


CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at  ../c10/cuda/CUDAFunctions.cpp:112.)



In [2]:
# Hyperparameters

# Target number of topics; handed to BERTopic as `nr_topics` when the model is built.
N_TOPICS = 100

In [3]:
# Dimensionality-reduction step: the UMAP instance that BERTopic will use.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # fixed seed so repeated runs use the same random state
)

# from sklearn.decomposition import PCA

# pca_model = PCA(n_components=50)

# # clustering with K means
# cluster_model = KMeans(n_clusters=N_TOPICS)

# # Initialize BERT topic
# topic_model = BERTopic(
#     #umap_model=umap_model,
#     umap_model=pca_model,
#     #nr_topics=N_TOPICS
#     hdbscan_model=cluster_model,
#     calculate_probabilities=True
#     )

# Build the topic model around the UMAP reducer and the requested topic count.
# NOTE(review): the saved outputs further down show 204 probability columns and
# topic ids above 200, which does not match nr_topics=N_TOPICS (100); they look
# like they come from a run with a different configuration. Re-run to confirm.
topic_model = BERTopic(
    umap_model=umap_model,
    nr_topics=N_TOPICS,
    # hdbscan_model=cluster_model,
    calculate_probabilities=True,  # fit_transform then also returns per-topic probabilities
)

In [4]:
# Load the training data: a project-level flag selects the downsampled or the
# full JSON file, and the chosen file is parsed once.
from myutils.utils import using_downsampled_train_dataset

train_dataset_path = (
    "./datasets/downsampled_train_dataset.json"
    if using_downsampled_train_dataset
    else "./datasets/train_dataset.json"
)
with open(train_dataset_path, "r") as dataset_file:
    train_dataset = json.load(dataset_file)

In [5]:
# Number of top-level entries in the loaded training set.
len(train_dataset)

2391

In [6]:
# Inspect the first entry to see the record layout.
# NOTE(review): the saved output exposes a user handle and raw message texts;
# consider clearing it before sharing the notebook.
train_dataset[0]

{'user': 'sfannah',
 'labeled_texts': [{'text': "wants mauds ice cream real bad   stupid england don't sell it",
   'polarity': 0},
  {'text': 'is soo not ready for maths', 'polarity': 0},
  {'text': 'has had terrible signal in culford so had not been on twitter or able to text',
   'polarity': 0},
  {'text': "i have and you haven't replied", 'polarity': 0},
  {'text': 'i wish i could give you that hug right now', 'polarity': 0},
  {'text': 'i secretly want to be a pokï¿½mon', 'polarity': 1},
  {'text': 'its at home  i miss it', 'polarity': 0},
  {'text': "why is everyone watching f1 but me  i'm stuck watching parent trap... oh the joy!",
   'polarity': 0},
  {'text': 'and this happens to be one of them', 'polarity': 1},
  {'text': "i don't think people would know how much that made me smile  i'm cheered up (y)",
   'polarity': 1},
  {'text': 'thinks people should group hug more', 'polarity': 1},
  {'text': 'sorry blame the phone!  me no means it', 'polarity': 0},
  {'text': ":o not th

In [7]:
# Flatten the per-entry texts into one corpus and keep a parallel list of labels
# (one label per document, in the same order as the documents).
all_training_docs = []
all_training_labels = []
for entry in train_dataset:
    entry_docs = [labeled_text["text"] for labeled_text in entry["labeled_texts"]]
    # the documents are given the user tag: each text inherits its entry's label
    entry_labels = [entry["label"] for _ in entry_docs]
    all_training_docs.extend(entry_docs)
    # Previously entry_labels was built and then dropped, which left
    # all_training_labels empty; store it so both lists stay aligned.
    all_training_labels.extend(entry_labels)

# Guard against the two lists drifting apart.
assert len(all_training_docs) == len(all_training_labels)

In [8]:
# Total number of individual texts collected across all entries.
len(all_training_docs)

24804

In [9]:
%%time
# Trial fit on the first 2000 documents only; the full-corpus fit in a later
# cell overwrites both `topics` and `probs`.
trial_docs = all_training_docs[:2000]
topics, probs = topic_model.fit_transform(trial_docs)

CPU times: user 46min 59s, sys: 1min 20s, total: 48min 20s
Wall time: 2min 11s


In [10]:
# Peek at the probabilities returned by the preceding fit_transform call.
probs

array([[0.03 , 0.028, 0.032, ..., 0.023, 0.016, 0.013],
       [0.017, 0.013, 0.014, ..., 0.035, 0.008, 0.008],
       [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ],
       ...,
       [0.027, 0.019, 0.019, ..., 0.059, 0.011, 0.01 ],
       [0.028, 0.023, 0.033, ..., 0.022, 0.014, 0.011],
       [0.019, 0.013, 0.015, ..., 0.037, 0.009, 0.008]])

In [11]:
%%time
# Fit on the complete corpus. `topics` and `probs` from the earlier trial fit
# are replaced by the results for all documents.
topics, probs = topic_model.fit_transform(all_training_docs)

CPU times: user 8h 33min 29s, sys: 8min 41s, total: 8h 42min 11s
Wall time: 22min 5s


In [25]:
from collections import Counter

In [26]:
# Tally how many documents were assigned to each topic id.
# NOTE(review): in the saved output, -1 accounts for 16917 of the 24804
# documents; presumably that is the outlier bucket. Confirm.
Counter(topics)

Counter({66: 987,
         132: 208,
         -1: 16917,
         186: 230,
         69: 54,
         124: 73,
         192: 249,
         177: 102,
         101: 98,
         157: 494,
         185: 71,
         29: 437,
         121: 66,
         179: 100,
         184: 210,
         34: 92,
         199: 38,
         202: 85,
         48: 28,
         133: 56,
         158: 67,
         138: 31,
         53: 62,
         188: 30,
         36: 49,
         195: 52,
         130: 57,
         104: 43,
         28: 81,
         85: 224,
         190: 58,
         78: 63,
         127: 75,
         142: 59,
         160: 110,
         35: 302,
         23: 42,
         146: 33,
         57: 39,
         174: 96,
         105: 45,
         152: 37,
         92: 49,
         43: 24,
         122: 38,
         191: 66,
         140: 28,
         135: 57,
         200: 155,
         42: 29,
         70: 34,
         194: 40,
         79: 79,
         99: 36,
         136: 49,
         33: 2

In [27]:
import pickle

In [28]:
import os

# Directory that receives the pickled model, topic assignments and probabilities.
training_dir = "training"

# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(training_dir, exist_ok=True)

In [29]:
# Persist the fitted model with pickle.
# NOTE(review): only unpickle this file from a trusted source; loading a
# pickle can execute arbitrary code.
model_path = os.path.join(training_dir, "topic_model.pk")
with open(model_path, "wb") as model_file:
    pickle.dump(topic_model, model_file)

In [30]:
#topic_model.save(
#    os.path.join(training_dir, "topic_model.pk"),
#    serialization = "pickle"
#)


# embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# topic_model.save(
#     "topic_model_pytorch", 
#     serialization="pytorch", 
#     save_ctfidf=True, 
#     save_embedding_model=embedding_model
#     )

#topic_model.save(
#    os.path.join(training_dir, "topic_model.pk"),
#    serialization = "pickle"
#)

In [31]:
# Persist the per-document topic assignments from the last fit.
topics_path = os.path.join(training_dir, "topics.pk")
with open(topics_path, "wb") as topics_file:
    pickle.dump(topics, topics_file)

In [32]:
# Persist the probability matrix from the last fit.
probabilities_path = os.path.join(training_dir, "training_probabilities.pk")
with open(probabilities_path, "wb") as probabilities_file:
    pickle.dump(probs, probabilities_file)

## Inspecting the topics

In [33]:
# Show the top words (with their scores) for every topic.
# get_topics() is the public accessor for this mapping; the bare `topics`
# attribute is not available in every BERTopic release.
topic_model.get_topics()

{-1: [('will', 0.0038787339195901593),
  ('love', 0.0038274577052269053),
  ('out', 0.00372639635996745),
  ('cant', 0.0037169898055984165),
  ('if', 0.003641473424951967),
  ('dont', 0.003639504604655357),
  ('lol', 0.003602867150902807),
  ('new', 0.0035159743712188823),
  ('when', 0.003489005951131784),
  ('time', 0.0034744827139442896)],
 0: [('wwwm2easia', 0.20106767757050154),
  ('dividends', 0.20106767757050154),
  ('shareholder', 0.20106767757050154),
  ('earn', 0.19707384544492124),
  ('free', 0.1560072709528419),
  ('funtastic', 0.005084042155069806),
  ('cards', 0.0036925345441980763),
  ('deck', 0.0035292826256454308),
  ('yourself', 0.0032614887506393787),
  ('play', 0.0021450831013279394)],
 1: [('vip', 0.17018624974770571),
  ('100', 0.16970314631794164),
  ('followers', 0.145676617602392),
  ('millionaire', 0.006519426365916509),
  ('subscribers', 0.006519426365916509),
  ('125', 0.006249808678520276),
  ('cats', 0.004570558941661742),
  ('congratulations', 0.0044601718

In [34]:
import nbformat

In [35]:
# Render BERTopic's built-in topic visualization for the fitted model.
topic_model.visualize_topics()

In [43]:
# Shape of the probability matrix as (rows, columns).
probs.shape

(24804, 204)

In [42]:
# First row of the probability matrix, i.e. the values for the first document.
probs[0]

array([0.001, 0.001, 0.001, 0.   , 0.002, 0.   , 0.008, 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.003, 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.003, 0.005, 0.   , 0.   , 0.003, 0.01 , 0.   , 0.   ,
       0.   , 0.003, 0.004, 0.   , 0.003, 0.005, 0.004, 0.005, 0.004,
       0.006, 0.005, 0.   , 0.   , 0.   , 0.003, 0.003, 0.003, 0.   ,
       0.007, 0.   , 0.007, 0.008, 0.   , 0.   , 0.   , 0.   , 0.004,
       0.   , 0.   , 0.005, 0.003, 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.012, 0.   , 0.12 , 0.   , 0.   , 0.004, 0.004, 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.007, 0.003, 0.   ,
       0.003, 0.005, 0.   , 0.   , 0.015, 0.   , 0.   , 0.   , 0.005,
       0.006, 0.   , 0.004, 0.   , 0.004, 0.   , 0.   , 0.   , 0.   ,
       0.004, 0.   , 0.005, 0.   , 0.005, 0.004, 0.01 , 0.003, 0.003,
       0.006, 0.   , 0.004, 0.   , 0.006, 0.   , 0.004, 0.005, 0.   ,
       0.   , 0.   , 0.005, 0.   , 0.003, 0.004, 0.   , 0.004, 0.   ,
       0.   , 0.003,

In [37]:
# NOTE(review): same check as the earlier probs.shape cell; the execution
# counts (43, 42, 37) show these last cells were run out of order.
probs.shape

(24804, 204)