# **BERTopic - Tutorial**

In [184]:
#!pip install bertopic[visualization] --quiet
# ! pip install bertopic

# **Imports**

In [185]:
import numpy as np
import pandas as pd
from copy import deepcopy
from bertopic import BERTopic

# **Load data**

In [110]:
df = pd.read_csv("/content/Emails.csv")

In [115]:
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [116]:
# data = data[pd.notnull(data['ExtractedBodyText'])]
# print(data.sample(5)['ExtractedBodyText'])

In [126]:
data=df[['ExtractedBodyText']]
# df=data.copy()
data.head()

Unnamed: 0,ExtractedBodyText
0,
1,"b6\nthursday, march 3, 2011 9:45 pm\nh: latest..."
2,thx
3,
4,"h <hrod17@clintonemail.com>\nfriday, march 11,..."


In [118]:
import pandas as pd
import numpy as np
from sentence_transformers import *
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import matplotlib.cm as cm
import torch
import sentence_transformers
# from lexrank_utility import *
import umap
import plotly
plotly.offline.init_notebook_mode (connected = True)

In [119]:
# Function to preprocess the tweets data
def preprocess_tweet_data(data,name):
    # Lowering the case of the words in the sentences
    data[name]=data[name].str.lower()
    # Code to remove the Hashtags from the text
    data[name]=data[name].apply(lambda x:re.sub(r'\B#\S+','',x))
    # Code to remove the links from the text
    data[name]=data[name].apply(lambda x:re.sub(r"http\S+", "", x))
    # Code to remove the Special characters from the text 
    data[name]=data[name].apply(lambda x:' '.join(re.findall(r'\w+', x)))
    # Code to substitute the multiple spaces with single spaces
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))
    # Code to remove all the single characters in the text
    data[name]=data[name].apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))
    # Remove the twitter handlers
    data[name]=data[name].apply(lambda x:re.sub('@[^\s]+','',x))
# This function is to remove stopwords from a particular column and to tokenize it
def rem_stopwords_tokenize(data,name):

    def getting(sen):
        example_sent = sen

        stop_words = set(stopwords.words('english')) 

        word_tokens = word_tokenize(example_sent) 

        filtered_sentence = [w for w in word_tokens if not w in stop_words] 

        filtered_sentence = [] 

        for w in word_tokens: 
            if w not in stop_words: 
                filtered_sentence.append(w.lower()) 
        return filtered_sentence
    x=[]
    for i in data[name].values:
        x.append(getting(i))
    data[name]=x
# Making a function to lemmatize all the words
lemmatizer = WordNetLemmatizer() 
def lemmatize_all(data,name):
    arr=data[name]
    a=[]
    for i in arr:
        b=[]
        for j in i:
            x=lemmatizer.lemmatize(j,pos='a')
            x=lemmatizer.lemmatize(x)
            b.append(x)
        a.append(b)
    data[name]=a
# Function to make it back into a sentence 

def make_sentences(data,name):
    data[name]=data[name].apply(lambda x:' '.join([i+' ' for i in x]))
    # Removing double spaces if created
    data[name]=data[name].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))

In [156]:
df.columns

Index(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'],
      dtype='object')

In [133]:
data = data[pd.notnull(data['ExtractedBodyText'])]
print(data.sample(5)['ExtractedBodyText'])

4657    i think it's dec 10 at the winter palace in lu...
3554    i didn't - but hard to believe he did. i was p...
6649    this is the distilled, pure and utterly conven...
1104    we should do schedule so i have time w him whe...
2577    i am forwarding a few pictures i have received...
Name: ExtractedBodyText, dtype: object


In [164]:
timestamp_ = df[pd.notnull(df['ExtractedBodyText'])]['ExtractedDateSent']

(6742,)

In [134]:
# Using the preprocessing function to preprocess the tweet data
preprocess_tweet_data(data,  'ExtractedBodyText')
# Using tokenizer and removing the stopwords
rem_stopwords_tokenize(data, 'ExtractedBodyText')
# Converting all the texts back to sentences
make_sentences(data, 'ExtractedBodyText')

In [140]:
# Getting a model
model=SentenceTransformer('bert-large-nli-mean-tokens')

In [141]:
embeddings = model.encode(data['ExtractedBodyText'].values)

# **Creating Topics**

In [142]:
model = BERTopic(language="english")

In [143]:
model2 = BERTopic(language="english")
topics, probabilities = model2.fit_transform(data['ExtractedBodyText'],embeddings)

In [144]:
model2.get_topic_freq().head()

Unnamed: 0,Topic,Count
0,-1,2413
1,0,471
2,1,428
3,2,176
4,3,148


In [145]:
model2.get_topic(24)

[('647', 0.14999803976268644),
 ('please', 0.14540573034943774),
 ('202', 0.14124608457163496),
 ('immediate', 0.11137336710304307),
 ('assistance', 0.10269970879850872),
 ('5548', 0.09262895336573466),
 ('joanne', 0.09195722801725396),
 ('need', 0.0870893214567144),
 ('laszczych', 0.08628402325973852),
 ('traveling', 0.08146294702888833)]

In [186]:
model2.visualize_topics()

In [151]:
df['ExtractedDateSent'] = pd.to_datetime(df['MetadataDateSent'])

In [176]:
len(data['ExtractedBodyText'].to_list()), len(df['ExtractedDateSent'].to_list())

(6742, 7945)

In [178]:
model2.update_topics(data['ExtractedBodyText'], topics, n_gram_range=(1, 2))

In [180]:
topics, probabilities = model2.fit_transform(data['ExtractedBodyText'].values)

In [181]:
model2.visualize_topics()

In [187]:
### Model LDA

In [215]:
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [216]:
tokenizer = RegexpTokenizer(r'\w+')
texts = [tokenizer.tokenize(email.lower()) for email in data['ExtractedBodyText']]

In [217]:
def delete_stopwords(tokenized_sentence: list):
    return list(filter(lambda x: x not in stop_words, tokenized_sentence))

texts = list(filter(lambda x: len(x) > 5, [delete_stopwords(text) for text in texts]))

In [218]:
print(f"Number of Emails: {len(texts)}")

Number of Emails: 4061


In [219]:
from gensim import corpora
corpora_dict = corpora.Dictionary(texts)


In [209]:
corpora_dict

<gensim.corpora.dictionary.Dictionary at 0x7f30e5888d50>

In [220]:
corpus = [corpora_dict.doc2bow(text) for text in texts]

In [5]:
from gensim.models import LsiModel

In [6]:
model_lsi = LsiModel(corpus, id2word=corpora_dict.id2token, num_topics=10)

NameError: ignored

In [None]:
str_topics = [topic_w for topic_number, topic_w in model_lsi.print_topics()]
str_topics_split = list(map(lambda x: x.split("+"), str_topics))
str_topics_split = [list(map(lambda x: x.split("*")[1].strip()[1:-1], elem)) for elem in str_topic]

In [214]:
from gensim import matutils
from gensim.models.ldamodel import LdaModel

data = data[pd.notnull(data['ExtractedBodyText'])]

In [213]:
model_lda = LdaModel(corpus, passes=20, num_topics=10, id2word=corpora_dict.id2token)

ValueError: ignored

# We can then extract most frequent topics:

# Get Individual Topics