# Top2Vec Model Build

In [1]:
from top2vec import Top2Vec

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [3]:
import pandas as pd

In [2]:
def df_to_list(speech_df):
    dflist = speech_df['sentence'].astype(str).tolist()
    newlist = []

    for text in dflist:
        newlist.append(text)
    return newlist

In [4]:
documents = df_to_list(pd.read_pickle("speech_documents.pkl"))

In [6]:
len(documents)

1957

In [7]:
print(documents[:500])



### Tokenized into Sentences

Our **documents** object is already tokenized into sentences as we transformed each speech before we combined them. Therefore we can proceed to run the model with this as our documents input.

In [28]:
model = Top2Vec(documents=documents, speed='deep-learn', workers=5)

2023-09-14 13:17:21,138 - top2vec - INFO - Pre-processing documents for training


2023-09-14 13:17:21,399 - top2vec - INFO - Creating joint document/word embedding
2023-09-14 13:18:01,860 - top2vec - INFO - Creating lower dimension embedding of documents
2023-09-14 13:18:08,582 - top2vec - INFO - Finding dense areas of documents
2023-09-14 13:18:08,664 - top2vec - INFO - Finding topics


#### Model Understanding

In [29]:
# number of topics Top2Vec has found in the data
print(f"Number of topics: {model.get_num_topics()}")

Number of topics: 17


In [31]:
# number of documents most similar to each topic
topic_words, word_scores, topic_nums = model.get_topics(17)
# topic_words: for each topic, the top 50 words are returned, in order of semantic similarity to topic
# word_scores: for each topic, the cosine similafrity scores of the top 50 words to the topic are returned
# topic_nums: the unique index of every topic will be returned

In [32]:
[print(num, each, '\n') for num, each in zip(topic_nums, topic_words)]

0 ['when' 'have' 'war' 'if' 'our' 'nation' 'many' 'do' 'time' 'women' 'them'
 'on' 'its' 'what' 've' 'american' 'would' 'military' 'more' 'some' 'not'
 'your' 'country' 'you' 'are' 'from' 'fight' 'region' 'over' 'iraq' 'or'
 'america' 'who' 'but' 'terrorists' 'so' 'afghanistan' 'can' 'it' 'be'
 'those' 'forces' 'they' 'own' 'by' 'been' 'us' 'to' 'he' 'there'] 

1 ['your' 'do' 'you' 'my' 'when' 'nation' 'peace' 'them' 'those' 'country'
 'they' 'who' 'have' 'we' 'been' 'so' 'what' 'on' 'region' 'terrorists'
 'one' 'by' 'military' 'be' 're' 'many' 'over' 'security' 'iran' 'can'
 'or' 'our' 'if' 'that' 'all' 'iraqis' 'america' 'has' 'iraqi' 'were'
 'also' 'for' 'his' 'want' 'people' 'freedom' 'help' 'forces' 'some'
 'states'] 

2 ['iraq' 'those' 'there' 'one' 'is' 'over' 'been' 'what' 'his' 'from'
 'afghanistan' 'that' 'not' 'in' 'has' 'we' 'an' 'america' 'when' 'no'
 'now' 'us' 'my' 'if' 'be' 'new' 'more' 'but' 'was' 'would' 'help' 'own'
 'it' 'so' 'forces' 'nation' 'to' 'at' 'by' 'this' 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

### Tokenized into N-grams

### Tokenized into Words