# Top2Vec Model Build

In [3]:
from top2vec import Top2Vec

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [4]:
import pandas as pd

In [3]:
def df_to_list(speech_df):
    dflist = speech_df['sentence'].astype(str).tolist()
    newlist = []

    for text in dflist:
        newlist.append(text)
    return newlist

In [57]:
documents = df_to_list(pd.read_pickle("speech_documents.pkl"))

In [58]:
len(documents)

2293

In [59]:
print(documents[:500])



### Tokenized into Sentences

Our **documents** object is already tokenized into sentences as we transformed each speech before we combined them. Therefore we can proceed to run the model with this as our documents input.

In [60]:
model = Top2Vec(documents=documents, speed='deep-learn', workers=5)

2023-09-16 16:23:36,900 - top2vec - INFO - Pre-processing documents for training
2023-09-16 16:23:37,075 - top2vec - INFO - Creating joint document/word embedding
2023-09-16 16:24:19,372 - top2vec - INFO - Creating lower dimension embedding of documents
2023-09-16 16:24:27,575 - top2vec - INFO - Finding dense areas of documents
2023-09-16 16:24:27,697 - top2vec - INFO - Finding topics


#### Model Understanding

In [61]:
# number of topics Top2Vec has found in the data
print(f"Number of topics: {model.get_num_topics()}")

Number of topics: 25


In [62]:
# number of documents most similar to each topic
topic_words, word_scores, topic_nums = model.get_topics(17)
# topic_words: for each topic, the top 50 words are returned, in order of semantic similarity to topic
# word_scores: for each topic, the cosine similafrity scores of the top 50 words to the topic are returned
# topic_nums: the unique index of every topic will be returned

In [63]:
[print(num, each, '\n') for num, each in zip(topic_nums, topic_words)]

0 ['east' 'middle' 'forces' 'do' 'what' 'democracy' 'government' 'when'
 'will' 'make' 'afghan' 'help' 'these' 'work' 'support' 'are' 'end' 'an'
 'continue' 'must' 'just' 'people' 'all' 'its' 'new' 'that' 'so' 'be'
 'them' 'it' 'military' 'more' 'as' 'with' 'up' 'their' 're' 'security'
 'nations' 'they' 'freedom' 'peace' 'we' 'american' 'the' 'to' 'can' 'men'
 'out' 'not'] 

1 ['iraqis' 'nations' 'iran' 'its' 'must' 'other' 'iraq' 'at' 'them' 'work'
 'all' 'from' 'after' 'by' 'will' 'the' 'up' 'as' 'america' 'end' 'are'
 'government' 'out' 'help' 'mission' 'with' 'it' 'than' 'continue' 'now'
 'be' 'more' 'had' 'have' 'terrorists' 'and' 'world' 'this' 'region'
 'peace' 'just' 'would' 'to' 'or' 'their' 'last' 'security' 'there' 'me'
 'when'] 

2 ['iraqis' 'america' 'terrorists' 'region' 'are' 'when' 'not' 'its' 'them'
 'what' 'up' 'the' 'be' 'people' 'as' 'must' 'work' 'they' 're' 'that'
 'afghan' 'but' 'forces' 'to' 'by' 'after' 'or' 'at' 'their' 'make' 'can'
 'these' 'government' 'this

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

### Tokenized into Words

The documentation for Top2Vec states that you don't need to remove stopwords or perform any transformation beyond the tokenization. However, as we can see in the **topic_words**, there are an awful amount of useless words appearing in the topics. We'll run the model again but this time tokenize our documents variable into words, and then perform some pre-processing to remove stopwords and words less than 3 characters in length.

Tokenize **documents** list into list of word strings

In [14]:
word_tokens = ' '.join(documents).split()

In [16]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [24]:
word_docs = []
for word in word_tokens:
    if word not in stop_words and len(word) >= 3:
        word_docs.append(word)
print(word_docs)



In [25]:
print(len(word_tokens))
print(len(word_docs))

43092
23563


As we can see, we've reduced the number of words in our documents by about 40%. We'll re-run the model using this new documents object and see how it performs. We may have to tune the parameters accordingly as well.

In [47]:
model = Top2Vec(documents=word_docs, speed='deep-learn', workers=4)

2023-09-16 15:41:07,906 - top2vec - INFO - Pre-processing documents for training


2023-09-16 15:41:08,161 - top2vec - INFO - Creating joint document/word embedding
2023-09-16 15:45:57,922 - top2vec - INFO - Creating lower dimension embedding of documents
2023-09-16 15:46:12,986 - top2vec - INFO - Finding dense areas of documents
2023-09-16 15:46:17,054 - top2vec - INFO - Finding topics


In [48]:
print(f"Number of topics: {model.get_num_topics()}")

Number of topics: 31


In [50]:
topic_words, word_scores, topic_nums = model.get_topics(31)

In [51]:
[print(num, each, '\n') for num, each in zip(topic_nums, topic_words)]

0 ['we' 'they' 'our' 'and' 'that' 'support' 'people' 'military' 'must' 'the'
 'help' 'mission' 'iran' 'iraq' 'new' 'but' 'afghan' 'world' 'continue'
 'america' 'would' 'war' 'united' 'years' 'many' 'make' 'every' 'freedom'
 'afghanistan' 'americans' 'country' 'men' 'american' 'nations' 'women'
 'nation' 'clear' 'states' 'it' 'time' 'democracy' 'middle' 'security'
 'region' 'iraqi' 'terrorists' 'iraqis' 'last' 'government' 'peace'] 

1 ['we' 'they' 'our' 'that' 'the' 'time' 'last' 'security' 'must' 'people'
 'nations' 'united' 'and' 'military' 'iraq' 'new' 'freedom' 'support'
 'afghanistan' 'americans' 'world' 'years' 'country' 'iraqi' 'america'
 'also' 'many' 'iraqis' 'men' 'women' 'region' 'afghan' 'make' 'nation'
 'clear' 'peace' 'iran' 'it' 'qaeda' 'help' 'end' 'mission' 'middle'
 'american' 'democracy' 'would' 'states' 'terrorists' 'one' 'forces'] 

2 ['ve' 're' 'east' 'troops' 'one' 'this' 'qaeda' 'forces' 'government'
 'work' 'end' 'peace' 'it' 'also' 'terrorists' 'middle' 'democ

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

### Label Topics 

We'll now inspect the top 10 scored documents (sentences) in our documents object, in order to try and attribute labels to the topics. Of course, all of the speeches are on Middle Eastern conflict and Security, so we are expecting some overlap. Hopefully though we can still find some useful topics have been returned by the model. 

In [154]:
topic_labels = {0:'Regime change', 1:'Military Support', 2:'Fighting Al Qaeda', 3:'Democracy Promotion', 4:'', 5:'', 6:'',
                7:'Bolstering Iraqi Security Forces', 8:'',9:'', 10:'Al Qaeda Downturn', 11:'',12:'Middle East Security',
                13:'',14:'Freedom', 15:'',16:'',17:'Al Qaeda',18:'Middle East Terrorism Threat', 19:'',20:'',21:'',
                22:'', 23:'',24:'Religious Liberty'}

In [233]:
docs, document_scores, document_ids = model.search_documents_by_topic(topic_num=24, num_docs=10)

In [234]:
for doc, score, doc_id in zip(docs, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Document: 1630, Score: 0.9612414836883545
-----------
Successful societies guarantee religious liberty -- the right to serve and honor God without fear of persecution.
-----------

Document: 1593, Score: 0.9551823735237122
-----------
The Taliban promised religious purity and national pride.
-----------

Document: 1363, Score: 0.9515417814254761
-----------
a policy basedon blind hope and ideology instead of fact and reality.
-----------

Document: 1099, Score: 0.9419222474098206
-----------
Politics and human rights; economic reform.
-----------

Document: 771, Score: 0.9342107772827148
-----------
In today’s conversation, Prime Minister Maliki and I agreed that a meeting of the Higher Coordinating Committee of the Strategic Framework Agreement will convene in the coming weeks.
-----------

Document: 840, Score: 0.8824529051780701
-----------
And next May, in Chicago, we will host a summit with our NATO allies and partners to shape the next phase of this transition.
-----------

Docum

In [155]:
model.save("US_Speech_Top2Vec_Model")

### Explore Vectors of Selected Topics

In [182]:
model.topic_vectors

array([[-0.0275561 , -0.03603238,  0.03253402, ..., -0.07307215,
         0.05021439, -0.09991232],
       [-0.01931099, -0.00019172,  0.03184908, ...,  0.00018207,
         0.02568965, -0.07075287],
       [-0.02031641,  0.0253384 ,  0.03788157, ...,  0.03014073,
         0.04073928, -0.07761647],
       ...,
       [-0.05756361, -0.04794477, -0.00302538, ...,  0.15022047,
         0.02456175,  0.11949087],
       [ 0.03185358,  0.01691404, -0.0224064 , ...,  0.076126  ,
        -0.07556758,  0.04135645],
       [-0.02212702, -0.05136221,  0.02801345, ...,  0.00290943,
         0.05552179,  0.10939284]], dtype=float32)

In [184]:
topic_words, word_scores, topic_nums = model.get_topics(25)

In [194]:
import numpy as np

In [219]:
documents = df_to_list(pd.read_pickle("speech_documents.pkl"))

In [207]:
model.document_vectors[50]

array([ 2.63999440e-02,  1.02835082e-01, -7.90518709e-03, -5.77824051e-03,
        1.74285739e-03,  5.06902188e-02, -6.75052106e-02,  1.09504327e-01,
       -9.53719858e-03, -5.33011854e-02,  6.48470223e-03,  1.43550009e-01,
       -9.43158343e-02, -3.51412557e-02,  2.75663398e-02, -2.83158533e-02,
        4.14620638e-02, -4.67263423e-02,  1.01560410e-02, -2.41200998e-02,
       -6.08143732e-02,  3.54755856e-03,  6.85398802e-02, -3.44845131e-02,
       -5.44300936e-02, -7.64413103e-02, -5.97617999e-02,  7.98732564e-02,
       -7.06251189e-02, -6.29729107e-02,  3.07316985e-02, -8.02189931e-02,
       -7.21535906e-02,  6.02092221e-02,  1.12568187e-02,  1.90184042e-02,
        7.64991418e-02, -3.63779366e-02, -8.40827003e-02,  3.35457325e-02,
        8.92363340e-02, -9.20279622e-02, -3.56135145e-02, -8.89193714e-02,
        2.06674598e-02, -7.23130181e-02,  1.02013303e-02, -2.26229858e-02,
       -4.02518027e-02, -1.07738962e-02,  4.88193668e-02, -8.03536270e-03,
        6.56142011e-02,  

In [206]:
documents[0]

'We look forward to working with all who embrace genuine and inclusive democracy.'

Create a dictionary where each **key:value** pair is a document_id from our documents object and the value is a tuple consisting of the document (speech sentence) and an average of the vector array for the document. This will allow us to assess the proximity of each document to each topic vector.

In [222]:
my_dict = {}
for id, doc, array in zip(model.document_ids, documents, model.document_vectors):
    my_dict[id] = (doc, np.mean(array))  

In [223]:
my_dict

{0: ('Good afternoon.', -0.0032228532),
 1: ('Earlier today, I was briefed by our senior military and national security leaders on the status of the drawdown of U.S. forces and allied forces in Afghanistan.',
  -0.005607442),
 2: ('When I announced our drawdown in April, I said we would be out by September, and we’re on track to meet that target.',
  0.0012825995),
 3: ('Our military mission in Afghanistan will conclude on August 31st.',
  -0.00066184887),
 4: ('The drawdown is proceeding in a secure and orderly way, prioritizing the safety of our troops as they depart.',
  -0.0019012872),
 5: ('Our military commanders advised me that once I made the decision to end the war, we needed to move swiftly to conduct the main elements of the drawdown.',
  -0.00093411305),
 6: ('And in this context, speed is safety.', 0.0018695343),
 7: ('And thanks to the way in which we have managed our withdrawal, no one—no one U.S. forces or any forces have—have been lost.',
  0.0031365966),
 8: ('Conduct

In [227]:
print(len(model.topic_vectors))

25


In [232]:
for num, array in zip(range(0, 25), model.topic_vectors):
    print(num, np.mean(array))

0 -0.00941972
1 0.0011039017
2 -0.0031656679
3 -0.007956409
4 -0.002700576
5 -3.8446983e-06
6 0.0026560351
7 -0.0046274355
8 -0.0035376092
9 -0.009719916
10 0.0012907777
11 -0.0055226125
12 3.0223131e-05
13 0.0050038537
14 -0.0024638383
15 0.0069398847
16 -0.0022659623
17 0.0021853952
18 -0.007924185
19 -0.0021919655
20 -0.0005215232
21 0.008873677
22 0.004202013
23 0.007145505
24 0.0038838168


In [10]:
# these are our selected topics which I have interpreted. In total 11/24 topics can be summarized into a sensible topic label. We'll use these to assess our documents against
topic_labels = {0:('Regime change', -0.00941972), 1:('Military Support', 0.0011039017), 2:('Fighting Al Qaeda', -0.0031656679), 3:('Democracy Promotion', -0.007956409),
                7:('Bolstering Iraqi Security Forces', -0.0046274355), 10:('Al Qaeda Downturn', 0.0012907777), 12:('Middle East Security', 3.0223131e-05),
                14:('Freedom', -0.0024638383), 17:('Al Qaeda',0.0021853952), 18:('Middle East Terrorism Threat', -0.007924185), 
                24:('Religious Liberty',0.0038838168)}

## Document:Speech Vector Mapping

Now we need to map each document (speech sentence) back to it's original speech. This will be in the form of a dataframe. Eventually our DataFrame object will consist of the following features:
* document_id (index)
* document (sentence in speech as tokenized)
* name of speech
* name of president
* date of speech (01/09/2021)
* similarity score to each topic.....

**Cosine Similarity** measures the cosine of the angle between two vectors. This will allow us to assess the similartiy.

In order for us to calculate cosine similarity between two vectors, they should have the same **dimensionality**, meaning they should be of equal size.

### Check Dimensionality of Topic and Document Vectors

In [236]:
len(model.topic_vectors[0])

300

In [240]:
len(model.topic_vectors)

25

In [241]:
mylist = []
for vec in model.topic_vectors:
    if len(vec) == 300:
        mylist.append(1)
sum(mylist)

25

So as we can see, each of our topic vectors have a length of 300. If any of of our document vectors are not of (300,) dimensionality then we'll need to transform them by padding them out with zeros so they have the same dimensionality as our topic vectors.

In [244]:
mylist = []
for array in model.document_vectors:
    if len(array) == 300:
        mylist.append(1)
sum(mylist)

2293

It looks like Top2Vec produces document vectors and topic vectors of equal size! This saves us the trouble of reducing dimensioality or padding - happy days!

### Find Cosine Similarity between document and topic

In [6]:
model = Top2Vec.load("US_Speech_Top2Vec_Model")

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
document_vectors = {}
for id, vec in zip(model.document_ids, model.document_vectors):
    document_vectors[id] = vec

In [11]:
topic_vectors = {}
for topic_id, topic,vec in zip( 
    list(topic_labels.keys()), # selected topic ids , 
    [i[0] for i in list(topic_labels.values())], # topic labels for selected topics
    model.topic_vectors):
    topic_vectors[topic_id] = (topic, vec) 

In [12]:
# create a dataframe to store proximity scores
df = pd.DataFrame(index=document_vectors.keys(), columns=[i[0] for i in topic_vectors.values()])

In [13]:
# Calculate and fill in the proximity scores
for doc_id, doc_vector in document_vectors.items():
    for topic_name, topic_vector in topic_vectors.values():
        # calculate cosine similarity between document and topic vectors
        similarity_score = cosine_similarity([doc_vector], [topic_vector])[0][0]
        df.at[doc_id, topic_name] = similarity_score

In [14]:
df.head()

Unnamed: 0,Regime change,Military Support,Fighting Al Qaeda,Democracy Promotion,Bolstering Iraqi Security Forces,Al Qaeda Downturn,Middle East Security,Freedom,Al Qaeda,Middle East Terrorism Threat,Religious Liberty
0,0.091866,0.020608,-0.013549,0.012241,0.129675,0.121716,-0.019216,0.057875,-0.051857,0.021145,0.035125
1,0.462341,0.170909,0.502308,0.706292,0.296541,0.224687,-0.120064,0.599147,-0.025133,0.397244,0.183778
2,0.164941,0.084289,0.384271,0.428225,0.433138,0.283634,0.486977,0.124019,0.236934,0.134657,0.774891
3,0.294849,0.466644,0.720763,0.41306,0.481794,-0.027502,0.32805,0.036519,0.448192,0.367726,0.666796
4,0.166568,0.59265,0.154344,-0.28015,0.251566,0.446406,-0.209066,0.491736,-0.196509,0.079829,-0.286824


In [15]:
df.tail()

Unnamed: 0,Regime change,Military Support,Fighting Al Qaeda,Democracy Promotion,Bolstering Iraqi Security Forces,Al Qaeda Downturn,Middle East Security,Freedom,Al Qaeda,Middle East Terrorism Threat,Religious Liberty
2288,-0.219191,0.230108,0.315848,-0.079737,0.105131,-0.225765,0.63331,-0.37586,0.645601,0.112849,0.605423
2289,0.306164,-0.045996,-0.165452,-0.206985,0.309835,-0.464891,-0.082501,-0.556862,-0.061615,-0.129418,-0.189807
2290,-0.002578,0.268779,0.202175,-0.091764,0.431797,0.178318,-0.245486,-0.216041,-0.225712,-0.196265,0.278301
2291,0.391975,-0.124452,-0.072403,0.098981,0.151401,-0.296112,0.194545,-0.159157,-0.035106,-0.047559,-0.172518
2292,0.207835,-0.087766,-0.082306,-0.089231,0.196359,-0.583278,-0.252581,-0.665902,-0.116711,-0.177803,-0.188126


Awesome! Our analysis is really starting to take shape. Let's check that there are no missing values, noisy data or incorrect data types, etc. 

In [16]:
df.describe()

Unnamed: 0,Regime change,Military Support,Fighting Al Qaeda,Democracy Promotion,Bolstering Iraqi Security Forces,Al Qaeda Downturn,Middle East Security,Freedom,Al Qaeda,Middle East Terrorism Threat,Religious Liberty
count,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0
unique,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0,2293.0
top,0.091866,0.020608,-0.013549,0.012241,0.129675,0.121716,-0.019216,0.057875,-0.051857,0.021145,0.035125
freq,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
df.isnull().sum()

Regime change                       0
Military Support                    0
Fighting Al Qaeda                   0
Democracy Promotion                 0
Bolstering Iraqi Security Forces    0
Al Qaeda Downturn                   0
Middle East Security                0
Freedom                             0
Al Qaeda                            0
Middle East Terrorism Threat        0
Religious Liberty                   0
dtype: int64