# Topic modelling using BERTopic

## Libraries/data required

In [78]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

In [79]:
# Load the cleaned article summaries; the "date" column is parsed into datetimes while reading
df = pd.read_csv(
    "data/articles_summary_cleaned.csv",
    parse_dates=["date"],
)
print(df.shape) # (number of rows, number of columns)

# Collect every article summary in a plain Python list; this is what the topic model is fitted on
docs = df["summary"].to_list()

# Preview the first 5 rows
df.head()

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [80]:
# Fitting is expensive, so a previously saved model is reused when one exists on disk
model_path = "southsudan_model"

if not os.path.exists(model_path):
    # No saved model yet: initialize BERTopic, fit it on the article summaries and store the result
    bertopic = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
    )
    bertopic.fit_transform(docs)
    bertopic.save(model_path)
else:
    # A saved model is available: load it instead of fitting again
    bertopic = BERTopic.load(model_path)

In [81]:
# BERTopic chains several components, some of which are stochastic, which hinders the reproducibility of the model.
# To counter this you can, for example, fix the random state of the dimensionality reduction step via the following lines,
# or explore a different approach.

#from bertopic import BERTopic
#from umap import UMAP

#umap_model = UMAP(n_neighbors=15, n_components=5, 
#                  min_dist=0.0, metric='cosine', random_state=42)
#topic_model = BERTopic(umap_model=umap_model)

## Interactive visualization of the vector space

In this visualization (disabled in the cell below because it takes a while to render), documents with related topics lie close together in the space.

In [82]:
# We don't need this visualization

# bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

In [83]:
# Keywords used further down in the notebook to select relevant topics

# Single keywords: each one is looked up on its own and gets its own column
list_keywords = [
    "hunger",
    "refugees",
    "conflict",
    "humanitarian",
]

# Keyword groups: every inner list is looked up as one group (the topics found for its
# keywords are pooled) - multiple groups can be tested at once!
multiple_keywords = [
    ["hunger", "refugees"],
    ["conflict", "humanitarian"],
]

### Creating smaller topics

Within our list of topics, we find topics that are semantically closest to the list of keywords as defined above.

**Feel free to change this approach!**

In [84]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve the top_n topics most relevant to the provided (list of) keyword(s)
    
    
    Parameters:
        bertopic_model: a (fitted) BERTopic model object; only its find_topics method is used
        
        keywords:   a string containing one or multiple keywords to match against,
                    
                    This can also be a list in the form of ['keyword(s)', 'keyword(s)', ...]
                    
                    In this case a maximum of top_n topics is looked up per list element, 
                    the results are pooled and the top_n most relevant topics are kept.
                    
                    !!!
                    Take care that this method only considers the relevancy per list element 
                    and not the relevancy to the combined list of keywords.
                    
                    In other words, topics that appear in the output might be significantly related to a 
                    particular element in the list of keywords but not so to any other element, 
                    
                    while topics that do not appear in the output might be significantly related to the 
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!
                    
        top_n: an integer indicating the number of desired relevant topics to be retrieved
        
        
        Return: a list of at most top_n (topic_id, relevancy) tuples, sorted on descending relevancy.
                Every topic appears once; if it was found for several list elements its highest
                relevancy is reported.
    '''
    
    if isinstance(keywords, str): keywords = [keywords] # If a single string is provided convert it to list type
    
    best_relevancy = dict() # Maps topic_id -> highest relevancy seen for that topic
    
    for keyword in keywords: # Iterate through list of keywords
        
        # Find the top n number of topics related to the current keyword(s)
        topics = bertopic_model.find_topics(keyword, top_n = top_n)
        
        # topics[0] = topic ids, topics[1] = relevancies (same order)
        for topic_id, relevancy in zip(topics[0], topics[1]):
            
            # A topic can be found for several keywords, only remember its greatest relevancy
            if topic_id not in best_relevancy or relevancy > best_relevancy[topic_id]:
                best_relevancy[topic_id] = relevancy
    
    
    # Sort the unique topics on DESCENDING ORDER of relevancy
    relevant_topics = sorted(best_relevancy.items(), key=lambda x: x[1], reverse=True)
    
    # Honour the requested number of topics (previously this was hard-coded to 10)
    return relevant_topics[:top_n]

### Putting the function to use

The next two code cells flag articles based on the lists defined above.
The first cell finds, for each keyword separately, the 10 topics most relevant to that keyword and adds a boolean column to the dataframe that marks the articles assigned to one of those topics.
The second cell does the same for each group of keywords: the topics found for the individual keywords in a group are pooled and the 10 most relevant ones are kept.

In [85]:
# Automatic process!
# For each keyword separately: find the 10 topics most relevant to the keyword and add a
# boolean column to df that marks the articles assigned to one of those topics.

# Topic overview (Count, Name, Representation, Representative Docs) indexed by topic id.
# It does not depend on the keyword, so it is built once instead of once per iteration.
topic_info = bertopic.get_topic_info().set_index('Topic')

for keyword in list_keywords:
    relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=keyword, top_n=10)

    topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

    for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
        print(topic_id, relevancy)

    relevant_ids = set(topic_ids) # Set for fast membership checks over all articles
    df[keyword] = [t in relevant_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

    # View the topic overview for the relevant topics. A bare expression inside a loop is
    # evaluated and thrown away, so the table has to be displayed explicitly
    # (display is provided by the notebook kernel).
    display(topic_info.loc[topic_ids])

df.head()

16 0.5067806
113 0.45359278
100 0.42132196
81 0.39846724
241 0.3515425
21 0.33424088
147 0.32523632
201 0.3129242
37 0.28182873
240 0.26837444
17 0.68633187
14 0.6556574
245 0.64660656
140 0.6304334
151 0.60843873
29 0.57537127
198 0.5341381
72 0.52255523
68 0.51275575
133 0.5122601
247 0.45804715
66 0.4092132
238 0.38586372
253 0.37990516
0 0.37799144
229 0.375211
30 0.37496358
15 0.37439907
65 0.37156507
149 0.36643463
72 0.6509177
216 0.6434834
37 0.6355741
177 0.6115271
48 0.6110923
29 0.6021605
140 0.6011702
232 0.5888386
198 0.5865686
91 0.5836561


Unnamed: 0,summary,date,location_article,lat,lng,hunger,refugees,conflict,humanitarian
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125,False,False,False,False
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,False,False,False,False
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,False,False,False,False
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,False,False,False,False
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125,False,False,False,False


In [86]:
# For every keyword group: pool the topics found per keyword, keep the 10 most relevant ones
# and flag the articles assigned to one of them in a column named after the group.
# Only the topic overview of the last group is shown below the cell.
for keyword_group in multiple_keywords:
    relevant_topics = get_relevant_topics(bertopic_model=bertopic, keywords=keyword_group, top_n=10)

    # The column is named after the keywords of the group, e.g. "hunger, refugees"
    column_name = ", ".join(keyword_group)
    print(column_name)

    # Keep only the topic ids of the (topic_id, relevancy) tuples
    topic_ids = [topic_id for topic_id, _ in relevant_topics]

    # One line per topic: "<topic_id> <relevancy>"
    for topic_and_relevancy in relevant_topics:
        print(*topic_and_relevancy)

    # True for every article whose assigned topic is one of the relevant topics
    df[column_name] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# Count, Name, Representation, and Representative Docs of the relevant topics (last group only)
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

hunger, refugees
17 0.68633187
14 0.6556574
245 0.64660656
140 0.6304334
151 0.60843873
29 0.57537127
198 0.5341381
72 0.52255523
68 0.51275575
133 0.5122601
conflict, humanitarian
72 0.6509177
216 0.6434834
37 0.6355741
177 0.6115271
48 0.6110923
29 0.6021605
140 0.6011702
232 0.5888386
198 0.5865686
91 0.5836561


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
72,50,72_humanitarian_pibor_jonglei_aid,"[humanitarian, pibor, jonglei, aid, affected, ...",[The article discusses an emergency meeting ca...
216,14,216_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, mr, ...",[The article discusses a press briefing with t...
37,79,37_million_aid_humanitarian_funding,"[million, aid, humanitarian, funding, billion,...",[The article discusses the $600 million aid pl...
177,19,177_european_eu_million_commissions,"[european, eu, million, commissions, humanitar...",[The article discusses the ongoing armed confl...
48,64,48_workers_aid_humanitarian_worker,"[workers, aid, humanitarian, worker, maban, ki...",[The article discusses the disappearance of si...
29,91,29_displaced_idps_people_internally,"[displaced, idps, people, internally, malakal,...",[The article discusses the high number of inte...
140,26,140_refugees_unhcr_funding_refugee,"[refugees, unhcr, funding, refugee, unhcrs, hu...",[The article discusses the appeal made by UNHC...
232,12,232_ukraine_canadian_peace_aid,"[ukraine, canadian, peace, aid, overshadowed, ...",[The article discusses the commitment of warri...
198,17,198_civilians_un_unmiss_bases,"[civilians, un, unmiss, bases, displaced, refu...",[The article discusses new fighting in South S...
91,41,91_red_cross_icrc_crescent,"[red, cross, icrc, crescent, ifrc, medical, ca...",[The article discusses how the Governor of Jon...


In [87]:
# Show the dataframe, which now carries one boolean column per single keyword and
# one boolean column per keyword group (the notebook truncates the view of large frames)
df

Unnamed: 0,summary,date,location_article,lat,lng,hunger,refugees,conflict,humanitarian,"hunger, refugees","conflict, humanitarian"
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.571250,False,False,False,False,False,False
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,False,False,False,False,False,False
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,False,False,False,False,False,False
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,False,False,False,False,False,False
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.571250,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023-04-26,Maiwut Primary Health Care Centre,8.606200,33.924100,False,False,False,False,False,False
18516,The article discusses the bombing and forced e...,2023-04-26,Khartoum,15.500654,32.559899,False,False,False,False,False,False
18517,The article discusses how Prime Minister Abiy ...,2023-04-23,Addis Ababa,8.980603,38.757761,False,False,False,False,False,False
18518,The article discusses the collapse of a commer...,2023-04-17,Kampala International University,0.294360,32.603970,False,False,False,False,False,False


In [88]:
# Start again from the untouched source file, so the export holds the original columns plus
# the single-keyword flags only (the keyword-group columns are not in list_keywords and are
# therefore not exported)
original_df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])

# One row of flags per distinct summary. The join key is free text: without this step every
# summary that occurs more than once would be matched against all of its copies and the
# merge would multiply rows. For repeated summaries the flags of the first occurrence are kept.
keyword_flags = df[["summary"] + list_keywords].drop_duplicates(subset="summary")

# Combine article summaries with the newly created features
df = original_df.merge(
    keyword_flags,
    how="left",
    on="summary",
    validate="many_to_one", # Fail loudly if the right side could still duplicate rows
)

df.to_csv("data/articles_topics.csv", index=False) # Save DataFrame to articles_topics.csv

df.head()

Unnamed: 0,summary,date,location_article,lat,lng,hunger,refugees,conflict,humanitarian
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125,False,False,False,False
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396,False,False,False,False
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824,False,False,False,False
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979,False,False,False,False
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125,False,False,False,False


In [89]:
# Build a mask that is True for the rows in which every keyword column is False,
# i.e. the articles that are not linked to any of the keywords
bool_mask = df[list_keywords].eq(False).all(axis=1)

# Keep only those uncategorized articles
filtered_df = df[bool_mask]

# Print length of original dataframe and filtered dataframe with rows that are not categorized
print(str(len(df)) + " total articles")
print(str(len(filtered_df)) + " articles do not have any relation to any of the keywords")

# manual check
# print(len(df[(df["hunger"]==False) & (df["refugees"] == False) & (df["humanitarian"] == False) & (df["conflict"] == False)]))


18520 total articles
16473 articles do not have any relation to any of the keywords
