# Topic modelling using BERTopic

## Libraries/data required

In [1]:
# IMPORTS
from bertopic import BERTopic
from umap import UMAP
import pandas as pd
import os
from rake_nltk import Rake
from tqdm.notebook import tqdm
import nltk
import yake
from ipywidgets import FloatProgress
from collections import Counter
import itertools
import ast
import re

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Read the data and perform preprocessing
df = pd.read_csv("dataset_with_keywords.csv", parse_dates=["date"]) # Read data into 'df' dataframe
docs = df["summary"].tolist() # Create a list containing all article summaries

## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [3]:
if os.path.exists('southsudan_model'):
    bertopic = BERTopic.load('southsudan_model')
else:
    #Due to the modularity of the model, there is a lot of randomness that hinders reproducibiity of the model.
    #To fight this, you can for example set random state in the dimensionality reduction step via the following lines 
    #or explore a different approach

    umap_model = UMAP(n_neighbors=15, n_components=5, 
                 min_dist=0.0, metric='cosine', random_state=42)
    bertopic = BERTopic(language="english", calculate_probabilities=True, verbose=True, umap_model=umap_model) # Initialize the BERTopic model
    bertopic.fit_transform(docs) # Fit the model to the list of article summaries
    bertopic.save("southsudan_model") # Save the trained model as "southsudan_model"

 # Helper functions

In [4]:
def yake_keyword(dataframe):
    """Applies the yake library to a dataframe. Yake is a library that applies keyword extraction.
    
    Input: 
    - dataframe: A dataframe consisting out of a column with text that needs keyword extraction

    Output:
    - dataframe: Dataframe containing 2 extra columns (paragraph & summary) with the keywords determined by yake.
    
    """
    # Implement a progress bar in the cell to show the progress.
    tqdm.pandas()
    # Apply the keyword extractor function from the NLP library yake.
    language = 'en'
    max_ngram_size = 2
    deduplication_threshold = 0.9
    numOfKeywords = 3  # <- Multiple keywords
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size,
                                                dedupLim=deduplication_threshold,
                                                top=numOfKeywords, features=None)

    extractor = lambda x: custom_kw_extractor.extract_keywords(x)
    # Apply the keyword extraction on both the summaries and the whole article content
    df['paragraphs_3_keywords_2gram_summary'] = df['summary'].progress_apply(extractor)
    df['keywords_paragraphs'] = df['paragraphs'].progress_apply(extractor)
    return

In [6]:
def rake_extractor(text):
    """ Determines the keywords."""
    r = Rake()
    nltk.download('stopwords')
    nltk.download('punkt')
    r.extract_keywords_from_text(text)
    return list(r.get_word_degrees().keys())

def rake_keywords(dataframe):
        """Applies the rake library to a dataframe. Rake is a library that applies keyword extraction.
    
    Input: 
    - dataframe: A dataframe consisting out of a column with text that needs keyword extraction

    Output:
    - dataframe: Dataframe containing 2 extra columns (paragraph & summary) with the keywords determined by rake.
    
    """
    # Apply the extractor function with a progress bar to the 'summary' column
    tqdm.pandas(desc="Extracting Keywords from 'summary'")
    dataframe['summary_rake_keywords'] = dataframe['summary'].progress_apply(rake_extractor)

    # Apply the extractor function with a progress bar to the 'paragraphs' column
    tqdm.pandas(desc="Extracting Keywords from 'paragraphs'")
    dataframe['paragraphs_rake_keywords'] = dataframe['paragraphs'].progress_apply(rake_extractor)
    return

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 21)

In [15]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, top_n):
    '''
    Retrieve a list of the top n number of relevant topics to the provided (list of) keyword(s)
    
    
    Parameters:
        bertopic_model: a (fitted) BERTopic model object
        
        keywords:   a string containing one or multiple keywords to match against,
                    
                    This can also be a list in the form of ['keyword(s)', keyword(s), ...]
                    
                    In this case a maximum of top_n topics will be found per list element 
                    and subsetted to the top_n most relevant topics.
                    
                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s) 
                    and not the relevancy to the combined list of keywords.
                    
                    In other words, topics that appear in the output might be significantly related to a 
                    particular element in the list of keywords but not so to any other element, 
                    
                    while topics that do not appear in the output might be significantly related to the 
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!
                    
        top_n: an integer indicating the number of desired relevant topics to be retrieved
        
        
        Return: a list of the top_n (or less) topics most relevant to the (list of) provided keyword(s)
    '''
    
    if type(keywords) is str: keywords = [keywords] # If a single string is provided convert it to list type
    
    relevant_topics = list() # Initilize an empty list of relevant topics
    
    for keyword in keywords: # Iterate through list of keywords
        
        # Find the top n number of topics related to the current keyword(s)
        topics = bertopic_model.find_topics(keyword, top_n = top_n)
        
        # Add the topics to the list of relevant topics in the form of (topic_id, relevancy)
        relevant_topics.extend(
            zip(topics[0], topics[1]) # topics[0] = topic_id, topics[1] = relevancy
        )
    
    
    relevant_topics.sort(key=lambda x: x[1]) # Sort the list of topics on ASCENDING ORDER of relevancy
    
    # Get a list of the set of unique topics (with greates relevancy in case of duplicate topics)
    relevant_topics = list(dict(relevant_topics).items())
    
    
    relevant_topics.sort(key=lambda x: x[1], reverse=True) # Now sort the list of topics on DESCENDING ORDER of relevancy
    
    return relevant_topics[:10] # Return a list of the top_n unique relevant topics

In [None]:
# Apply rake and yake to a dataframe and store it.

# df = rake_keywords(df)
# df = yake_keyword(df)
# df.to_csv('dataset_with_keywords.csv')

# Run from here if dataset already contains keywords from yake and rake!
## Determination of keywords

Top Rake keywords with the summarized articles

In [8]:
# --------------- You can only run this line once after you initialized df as the dataframe. --------------
df['summary_rake_keywords'] = df['summary_rake_keywords'].apply(ast.literal_eval)
# ----------------------------------------------------------------------------------------------------------

# Iterate through the DataFrame and print each item in the lists
keywords_list = []
for index, row in df.iterrows():
    for item in row['summary_rake_keywords']:
        keywords_list.append(item)

# Count the keywords and sort them using the Counter library
counts = Counter(keywords_list)

# Create a dataframe to sort the keywords more easily
keywords_rake_summary = pd.DataFrame.from_dict(counts, orient='index').reset_index()
keywords_rake_summary.rename( columns={0 :'values'}, inplace=True )
keywords_rake_summary.sort_values(by='values', ascending=False)

Unnamed: 0,index,values
0,article,18500
1,discusses,18499
6,south,16898
7,sudan,16534
151,government,6593
...,...,...
16888,sympathize,1
16889,predictions,1
16891,destabilized,1
16895,furthering,1


Top rake keywords with the paragraphs from the articles

In [9]:
# RAKE paragraphs
# Iterate through the DataFrame and print each item in the lists

# --------------- You can only run this line once after you initialized df as the dataframe. --------------
df['paragraphs_rake_keywords'] = df['paragraphs_rake_keywords'].apply(ast.literal_eval)
# ----------------------------------------------------------------------------------------------------------

keywords_list = []
for index, row in df.iterrows():
    for item in row['paragraphs_rake_keywords']:
        keywords_list.append(item)

# Count the keywords and sort them using the Counter library
counts = Counter(keywords_list)

# Create a dataframe to sort the keywords more easily
keywords_rake_paragraphs = pd.DataFrame.from_dict(counts, orient='index').reset_index()
keywords_rake_paragraphs.rename( columns={0 :'values'}, inplace=True )
keywords_rake_paragraphs.sort_values(by='values', ascending=False)

Unnamed: 0,index,values
4,sudan,17924
3,south,17761
15,said,13722
80,government,12767
51,people,12279
...,...,...
46939,faroqu,1
46937,martials,1
46934,marbles,1
13776,esheraya,1


Top yake keywords from the summary and paragraphs

In [10]:
# Add new columns to the dataframe of the keyword determination with YAKE
df_keywords_summary = pd.DataFrame(df['keywords_summary'])
df_keywords_paragraph = pd.DataFrame(df['keywords_paragraphs'])

# YAKE summary
df_keywords_summary['Extracted_Words'] = df_keywords_summary['keywords_summary'].apply(lambda x: re.search(r'[A-Z][a-z]+', x).group() if re.search(r'[A-Z][a-z]+', x) else "")
df_keywords_summary = pd.DataFrame.from_dict(df_keywords_summary[['Extracted_Words']].value_counts().to_dict(), orient='index').reset_index()
df_keywords_summary.rename(columns={0 :'values'}, inplace=True )

# YAKE paragraphs
# Create a dataframe to sort the keywords more easily
df_keywords_paragraph['Extracted_Words'] = df_keywords_paragraph['keywords_paragraphs'].apply(lambda x: re.search(r'[A-Z][a-z]+', x).group() if re.search(r'[A-Z][a-z]+', x) else "")
df_keywords_paragraph = pd.DataFrame.from_dict(df_keywords_paragraph[['Extracted_Words']].value_counts().to_dict(), orient='index').reset_index()
df_keywords_paragraph.rename(columns={0 :'values'}, inplace=True)

Based on the the top provided by the cells above, 14 words are chosen for this project.
The keywords are listed here below.

In [34]:
# Rake == generalish
chosen_keywords_rake_summary = ['government', 'president', 'peace', 'juba', 'conflict', 'security',
                            'violence', 'international', 'un', 'support', 'humanitarian', 'oil',
                            'war', 'machar']
                          
chosen_keywords_rake_paragraphs = ['government', 'people','juba','president', 'state', 'peace','security',
                                    'international','conflict', 'year', 'national', 'united', 'war', 'political']   

# Yake == location
chosen_keywords_yake_paragraphs = ['juba', 'abyei', 'president', 'machar', 'Uganda', 'jonglei', 'darfur',
                            'nile', 'minister', 'nuer', 'khartoum', 'government', 'police', 'ethiopia',]

chosen_keywords_yake_summary = ['president', 'jonglei','uganda', 'abyei', 'machar', 'united', 'bor', 'ethiopia'
                                 ,'republic', 'ababa', 'bentiu', 'malakal', 'kenya', 'unity']                      

In [35]:
def add_keywords_as_columns_to_dataframe(chosen_words_list_all_possibilities, dataframe):
    """Add the keywords generated by key word extraction method as a column to a dataframe 
    with a value containing True or False depending on wheteher the keyword relates to the article.

    Input:
    - chosen_words_list_all_possibilities: list of keywords

    Output:
    - dataframe
    """
    for item in tqdm(chosen_words_list_all_possibilities):
        print(item)
        # Get the top 10 topics related to the keywords 'hunger' and 'food insecurity'
        relevant_topics = get_relevant_topics(bertopic_model = bertopic, keywords=item, top_n=10)

        topic_ids = [el[0] for el in relevant_topics] # Create seperate list of topic IDs

        # for topic_id, relevancy in relevant_topics: # Print neat list of (topic_id, relevancy) tuples
        #     print(topic_id, relevancy)

        item = str([item])   
        dataframe[item] = [t in topic_ids for t in bertopic.topics_] # Add boolean column to df if topic in list of relevant topics

        # View the Count, Name, Representation, and Representative Docs for the relevant topics
        # bertopic.get_topic_info().set_index('Topic').loc[topic_ids]
    return dataframe

# From keywords to dataframe ready for OLS

In [36]:
# Apply the function to the different dataframes
df_rake_paragraphs = add_keywords_as_columns_to_dataframe(chosen_keywords_rake_paragraphs, df_rake_paragraphs)
df_rake_summary = add_keywords_as_columns_to_dataframe(chosen_keywords_rake_summary, df_rake_summary)
df_yake_paragraphs = add_keywords_as_columns_to_dataframe(chosen_keywords_yake_paragraphs, df_yake_paragraphs)
df_yake_summary = add_keywords_as_columns_to_dataframe(chosen_keywords_yake_summary, df_yake_summary)

  0%|          | 0/14 [00:00<?, ?it/s]

government
people
juba
president
state
peace
security
international
conflict
year
national
united
war
political


  0%|          | 0/14 [00:00<?, ?it/s]

government
president
peace
juba
conflict
security
violence
international
un
support
humanitarian
oil
war
machar


  0%|          | 0/14 [00:00<?, ?it/s]

juba
abyei
president
machar
Uganda
jonglei
darfur
nile
minister
nuer
khartoum
government
police
ethiopia


  0%|          | 0/14 [00:00<?, ?it/s]

president
jonglei
uganda
abyei
machar
united
bor
ethiopia
republic
ababa
bentiu
malakal
kenya
unity


In [37]:
df_rake_paragraphs.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'index',
       'name', 'summary', 'date', 'lat', 'lng', 'paragraphs', 'publisher',
       'tags', 'keywords_summary', 'keywords_paragraphs',
       'paragraphs_3_keywords_2gram_summary', 'summary_rake_keywords',
       'paragraphs_rake_keywords', '['government']', '['people']', '['juba']',
       '['president']', '['state']', '['peace']', '['security']',
       '['international']', '['conflict']', '['year']', '['national']',
       '['united']', '['war']', '['political']'],
      dtype='object')

In [38]:
# store the csv's
df_rake_paragraphs.to_csv('df_rake_paragraphs.csv')
df_rake_summary.to_csv('df_rake_summary.csv')
df_yake_paragraphs.to_csv('df_yake_paragraphs.csv')
df_yake_summary.to_csv('df_yake_summary.csv')

### add chatgpt generated keywords

<b>The following Chatgpt interaction was used:</b>

consider the situation of South Sudan. Present 20 singular keywords in a list format that describe the past situation of South Sudan.

ChatGPT

Certainly, here are 20 keywords that describe the past situation of South Sudan:

~

User

The keywords may only consist of a single word. abbreviations of parties are allowed

ChatGPT

Understood, here are 20 single-word keywords and abbreviations that describe the past situation of South Sudan:

~

User

Civil War is not a single word, reconsider the list

ChatGPT

Certainly, here's a revised list of single-word keywords and abbreviations that describe the past situation of South Sudan:

~

User

can you put quotations around the words and a comma in between like a python list format?

ChatGPT

Certainly, here's the list of single-word keywords and abbreviations for the past situation of South Sudan in Python list format:

~

In [None]:
gpt_keywords = ["Independence", "Secession", "Conflict", "Famine", "Peace", "Refugees", "Ethnicity", "UNMISS", "SPLA", "Oil", "Displacement", "Children", "Referendum", "Instability", "Clashes", "Drought", "Corruption", "Mediation", "Crisis", "Rebel"]
df_gpt = add_keywords_as_columns_to_dataframe(gpt_keywords)
df_gpt.to_csv('df_gpt.csv')

NameError: name 'add_keywords_as_columns_to_dataframe' is not defined