# Topic modelling using BERTopic

## Libraries/data required

In [None]:
# Import library
from bertopic import BERTopic
from umap import UMAP
import pandas as pd
import os
from tqdm.notebook import tqdm
from collections import Counter
import re
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
from zipfile import ZipFile

# Import all helper function
from helper_functions import *

In [None]:
# Location of the zipped input data. os.path.join is used instead of a
# hard-coded '\\' so the cell also runs where backslash is not a separator.
path = os.path.join('given_data', 'merged_summaries_fullarticles.zip')

# Open the archive read-only and unpack all of its members into 'given_data'.
with ZipFile(path, 'r') as zObject:
    zObject.extractall(path='given_data')

Merge the summary with paragraphs

In [None]:
# Directory suffixes that get concatenated directly onto os.getcwd() (also by
# later cells), hence the leading separator. os.sep keeps the value identical
# on Windows while making it valid on other platforms.
data_dir = os.sep + 'given_data'
end_data_dir = os.sep + 'created_data'

# Read in both the full-paragraph and the summarized dataframes
df = pd.read_csv(os.path.join(os.getcwd() + data_dir, 'full_dataset.csv'))
df_summary = pd.read_csv(os.path.join(os.getcwd() + data_dir, 'articles_summary_cleaned.csv'))

In [None]:
# Print the number of distinct values found in every column of df,
# followed by the overall shape of the dataframe.
print('unique values for each column:')
for column_name, column_values in df.items():
    print(column_name, len(column_values.unique()))
print('shape of merged df:', df.shape)

In [None]:
# Collapse df to one row per (id, name) pair, taking the first entry that
# groupby().first() yields for every column.
df2 = df.groupby(['id', 'name']).first().reset_index()

# Give both frames the same composite index (summary text followed by the
# article location / name) so that they can be joined on it.
summary_key = df_summary['summary'] + df_summary['location_article']
df_summary.set_index(summary_key, inplace=True)

article_key = df2['summary'] + df2['name']
df2.set_index(article_key, inplace=True)

# Inner join on the index. Columns of df_summary that clash with df2 receive
# the '_remove' suffix so they can be discarded right after.
df_merged = pd.merge(
    df2,
    df_summary,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('', '_remove'),
).reset_index()

# Drop the suffixed duplicates produced by the join
duplicate_columns = [column for column in df_merged.columns if 'remove' in column]
df_merged.drop(duplicate_columns, axis=1, inplace=True)
# Drop the location_article and id columns as well
df_merged.drop(['location_article', 'id'], axis=1, inplace=True)

# Show the first 5 rows of the merged df
df_merged.head()

In [None]:
# Save the merged df as 'corrected_full_dataset.csv' in the output directory.
# The directory is created first when missing, because to_csv does not create
# parent directories itself; os.path.join supplies the path separator.
output_dir = os.getcwd() + end_data_dir
os.makedirs(output_dir, exist_ok=True)
df_merged.to_csv(os.path.join(output_dir, 'corrected_full_dataset.csv'), index=False)

Read in the correct dataset

In [None]:
# Read the corrected dataset back in, parsing the 'date' column as datetimes.
# os.path.join supplies the separator in front of the file name instead of a
# hard-coded backslash.
corrected_csv_path = os.path.join(os.getcwd() + end_data_dir, 'corrected_full_dataset.csv')
df = pd.read_csv(corrected_csv_path, parse_dates=["date"])

# List containing all article summaries
docs = df["summary"].tolist()

## Fitting BERTopic

This might take a while on a CPU. In the background a pre-trained Large Language Model, called the sentence embedder, is used to convert the articles to a semantic vector space. We then perform clustering in this space.

In [None]:
# Reuse a previously saved model when one exists; otherwise fit and save one.
if not os.path.exists('southsudan_model'):
    # The modular BERTopic pipeline contains a lot of randomness, which
    # hinders reproducibility. Fixing the random state of the dimensionality
    # reduction step (UMAP) is one way to counter that.
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    bertopic = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
    )
    bertopic.fit_transform(docs)  # Fit the model to the list of article summaries
    bertopic.save("southsudan_model")  # Save the trained model as "southsudan_model"
else:
    bertopic = BERTopic.load('southsudan_model')

# Apply RAKE and YAKE

In [None]:
# Run the RAKE helper and then the YAKE helper on df, keeping the dataframe
# each of them returns.
df = rake_keywords(df)
print('---------------------------YAKE Begins-------------------------')
df = yake_keyword(df)

# Store the result as the new dataset
keywords_csv_path = 'created_data/dataset_with_keywords.csv'
df.to_csv(keywords_csv_path)

## Determination of keywords

Get the top 50 Rake keywords with the summarized articles

In [None]:
# Flatten the per-row keyword lists of 'summary_rake_keywords' into one list
keywords_list = [
    keyword
    for row_keywords in df['summary_rake_keywords']
    for keyword in row_keywords
]

# Count how often each keyword occurs
counts = Counter(keywords_list)

# Put the counts in a dataframe (keyword in 'index', count in 'values') and
# show it sorted from most to least frequent
keywords_rake_summary = (
    pd.DataFrame.from_dict(counts, orient='index')
    .reset_index()
    .rename(columns={0: 'values'})
)
keywords_rake_summary.sort_values(by='values', ascending=False)

Top rake keywords with the paragraphs from the articles

In [None]:
# RAKE paragraphs
# Flatten the per-row keyword lists of 'paragraphs_rake_keywords' into one list
keywords_list = [
    keyword
    for row_keywords in df['paragraphs_rake_keywords']
    for keyword in row_keywords
]

# Count how often each keyword occurs
counts = Counter(keywords_list)

# Put the counts in a dataframe (keyword in 'index', count in 'values') and
# show it sorted from most to least frequent
keywords_rake_paragraphs = (
    pd.DataFrame.from_dict(counts, orient='index')
    .reset_index()
    .rename(columns={0: 'values'})
)
keywords_rake_paragraphs.sort_values(by='values', ascending=False)

In [None]:
# Take a copy of df; the YAKE top-keyword cells below are run on this copy.
test_df = df.copy()

In [None]:
# Presumably reports the most frequent YAKE keywords of the summaries; the
# helper is not defined in this notebook — confirm against helper_functions.
yake_top_keywords_summary(test_df)

In [None]:
# Presumably reports the most frequent YAKE keywords of the paragraphs; the
# helper is not defined in this notebook — confirm against helper_functions.
yake_top_keywords_paragraph(test_df)

# Chosen keywords

Based on the top keywords produced by the cells above, 50 words are chosen for each keyword extraction method.
The keywords are listed below.

In [None]:
# Hand-picked keyword lists, one per combination of extraction method
# (yake / rake), text source (summary / paragraphs) and period
# (pre2015 / post2015). The YAKE lists are capitalised, the RAKE lists are
# lower-case.

# --- YAKE, pre-2015 ---
chosen_keywords_yake_summary_pre2015 = ['South', 'Sudan', 'Juba', 'President', 'Sudanese', 'State', 'Jonglei', 'Bor', 'Kiir',
                                'Abyei', 'Republic', 'United', 'African', 'Ethiopia', 'Ababa', 'Machar', 'Bentiu',
                                'Uganda', 'County', 'Malakal', 'Yau', 'Unity', 'Nile', 'Riek', 'Darfur', 'Lakes',
                                'Kenya', 'Security', 'Wau', 'National', 'Rumbek', 'December', 'Khartoum', 'Nuer',
                                'Salva', 'Hon', 'Africa', 'Ghazal', 'East', 'Western', 'Equatoria', 'General',
                                'Pibor', 'Human', 'Minister', 'Ministry', 'Council', 'University', 'China', 
                                 'Bahr']

chosen_keywords_yake_paragraphs_pre2015 = ['South', 'Sudan', 'Sudanese', 'Kiir', 'Nations', 'United', 'Salva', 'President',
                                  'African', 'Ababa', 'state', 'Addis', 'State', 'Nile', 'Liberation', 'Union',
                                  'Riek', 'Machar', 'Jonglei', 'National', 'Security', 'Council', 'Equatoria',
                                  'Upper', 'People', 'Movement', 'Tribune', 'Bahr', 'Minister', 'Juba', 'Unity',
                                  'Abyei', 'International', 'East', 'Yau', 'Lakes', 'General', 'Africa', 'president',
                                  'Legislative', 'States', 'Mayardit', 'Central', 'County', 'government', 'Western',
                                  'Kordofan', 'Deng', 'Nuer', 'people']
                                  
                                  
# --- YAKE, post-2015 ---
chosen_keywords_yake_summary_post2015 = ['South', 'Sudan', 'Juba', 'President', 'Uganda', 'Sudanese', 'Machar',
                                         'African', 'United', 'State', 'Ethiopia', 'Kenya', 'Africa', 'Kiir',
                                         'Khartoum', 'Ababa', 'Minister', 'Unity', 'Rwanda', 'Human', 'Malakal',
                                         'Council', 'Somalia', 'Security', 'East', 'Nile', 'Health', 'Republic',
                                         'Equatoria', 'Abyei', 'June', 'Bentiu', 'County', 'Wau', 'Jonglei', 'Police',
                                         'Darfur', 'July', 'August', 'National', 'Egypt', 'Salva', 'Nigeria', 'Rumbek',
                                         'Kenyan', 'December','November', 'Malong', 'Ugandan', 'Development']

chosen_keywords_yake_paragraphs_post2015 = ['South', 'Sudan', 'Sudanese', 'President', 'Nations', 'United', 'Salva', 'Kiir', 'African', 'Machar',
                                            'Riek', 'Council', 'Security', 'Africa', 'Union', 'State', 'National', 'Minister', 'Nile', 'Addis',
                                            'Liberation', 'Ababa', 'East', 'Foreign', 'Juba', 'government', 'peace', 'States', 'Unity', 'Kenyatta',
                                            'Authority', 'International', 'Affairs', 'People', 'Uganda', 'General', 'Upper', 'Uhuru', 'Prime',
                                            'Development', 'Vice', 'Deputy', 'Movement', 'World', 'state', 'people', 'Transitional', 'Kenya',
                                            'Equatoria', 'Deng']
# --- RAKE, pre-2015 ---
chosen_keywords_rake_summary_pre2015 = ['south', 'sudan', 'government', 'state', 'president', 'sudanese', 'juba', 'people',
                                        'country', 'conflict', 'peace', 'security', 'kiir', 'including', 'salva', 'international',
                                        'countries', 'oil', 'political', 'un', 'national', 'violence', 'support', 'african', 'forces', 'states',
                                        'new', 'united', 'minister', 'issues', 'ongoing', 'development', 'agreement', 'talks',
                                        'humanitarian', 'machar', 'jonglei', 'unity', 'fighting', 'nile', 'border', 'nations', 'highlights',
                                        'riek', 'civilians', 'khartoum', 'leaders', 'army', 'community', 'former']

chosen_keywords_rake_paragraphs_pre2015 = ['sudan', 'south', 'government', 'people', 'juba', 'country', 'state', 'sudanese', 'president',
                                            'peace', 'kiir', 'security', 'new', 'international', 'salva', 'national', 'states',
                                            'conflict', 'united', 'minister', 'political', 'time', 'countries',
                                            'including', 'support', 'war', 'forces', 'general', 'nations', 'development', 'nation',
                                            'first', 'capital', 'african', 'agreement', 'former', 'fighting', 'world', 'khartoum', 'un',
                                            'community', 'army', 'republic', 'members', 'issues', 'areas', 'independence', 'situation',
                                            'violence', 'splm']
                                            
                                            
# --- RAKE, post-2015 ---
# NOTE(review): this list holds 52 entries rather than 50 — confirm whether
# 'meeting' and 'opposition' are meant to be part of it.
chosen_keywords_rake_summary_post2015 = ['south', 'sudan', 'peace', 'government', 'president', 'people', 'country', 'juba', 'sudanese',
                                         'conflict', 'security', 'kiir', 'agreement', 'salva', 'un', 'state', 'war', 'international',
                                         'violence', 'national', 'machar', 'humanitarian', 'million', 'riek', 'countries', 'african',
                                         'political', 'support', 'civil', 'united', 'food', 'uganda', 'unity', 'ongoing', 'leaders',
                                         'states', 'new', 'forces', 'development', 'aid', 'fighting', 'nations', 'vice', 'highlights',
                                         'efforts', 'civilians', 'parties', 'situation', 'crisis', 'council', 'meeting', 'opposition']

chosen_keywords_rake_paragraphs_post2015 = ['sudan', 'south', 'country', 'government', 'people', 'peace', 'president', 'sudanese', 'juba',
                                            'conflict', 'state', 'security', 'kiir', 'war', 'international', 'salva', 'agreement', 'last', 
                                            'national', 'new', 'united', 'machar', 'first', 'riek', 'un', 'countries', 'political', 'african',
                                            'time', 'support', 'million', 'violence', 'nations', 'world', 'according', 'forces', 'civil',
                                            'development', 'fighting', 'end', 'humanitarian', 'former', 'states', 'leaders', 'parties', 'africa',
                                            'minister', 'vice', 'general', 'community']
                                            

# 50 keywords suggested by ChatGPT, used as a reference set next to the RAKE
# and YAKE keywords. "Security" and "Rebel" are separate entries; without the
# comma between them Python concatenates the two adjacent literals into the
# single string "SecurityRebel".
chosen_keywords_chatgpt = ["Conflict","Peace","Famine","Politics","Aid","UN","Government","Violence","Refugee","Election","Corruption",
                           "Humanitarian","Economy","Security","Rebel","Independence","Poverty","Oil","Diplomacy","Healthcare","Education",
                           "Militia","Inflation","Crisis","Food","Sanctions","Child","Displacement","Peacekeeping","Resolution","Democracy",
                           "War","Infrastructure","Woman",  "Media",  "Ethnicity",  "Mediation","Coup","Agriculture","Water","Development",
                           "Youth","Judicial","Energy","Childbirth","Conservation","Forest","Mining","Tribes","Injustice"]

# Keyword lists without a period suffix; these are the lists passed to
# add_keywords_as_columns_to_dataframe further down.
# NOTE(review): the RAKE paragraphs list contains punctuation-only entries
# ('—', ',"', '."') and both YAKE lists contain an empty string '' — these
# look like extraction artefacts; confirm whether they should be kept.
chosen_keywords_rake_summary = ['article', 'discusses', 'south',  'sudan', 'government', 'also', 'president', 'juba', 'sudanese','people', 'state', 'peace', 'country', 'conflict', 'security', 'kiir', 'including',
                                'two', 'salva', 'due','international', 'un', 'countries', 'violence', 'national','agreement', 'political', 'african', 'support','000', 'humanitarian', 'forces', 'machar', 'united', 'need',
                                'states', 'oil', 'new', 'ongoing', 'riek', 'development', 'war', 'minister', 'unity', 'million', 'issues', 'fighting', 'nations', 'highlights', 'talks']

chosen_keywords_rake_paragraphs = ['sudan', 'south', 'said', 'government', 'people', 'also', 'juba', 'country', 'president', 'state', 'sudanese', 'two', '—', ',"', 'peace', 'one', 'security', 'kiir',
                                    'last', 'since','would', 'international', 'new', 'conflict','salva', 'year', 'national', 'united', 'war', 'political', 'states', 'countries', 'time', 'including', '."', 'minister', 'many', 'support',
                                    'agreement', 'told','first', 'forces', 'well', 'however', 'nations', 'african', 'development', 'un', 'general', 'former']

chosen_keywords_yake_paragraphs = ['Sudan', 'South', '', 'President', 'Juba', 'Kiir', 'Abyei', 'Machar', 'Sudanese', 'State', 'Bor', 'African', 'Uganda', 
                                   'Jonglei', 'Council', 'Darfur', 'Nile', 'United', 'Minister', 'Nuer', 'Khartoum', 'Government', 'Police', 'Ethiopia', 'Wau',
                                     'Republic', 'Yau', 'Kenya', 'Health', 'Pibor', 'Bentiu', 'Rumbek', 'Ministry', 'University', 'Ababa', 'Malakal', 'Africa', 
                                     'Lakes', 'Unity', 'Bank','Murle', 'National', 'Commission', 'Human', 'Equatoria', 'Security', 'Mission', 'Food', 'Children', 'East']

chosen_keywords_yake_summary = ['Sudan', 'South', '', 'Juba', 'President', 'Sudanese', 'State', 'Jonglei', 'Uganda', 'Kiir', 'Machar', 'United', 'African', 'Bor', 'Abyei', 
                                'Ethiopia', 'Republic', 'Ababa', 'Bentiu', 'Unity','Malakal', 'Kenya', 'County', 'Nile', 'Africa', 'Khartoum', 'Yau', 'Darfur', 'Riek', 'Security', 
                                'Wau', 'National', 'Rumbek', 'Minister', 'Lakes', 'December', 'Human', 'Salva', 'East', 'Equatoria','Council', 'General', 'Nuer', 'Pibor', 'Union',
                                  'Ghazal', 'Ugandan', 'Hon', 'Somalia', 'China']
# TODO: Insert the 2017 words here

In [None]:
# Build one dataframe per keyword list by passing each list, together with
# the fitted model and df, to add_keywords_as_columns_to_dataframe. The calls
# run in the order the lists are given.
(
    df_rake_paragraphs,
    df_rake_summary,
    df_yake_paragraphs,
    df_yake_summary,
    df_chatgpt,
) = [
    add_keywords_as_columns_to_dataframe(keyword_list, bertopic, df)
    for keyword_list in (
        chosen_keywords_rake_paragraphs,
        chosen_keywords_rake_summary,
        chosen_keywords_yake_paragraphs,
        chosen_keywords_yake_summary,
        chosen_keywords_chatgpt,
    )
]

In [None]:
# Write every keyword dataframe to its own csv file in created_data
for keyword_frame, csv_path in (
    (df_rake_paragraphs, 'created_data/df_rake_paragraphs.csv'),
    (df_rake_summary, 'created_data/df_rake_summary.csv'),
    (df_yake_paragraphs, 'created_data/df_yake_paragraphs.csv'),
    (df_yake_summary, 'created_data/df_yake_summary.csv'),
    (df_chatgpt, 'created_data/df_chatgpt.csv'),
):
    keyword_frame.to_csv(csv_path)

# Keyword analysis and visualizations

In [None]:
# Lower-case the ChatGPT keywords (stored as a set) and the period-specific
# YAKE lists so that they can be compared with the lower-case RAKE keywords.
chosen_keywords_chatgpt = {keyword.lower() for keyword in chosen_keywords_chatgpt}
chosen_keywords_yake_paragraphs_post2015 = list(map(str.lower, chosen_keywords_yake_paragraphs_post2015))
chosen_keywords_yake_summary_post2015 = list(map(str.lower, chosen_keywords_yake_summary_post2015))
chosen_keywords_yake_paragraphs_pre2015 = list(map(str.lower, chosen_keywords_yake_paragraphs_pre2015))
chosen_keywords_yake_summary_pre2015 = list(map(str.lower, chosen_keywords_yake_summary_pre2015))

In [None]:
# Combine the summary and paragraph keywords per extraction method and period
all_keywords_yake_pre2015 = chosen_keywords_yake_summary_pre2015 + chosen_keywords_yake_paragraphs_pre2015
all_keywords_yake_post2015 = chosen_keywords_yake_summary_post2015 + chosen_keywords_yake_paragraphs_post2015
all_keywords_rake_pre2015 = chosen_keywords_rake_summary_pre2015 + chosen_keywords_rake_paragraphs_pre2015
all_keywords_rake_post2015 = chosen_keywords_rake_summary_post2015 + chosen_keywords_rake_paragraphs_post2015

# Create subset of all yake and rake keywords
all_keywords_rake = all_keywords_rake_post2015 + all_keywords_rake_pre2015
all_keywords_yake = all_keywords_yake_post2015 + all_keywords_yake_pre2015

# Create a subset for summaries and paragraphs.
# The YAKE lists used here are capitalised and contain an empty-string entry,
# whereas the ChatGPT keywords they are compared against are lower-case.
# Lower-casing and skipping empty strings keeps e.g. 'Security' and
# 'security' from being counted as different keywords.
all_summaries = [
    keyword.lower()
    for keyword in chosen_keywords_yake_summary + chosen_keywords_rake_summary
    if keyword
]
all_paragraphs = [
    keyword.lower()
    for keyword in chosen_keywords_rake_paragraphs + chosen_keywords_yake_paragraphs
    if keyword
]

# Create a pre and post 2015 subset of both rake and yake keywords
all_pre2015 = all_keywords_rake_pre2015 + all_keywords_yake_pre2015
all_post_2015 = all_keywords_rake_post2015 + all_keywords_yake_post2015

In [None]:
# Venn diagram of the keyword overlap between the pre-2015 keywords, the
# post-2015 keywords and the ChatGPT keywords
period_keyword_sets = [set(all_pre2015), set(all_post_2015), chosen_keywords_chatgpt]
period_labels = ('Pre-2015', 'Post-2015', 'ChatGPT')
venn3(period_keyword_sets, period_labels)

# Add the title and render the figure
plt.title("Overlapping rake and yake")
plt.show()

In [None]:
# Venn diagram of the keyword overlap between the summary keywords, the
# paragraph keywords and the ChatGPT keywords
source_keyword_sets = [set(all_summaries), set(all_paragraphs), chosen_keywords_chatgpt]
source_labels = ('Summaries', 'paragraphs', 'ChatGPT')
venn3(source_keyword_sets, source_labels)

# Add the title and render the figure
plt.title("Overlapping Rake and Yake")
plt.show()

# Modelling 2015 - 2017

In [None]:
# Reload the corrected dataset (with 'date' parsed as datetimes) so it can be
# split by period below
corrected_dataset_path = 'created_data/corrected_full_dataset.csv'
df = pd.read_csv(corrected_dataset_path, parse_dates=['date'])

In [None]:
# Split the dataset at 2017-01-01 and at 2015-01-01 to see whether the
# keywords differ between the periods.
# .copy() turns every split into an independent dataframe instead of a slice
# of df, so that adding columns to a split later on (the keyword helpers are
# applied to these splits — presumably they add columns, confirm) does not
# trigger pandas' SettingWithCopyWarning.
df_pre2017 = df[df['date'] < '2017-01-01'].copy()
df_post2017 = df[df['date'] >= '2017-01-01'].copy()
df_pre2015 = df[df['date'] < '2015-01-01'].copy()
df_post2015 = df[df['date'] >= '2015-01-01'].copy()

# Lists of article summaries: the full dataset first, then one per split
docs = df['summary'].tolist()
docs_pre2017 = df_pre2017["summary"].tolist()
docs_post2017 = df_post2017["summary"].tolist()
docs_pre2015 = df_pre2015["summary"].tolist()
docs_post2015 = df_post2015["summary"].tolist()

Fit BERTopic models for the 2017 and 2015 splits

In [None]:
# Model for the summaries dated before 2017: fit and save it unless a saved
# copy already exists, in which case that copy is loaded.
if not os.path.exists('southsudan_modelpre2017'):
    # The modular pipeline contains a lot of randomness, which hinders
    # reproducibility; fixing UMAP's random state is one way to counter it.
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    bertopicpre2017 = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
    )
    bertopicpre2017.fit_transform(docs_pre2017)  # Fit on the pre-2017 summaries
    bertopicpre2017.save("southsudan_modelpre2017")  # Save as "southsudan_modelpre2017"
else:
    bertopicpre2017 = BERTopic.load('southsudan_modelpre2017')

# Same procedure for the summaries dated from 2017 onwards.
if not os.path.exists('southsudan_modelpost2017'):
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    bertopicpost2017 = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
    )
    bertopicpost2017.fit_transform(docs_post2017)  # Fit on the post-2017 summaries
    bertopicpost2017.save("southsudan_modelpost2017")  # Save as "southsudan_modelpost2017"
else:
    bertopicpost2017 = BERTopic.load('southsudan_modelpost2017')
    

In [None]:
# Model for the summaries dated before 2015: fit and save it unless a saved
# copy already exists, in which case that copy is loaded.
if not os.path.exists('southsudan_modelpre2015'):
    # The modular pipeline contains a lot of randomness, which hinders
    # reproducibility; fixing UMAP's random state is one way to counter it.
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    bertopicpre2015 = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
    )
    bertopicpre2015.fit_transform(docs_pre2015)  # Fit on the pre-2015 summaries
    bertopicpre2015.save("southsudan_modelpre2015")  # Save as "southsudan_modelpre2015"
else:
    bertopicpre2015 = BERTopic.load('southsudan_modelpre2015')

# Same procedure for the summaries dated from 2015 onwards.
if not os.path.exists('southsudan_modelpost2015'):
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    bertopicpost2015 = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=umap_model,
    )
    bertopicpost2015.fit_transform(docs_post2015)  # Fit on the post-2015 summaries
    bertopicpost2015.save("southsudan_modelpost2015")  # Save as "southsudan_modelpost2015"
else:
    bertopicpost2015 = BERTopic.load('southsudan_modelpost2015')
    

Apply the Rake and Yake models

In [None]:
 # Implement a progress bar in the cell to show the progress.
# Register tqdm's progress bar support with pandas
tqdm.pandas()

# Run the RAKE helper on both 2017 splits, then the YAKE helper
df_pre2017 = rake_keywords(df_pre2017)
df_post2017 = rake_keywords(df_post2017)
df_pre2017 = yake_keyword(df_pre2017)
df_post2017 = yake_keyword(df_post2017)

# Store both splits in created_data. The post-2017 file name used to be
# 'created_dataset_with_keywords_post2017.csv', which dropped the directory
# part and wrote the file next to the notebook instead.
df_pre2017.to_csv('created_data/dataset_with_keywords_pre2017.csv')
df_post2017.to_csv('created_data/dataset_with_keywords_post2017.csv')

In [None]:
 # Implement a progress bar in the cell to show the progress.
# Register tqdm's progress bar support with pandas
tqdm.pandas()

# Run the RAKE helper on both 2015 splits, then the YAKE helper
df_pre2015 = rake_keywords(df_pre2015)
df_post2015 = rake_keywords(df_post2015)
df_pre2015 = yake_keyword(df_pre2015)
df_post2015 = yake_keyword(df_post2015)

# Store both splits in created_data. The post-2015 file name used to be
# 'created_dataset_with_keywords_post2015.csv', which dropped the directory
# part and wrote the file next to the notebook instead.
df_pre2015.to_csv('created_data/dataset_with_keywords_pre2015.csv')
df_post2015.to_csv('created_data/dataset_with_keywords_post2015.csv')

# Determination of the keywords

Rake summary

In [None]:
# Get the 50 'best' keywords
# The RAKE summary helper is called once for each of the four date splits.
# NOTE(review): these are bare expression statements, so a notebook renders
# only the value returned by the last call — confirm that the helper prints
# or displays its own results if all four splits should be visible.
extract_rake_summary_keywords(df_pre2017)
extract_rake_summary_keywords(df_post2017)
extract_rake_summary_keywords(df_pre2015)
extract_rake_summary_keywords(df_post2015)

Rake paragraphs

In [None]:
# The RAKE paragraphs helper is called once for each of the four date splits.
# NOTE(review): only the value returned by the last call is rendered as the
# cell output — confirm that the helper prints or displays its own results.
extract_rake_paragraphs_keywords(df_pre2017)
extract_rake_paragraphs_keywords(df_post2017)
extract_rake_paragraphs_keywords(df_pre2015)
extract_rake_paragraphs_keywords(df_post2015)

YAKE keywords

In [None]:
# For every date split, call the YAKE paragraph helper and then the YAKE
# summary helper.
# NOTE(review): only the value returned by the last call is rendered as the
# cell output — confirm that the helpers print or display their own results.

# Before 2017
yake_top_keywords_paragraph(df_pre2017)
yake_top_keywords_summary(df_pre2017)

# From 2017 onwards
yake_top_keywords_paragraph(df_post2017)
yake_top_keywords_summary(df_post2017)

# Before 2015
yake_top_keywords_paragraph(df_pre2015)
yake_top_keywords_summary(df_pre2015)

# From 2015 onwards
yake_top_keywords_paragraph(df_post2015)
yake_top_keywords_summary(df_post2015)

Chosen keywords

In [None]:
# Hand-picked keyword lists for the 2017 split. Only four lists are defined
# in this cell: YAKE summary/paragraphs for pre-2017 and RAKE
# paragraphs/summary for post-2017.
# NOTE(review): the contents of the two YAKE lists closely resemble the 2015
# lists of the *other* text source (the "summary" list below matches the
# pre-2015 paragraphs keywords and vice versa) — confirm that the
# summary/paragraphs names are not swapped.
chosen_keywords_yake_summary_pre2017 = ['South', 'Sudan', 'Sudanese', 'Nations', 'United', 'Kiir', 'Salva', 'President', 'African', 'Ababa', 'State', 'Addis', 
                                        'state', 'Nile', 'Riek', 'Machar', 'Liberation', 'Union', 'Security', 'Council', 'National', 'Jonglei', 'Equatoria', 'People', 
                                        'Upper', 'Minister', 'Movement', 'Unity', 'Juba', 'East', 'Africa', 'Tribune', 'Bahr', 'International', 'Abyei', 'States', 'Yau', 
                                        'General', 'government', 'Lakes', 'president', 'Central', 'Western', 'Foreign', 'Legislative', 'Deng', 'Mayardit', 'people', 'County', 'Affairs']


chosen_keywords_yake_paragraphs_pre2017 = ['Sudan', 'South', 'Juba', 'President', 'Sudanese', 'State', 'Jonglei',  'Kiir', 'Bor', 'African', 'Republic', 'Machar', 'Abyei',
                                            'United', 'Uganda', 'Ethiopia', 'Ababa','Bentiu', 'Unity', 'Malakal', 'County', 'Nile', 'Yau', 'Darfur', 'Kenya', 'Riek', 
                                            'Security', 'Wau', 'Lakes', 'Rumbek', 'Khartoum', 'National', 'December', 'Africa', 'Nuer', 'Equatoria', 'Human', 'Council',
                                              'East', 'Salva', 'Ghazal', 'Minister','Hon','Western','General','Government','China','Union','Pibor','July']


chosen_keywords_rake_paragraphs_post2017 = ['sudan', 'south', 'government', 'country', 'people', 'peace', 'sudanese', 'president', 'juba', 'security', 'year',
                                             'national', 'state', 'kiir', 'conflict', 'international', 'salva', 'war', 'last', 'agreement',  'first', 'new', 'years',
                                               'support', 'countries', 'political', 'united', 'time', 'million', 'world', 'development', 'civil', 'according', 
                                                 'violence',  'african',  'un', 'riek', 'machar', 'minister', 'africa', 'humanitarian', 'states', 'us', 'forces',
                                                   'work', 'end', 'nations', 'leaders', 'process', 'community']

chosen_keywords_rake_summary_post2017 = ['south', 'sudan', 'government', 'peace', 'president', 'country', 'juba', 'sudanese', 'people', 'security', 'conflict',
                                          'kiir', 'agreement', 'salva', 'national',  'state', 'war', 'international', 'violence', 'un', 'humanitarian', 'countries', 
                                          'million', 'political', 'support', 'civil', 'machar', 'food', 'development', 'african', 'uganda', 'new', 'states', 'united',
                                            'riek', 'unity', 'aid', 'efforts', 'leaders', 'rights', 'ongoing', 'parties', 'transitional', 'meeting', 'year', 'nations',
                                              'minister', 'process', 'forces', 'africa']

In [None]:
# Keyword lists for the 2015 split, assigned again in this cell. The YAKE
# lists are capitalised here, the RAKE lists are lower-case.

# --- YAKE, pre-2015 ---
chosen_keywords_yake_summary_pre2015 = ['South', 'Sudan', 'Juba', 'President', 'Sudanese', 'State', 'Jonglei', 'Bor', 'Kiir','Abyei', 'Republic', 
                                        'United', 'African', 'Ethiopia', 'Ababa', 'Machar', 'Bentiu', 'Uganda', 'County', 'Malakal', 'Yau', 'Unity', 
                                        'Nile', 'Riek', 'Darfur', 'Lakes','Kenya', 'Security', 'Wau', 'National', 'Rumbek', 'December', 'Khartoum', 'Nuer',
                                'Salva', 'Hon', 'Africa', 'Ghazal', 'East', 'Western', 'Equatoria', 'General','Pibor', 'Human', 'Minister', 'Ministry', 'Council', 'University', 'China', 
                                 'Bahr']

chosen_keywords_yake_paragraphs_pre2015 = ['South', 'Sudan', 'Sudanese', 'Kiir', 'Nations', 'United', 'Salva', 'President',
                                  'African', 'Ababa', 'state', 'Addis', 'State', 'Nile', 'Liberation', 'Union',
                                  'Riek', 'Machar', 'Jonglei', 'National', 'Security', 'Council', 'Equatoria',
                                  'Upper', 'People', 'Movement', 'Tribune', 'Bahr', 'Minister', 'Juba', 'Unity',
                                  'Abyei', 'International', 'East', 'Yau', 'Lakes', 'General', 'Africa', 'president',
                                  'Legislative', 'States', 'Mayardit', 'Central', 'County', 'government', 'Western',
                                  'Kordofan', 'Deng', 'Nuer', 'people']

# --- YAKE, post-2015 ---
# From nr 19 onwards it was 18 or less counts.
chosen_keywords_yake_summary_post2015 = ['South', 'Sudan', 'Juba', 'President', 'Uganda', 'Sudanese', 'Machar',
                                         'African', 'United', 'State', 'Ethiopia', 'Kenya', 'Africa', 'Kiir',
                                         'Khartoum', 'Ababa', 'Minister', 'Unity', 'Rwanda', 'Human', 'Malakal',
                                         'Council', 'Somalia', 'Security', 'East', 'Nile', 'Health', 'Republic',
                                         'Equatoria', 'Abyei', 'June', 'Bentiu', 'County', 'Wau', 'Jonglei', 'Police',
                                         'Darfur', 'July', 'August', 'National', 'Egypt', 'Salva', 'Nigeria', 'Rumbek',
                                         'Kenyan', 'December','November', 'Malong', 'Ugandan', 'Development']


chosen_keywords_yake_paragraphs_post2015 = ['South', 'Sudan', 'Sudanese', 'President', 'Nations', 'United', 'Salva', 'Kiir', 'African', 'Machar',
                                            'Riek', 'Council', 'Security', 'Africa', 'Union', 'State', 'National', 'Minister', 'Nile', 'Addis',
                                            'Liberation', 'Ababa', 'East', 'Foreign', 'Juba', 'government', 'peace', 'States', 'Unity', 'Kenyatta',
                                            'Authority', 'International', 'Affairs', 'People', 'Uganda', 'General', 'Upper', 'Uhuru', 'Prime',
                                            'Development', 'Vice', 'Deputy', 'Movement', 'World', 'state', 'people', 'Transitional', 'Kenya',
                                            'Equatoria', 'Deng']



# --- RAKE, pre-2015 ---
chosen_keywords_rake_summary_pre2015 = ['south', 'sudan', 'government', 'state', 'president', 'sudanese', 'juba', 'people',
                                        'country', 'conflict', 'peace', 'security', 'kiir', 'including', 'salva', 'international',
                                        'countries', 'oil', 'political', 'un', 'national', 'violence', 'support', 'african', 'forces', 'states',
                                        'new', 'united', 'minister', 'issues', 'ongoing', 'development', 'agreement', 'talks',
                                        'humanitarian', 'machar', 'jonglei', 'unity', 'fighting', 'nile', 'border', 'nations', 'highlights',
                                        'riek', 'civilians', 'khartoum', 'leaders', 'army', 'community', 'former']

chosen_keywords_rake_paragraphs_pre2015 = ['sudan', 'south', 'government', 'people', 'juba', 'country', 'state', 'sudanese', 'president',
                                            'peace', 'kiir', 'security', 'new', 'international', 'salva', 'national', 'states',
                                            'conflict', 'united', 'minister', 'political', 'time', 'countries',
                                            'including', 'support', 'war', 'forces', 'general', 'nations', 'development', 'nation',
                                            'first', 'capital', 'african', 'agreement', 'former', 'fighting', 'world', 'khartoum', 'un',
                                            'community', 'army', 'republic', 'members', 'issues', 'areas', 'independence', 'situation',
                                            'violence', 'splm']


# --- RAKE, post-2015 ---
chosen_keywords_rake_summary_post2015 = ['south', 'sudan', 'peace', 'government', 'president', 'people', 'country', 'juba', 'sudanese',
                                         'conflict', 'security', 'kiir', 'agreement', 'salva', 'un', 'state', 'war', 'international',
                                         'violence', 'national', 'machar', 'humanitarian', 'million', 'riek', 'countries', 'african',
                                         'political', 'support', 'civil', 'united', 'food', 'uganda', 'unity', 'ongoing', 'leaders',
                                         'states', 'new', 'forces', 'development', 'aid', 'fighting', 'nations', 'vice', 'highlights',
                                         'efforts', 'civilians', 'parties', 'situation', 'crisis', 'council']

chosen_keywords_rake_paragraphs_post2015 = ['sudan', 'south', 'country', 'government', 'people', 'peace', 'president', 'sudanese', 'juba',
                                            'conflict', 'state', 'security', 'kiir', 'war', 'international', 'salva', 'agreement', 'last', 
                                            'national', 'new', 'united', 'machar', 'first', 'riek', 'un', 'countries', 'political', 'african',
                                            'time', 'support', 'million', 'violence', 'nations', 'world', 'according', 'forces', 'civil',
                                            'development', 'fighting', 'end', 'humanitarian', 'former', 'states', 'leaders', 'parties', 'africa',
                                            'minister', 'vice', 'general', 'community']

Visualization 2015

In [None]:
# Group the keyword lists by PERIOD so that the sets match their
# 'Pre-2015' / 'Post-2015' labels. (Previously the "pre" variable held every
# summary list and the "post" variable every paragraphs list, regardless of
# period.)
all_keywords_pre2015 = (
    chosen_keywords_rake_summary_pre2015
    + chosen_keywords_yake_summary_pre2015
    + chosen_keywords_rake_paragraphs_pre2015
    + chosen_keywords_yake_paragraphs_pre2015
)
all_keywords_post2015 = (
    chosen_keywords_rake_summary_post2015
    + chosen_keywords_yake_summary_post2015
    + chosen_keywords_rake_paragraphs_post2015
    + chosen_keywords_yake_paragraphs_post2015
)

# Dict of sets. Everything is lower-cased before comparing: the YAKE lists
# are capitalised, so without this 'Security' and 'security' would count as
# two different keywords.
sets = {
    'Pre-2015': {keyword.lower() for keyword in all_keywords_pre2015},
    'Post-2015': {keyword.lower() for keyword in all_keywords_post2015},
    'ChatGPT': {keyword.lower() for keyword in chosen_keywords_chatgpt},
}

# Pass plain lists rather than dict views to venn3
venn_diagram = venn3(list(sets.values()), set_labels=list(sets.keys()))

plt.title('Overlapping rake and yake keywords')
plt.show()

Visualization 2017

In [None]:
# Group the keyword lists by PERIOD so that the sets match their
# 'Pre-2017' / 'Post-2017' labels. (Previously the "pre" variable held every
# summary list and the "post" variable every paragraphs list, regardless of
# period.)
# NOTE(review): chosen_keywords_rake_summary_pre2017,
# chosen_keywords_rake_paragraphs_pre2017, chosen_keywords_yake_summary_post2017
# and chosen_keywords_yake_paragraphs_post2017 are not assigned anywhere in
# the visible notebook — they must be defined before this cell can run.
all_keywords_pre2017 = (
    chosen_keywords_rake_summary_pre2017
    + chosen_keywords_yake_summary_pre2017
    + chosen_keywords_rake_paragraphs_pre2017
    + chosen_keywords_yake_paragraphs_pre2017
)
all_keywords_post2017 = (
    chosen_keywords_rake_summary_post2017
    + chosen_keywords_yake_summary_post2017
    + chosen_keywords_rake_paragraphs_post2017
    + chosen_keywords_yake_paragraphs_post2017
)

# Dict of sets. Everything is lower-cased before comparing: the YAKE lists
# are capitalised, so without this 'Security' and 'security' would count as
# two different keywords.
sets = {
    'Pre-2017': {keyword.lower() for keyword in all_keywords_pre2017},
    'Post-2017': {keyword.lower() for keyword in all_keywords_post2017},
    'ChatGPT': {keyword.lower() for keyword in chosen_keywords_chatgpt},
}

# Pass plain lists rather than dict views to venn3
venn_diagram = venn3(list(sets.values()), set_labels=list(sets.keys()))

plt.title('Overlapping rake and yake keywords')
plt.show()