In [None]:
import pandas as pd
import numpy as np
import pylab as plt
%matplotlib inline 
import seaborn as sns

# Visualizing the hSBM topics

## German data

In [None]:
# read the German topic-model summary table that the bar charts further down are built from
de_plot_df = pd.read_csv('de_topicmodel_plot.csv')

In [None]:
# human-readable label for every German topic; the list is later added to
# de_plot_df as a column, so entry i labels row i of that frame
de_names = [
    'Neutral vaccine tweets',
    'Meaningless: Verbs',
    'Mixed: Covid-19 vaccination',
    'Meaningless: Not vaccine-related',
    'Vaccinated vs. unvaccinated people',
    'Excitement',
    'First and second shot',
    'Meaningless: Not vaccine-related',
    'Approvals',
    'Vaccination centres',
    'Vaccination of children',
    'Brands',
    'Practicalities of getting an appointment',
    'Meaningless: Hashtags',
    'Vaccination passport',
    'Anti-vaccine and anti-compulsory vaccination',
    'Mixed: Appointments and conspiracy theory',
    'Mixed: (Not) getting vaccinated',
    'Priorisation',  # NOTE(review): probably meant 'Prioritisation' - label text left unchanged
    'Anti-vaccine: Demanding vaccination stop',
    'Meaningless: Not vaccine-related',
    'Meaningless: Not vaccine-related',
    'Brands',
    'Risk assessment',
    'Fully vaccinated people',
]

# display ids for the bars: the string 'Topic <n>', with <n> read from the
# 'index' column (one id per row of de_plot_df)
de_topic_no = ['Topic {}'.format(topic_id) for topic_id in de_plot_df['index']]

In [None]:
# add the topic names and numbers to the dataset, directly after the first column
# (resulting column order: <first column>, topics_no, topics, ...).
#
# insert(..., allow_duplicates=True) would add a *second* copy of both columns
# whenever this cell is executed again; de_plot_df['topics'] then returns a
# DataFrame instead of a Series and the plotting cells break. Overwriting a column
# that already exists keeps the cell safe to re-run.
for column, values in (("topics", de_names), ("topics_no", de_topic_no)):
    if column in de_plot_df.columns:
        de_plot_df[column] = values
    else:
        de_plot_df.insert(1, column, values)

In [None]:
# save the colour that each bar should have:
# "Meaningless" (trash) topics are drawn in grey, every other topic in magenta.
#
# The colour is derived from the label prefix instead of a hand-written list of 25
# hex codes. That list had two slips: 'Meaningless: Hashtags' was coloured like a
# meaningful topic, and the last two topics used a slightly different magenta
# ('#B8207A') than all the others. Deriving the colour also guarantees that labels
# occurring more than once always map to one consistent colour.
DE_TRASH_COLOR = '#BFBCBB'
DE_TOPIC_COLOR = '#BC428A'
de_colors = {
    name: DE_TRASH_COLOR if name.startswith('Meaningless') else DE_TOPIC_COLOR
    for name in de_names
}

## Danish data

In [None]:
# read the Danish topic-model summary table that the bar charts further down are built from
da_plot_df = pd.read_csv('da_topicmodel_plot.csv')

In [None]:
# human-readable label for every Danish topic (21 entries); the list is later
# added to da_plot_df as a column, so entry i labels row i of that frame
da_names = [
    "Meaningless topic: Not vaccine related",
    "Meaningless topic: Not vaccine-related",
    "Meaningless topic: Spam tweets",
    "Waitlist",
    "Vaccination of children",
    "Moderna",
    "First vaccine shot excitement",
    "Time of first vaccine shot",
    "Doubts & Questions",
    "Corona passport",
    "International outlook",
    "Meaningless topic: English words",
    "Vaccine appointments",
    "Meaningless topic: Article links ",  # trailing space is part of the original label
    "Status on vaccine plan",
    "Meaningless topic: Random topic and duplicate links",
    "Meaningless topic: Mostly non-danish links",
    "Meaningless topic: Not vaccine-related",
    "Anti-vaccine",
    "Meaningless: Topic 20",
    "Vaccine queue",
]

# display ids for the bars: the string 'Topic <n>', with <n> read from the
# 'index' column (one id per row of da_plot_df)
da_topic_no = ['Topic {}'.format(topic_id) for topic_id in da_plot_df['index']]

In [None]:
# add the topic names and numbers to the dataset, directly after the first column
# (resulting column order: <first column>, topics_no, topics, ...).
#
# insert(..., allow_duplicates=True) would add a *second* copy of both columns
# whenever this cell is executed again; da_plot_df['topics'] then returns a
# DataFrame instead of a Series and the plotting cells break. Overwriting a column
# that already exists keeps the cell safe to re-run.
for column, values in (("topics", da_names), ("topics_no", da_topic_no)):
    if column in da_plot_df.columns:
        da_plot_df[column] = values
    else:
        da_plot_df.insert(1, column, values)

In [None]:
# display the Danish frame to check the 'topics' and 'topics_no' columns added above
da_plot_df

In [None]:
# bar colour per Danish topic label: grey for the "Meaningless" (trash) topics,
# teal for every other topic
da_colors = {
    label: '#BFBCBB' if label.startswith('Meaningless') else '#338B94'
    for label in da_names
}

## Polish data

In [None]:
# read the Polish topic-model summary table that the bar charts further down are built from
pl_plot_df = pd.read_csv('pl_topicmodel_plot.csv')

In [None]:
# human-readable label for every Polish topic (25 entries); the list is later
# added to pl_plot_df as a column, so entry i labels row i of that frame
pl_names = [
    "Meaningless: Not vaccine-related",
    "Meaningless: Not vaccine-related",
    "Meaningless: Not vaccine-related",
    "Vaccines",
    "Vaccination",
    "Announcing getting vaccinated",
    "Getting a vaccine dose",
    "People not wanting to get vaccinated",
    "Effectiveness and risks",
    "Meaningless: Random words",
    "Vaccination lottery",
    "Meaningless: Random tweets",
    "Short-term symptoms after vaccination",
    "Harmful side effects of vaccines",
    "Government initiatives and organization of vaccination campaign",
    "Undesirable post-vaccination reactions",
    "Meaningless: Random tweets",
    "Meaningless: Random tweets",
    "Meaningless: Random tweets",
    "Meaningless: Random tweets",
    "Meaningless: Not vaccine related",
    "Meaningless: Random tweets",
    "Taken/available vaccinations/ which vaccine to take",
    "Epidemic in official numbers",
    "Deaths and complications after receiving the vaccine",
]

# display ids for the bars: the string 'Topic <n>', with <n> read from the
# 'index' column (one id per row of pl_plot_df)
pl_topic_no = ['Topic {}'.format(topic_id) for topic_id in pl_plot_df['index']]

In [None]:
# add the topic names and numbers to the dataset, directly after the first column
# (resulting column order: <first column>, topics_no, topics, ...).
#
# insert(..., allow_duplicates=True) would add a *second* copy of both columns
# whenever this cell is executed again; pl_plot_df['topics'] then returns a
# DataFrame instead of a Series and the plotting cells break. Overwriting a column
# that already exists keeps the cell safe to re-run.
for column, values in (("topics", pl_names), ("topics_no", pl_topic_no)):
    if column in pl_plot_df.columns:
        pl_plot_df[column] = values
    else:
        pl_plot_df.insert(1, column, values)

In [None]:
# bar colour per Polish topic label: grey for the "Meaningless" (trash) topics,
# green for every other topic
pl_colors = {
    label: '#BFBCBB' if label.startswith('Meaningless') else '#3B7953'
    for label in pl_names
}

# Plotting

In [None]:
# global matplotlib font size used for the combined three-panel figure
font = dict(size=50)
plt.rc('font', **font)

### All-in-one plot

In [None]:
# PLOTTING: one tall figure with a horizontal bar chart per language


def _draw_topic_panel(ax, plot_df, colors, language, x_padding):
    """Draw one horizontal bar chart of topic prevalence on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    plot_df : pandas.DataFrame
        Must provide the columns 'topics_no' (bar ids shown on the y-axis),
        'topics' (label written next to each bar) and 'prevalence' (bar length).
        Rows are drawn bottom-to-top in the order given.
    colors : dict
        Maps every label in ``plot_df['topics']`` to a bar colour.
    language : str
        Language name used in the panel title.
    x_padding : float
        Amount added to the automatic right x-limit so that the text labels
        next to the bars fit into the axes.

    Returns
    -------
    tuple
        The automatic x-limits of ``ax`` as they were before the padding was added.
    """
    # bars coloured via the label -> colour mapping
    ax.barh(plot_df['topics_no'], plot_df['prevalence'],
            color=[colors[name] for name in plot_df['topics']])

    # widen the x-axis to the right so that the labels are properly shown
    auto_xlim = ax.get_xlim()
    ax.set_xlim(right=auto_xlim[1] + x_padding)

    # same x-axis label on every panel
    ax.set_xlabel('Predicted topic proportions')

    # the title reports the real number of plotted topics instead of a hard-coded
    # "25" (the Danish label list only has 21 entries)
    ax.set_title(f'{language} data: {len(plot_df)} most prevalent topics')

    # only vertical grid lines are wanted, so the grid lines of the y-axis are turned off
    ax.yaxis.grid(False)

    # write each topic name just right of the end of its bar; bar i sits at y == i,
    # the small negative offset moves the text down relative to the bar centre
    for y_pos, (name, prevalence) in enumerate(zip(plot_df['topics'], plot_df['prevalence'])):
        ax.text(prevalence, y_pos - 0.15, '   ' + name, ha='left')

    return auto_xlim


# set style
sns.set_style("whitegrid")

# create the plotting space: three panels stacked vertically
fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(50,150))

# reverse the row order so that the first row of each frame ends up as the top bar
da = da_plot_df[::-1]
de = de_plot_df[::-1]
pl = pl_plot_df[::-1]

# xlim1..xlim3 keep the automatic x-limits of each panel (before padding);
# they stay defined at notebook level because a later cell reads xlim3
xlim1 = _draw_topic_panel(ax1, da, da_colors, 'Danish', 0.05)
xlim2 = _draw_topic_panel(ax2, de, de_colors, 'German', 0.03)
xlim3 = _draw_topic_panel(ax3, pl, pl_colors, 'Polish', 0.05)

# Remove the frame: despine without an explicit axes acts on every axes of the
# current figure, so a single call covers all three panels
sns.despine(left=True)

### Separate plots

In [None]:
# global matplotlib font size used for the three stand-alone figures
font = dict(size=40)
plt.rc('font', **font)

In [None]:
# PLOTTING: stand-alone figure for the Danish topics

# set style
sns.set_style("whitegrid")

# create the plotting space
fig_da, ax = plt.subplots(figsize=(40,25))

# Danish data
# reverse the row order so that the first row of da_plot_df ends up as the top bar
da = da_plot_df[::-1]

# generate the bar plot and set the colors according to the label -> colour mapping in da_colors
ax.barh(da['topics_no'], da['prevalence'], color=[da_colors[x] for x in da['topics']])

# widen the x-axis to the right so that all the labels are properly shown
xlim = ax.get_xlim()
ax.set_xlim(right = xlim[1] + 0.07)

# set the xaxis labels
ax.set_xlabel('Predicted topic proportions')

# set title: report the real number of plotted topics; the Danish label list has
# 21 entries, so a hard-coded "25" would be wrong
ax.set_title(f'Danish data: {len(da)} most prevalent topics')

# Remove the frame: this is the despine method
sns.despine(left=True)

# we only want vertical lines to be shown, so we turn the gridlines on the yaxis off
ax.yaxis.grid(False)

# put the labels next to the bars

# iterate over the topic names and their prevalence; bar i sits at y == i
for i, (topic, prevalence) in enumerate(zip(da['topics'], da['prevalence'])):

    # start each label at the end of its bar (i.e. to the right of the bar)
    ax.text(prevalence, i-0.15, '   ' + topic, ha='left')


In [None]:
# German data: stand-alone figure

# seaborn theme with a white background and grid lines
sns.set_style("whitegrid")

# figure with a single axes for the German chart
fig_de, ax = plt.subplots(figsize=(40,25))

# flip the row order so that the first row of de_plot_df becomes the top bar
de = de_plot_df.iloc[::-1]

# horizontal bars, each coloured through the label -> colour lookup in de_colors
bar_colors = [de_colors[label] for label in de['topics']]
ax.barh(de['topics_no'], de['prevalence'], color=bar_colors)

# extend the right x-limit so that the text next to the bars stays inside the axes
xlim2 = ax.get_xlim()
ax.set_xlim(right=xlim2[1] + 0.03)

# x-axis label and title in one call
ax.set(xlabel='Predicted topic proportions',
       title='German data: 25 most prevalent topics')

# remove the frame around the plot (seaborn's despine)
sns.despine(left=True)

# keep only the vertical grid lines by switching off the grid lines of the y-axis
ax.grid(False, axis='y')

# write every topic name just right of the end of its bar; bar number `row` sits at y == row
for row, (label, width) in enumerate(zip(de['topics'], de['prevalence'])):
    ax.text(width, row - 0.15, '   ' + label, ha='left')


In [None]:
# Polish data: stand-alone figure

# set style
sns.set_style("whitegrid")

# create the plotting space
fig_pl, ax = plt.subplots(figsize=(40,25))

# reverse the row order so that the first row of pl_plot_df ends up as the top bar
pl = pl_plot_df[::-1]

# generate the bar plot and set the colors according to the label -> colour mapping in pl_colors
ax.barh(pl['topics_no'], pl['prevalence'], color=[pl_colors[x] for x in pl['topics']])

# widen the x-axis to the right so that all the labels are properly shown.
# The limits are read from THIS axes: taking them from ax3 / xlim3 of the combined
# figure made the cell depend on the all-in-one cell having been run (NameError
# otherwise) and left the local `xlim` unused.
xlim = ax.get_xlim()
ax.set_xlim(right = xlim[1] + 0.07)

# set the xaxis labels
ax.set_xlabel('Predicted topic proportions')

# set title
ax.set_title('Polish data: 25 most prevalent topics')

# Remove the frame: this is the despine method
sns.despine(left=True)

# we only want vertical lines to be shown, so we turn the gridlines on the yaxis off
ax.yaxis.grid(False)

# put the labels next to the bars

# iterate over the topic names and their prevalence; bar i sits at y == i
for i, (topic, prevalence) in enumerate(zip(pl['topics'], pl['prevalence'])):

    # start each label at the end of its bar (i.e. to the right of the bar)
    ax.text(prevalence, i-0.15, '   ' + topic, ha='left')

In [None]:
# write the combined three-panel figure to a PNG in the working directory
fig.savefig(fname='all_hsbm_fig.png')

In [None]:
# write each stand-alone figure to its own PNG in the working directory
for figure, filename in (
    (fig_da, 'da_hsbm_fig.png'),
    (fig_de, 'de_hsbm_fig.png'),
    (fig_pl, 'pl_hsbm_fig.png'),
):
    figure.savefig(filename)