## Author Topic Model, Results

### Setup

In [1]:
###########################################################
# TOPIC MODELING - Author Topic Model, Display Results
# Author: Luca Adorni
# Date: May 2023
###########################################################

# 0. Setup -------------------------------------------------

import re
import nltk
import gensim
import os
import sys
import altair as alt
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import warnings
import datetime as dt
# Silence warnings so they do not clutter the rendered notebook.
# NOTE: the blanket "ignore" filter on the second line already covers
# DeprecationWarning; the category-specific filter only makes that intent explicit.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore")

# Pandas display configuration.
# Fully-qualified option names are used so that each call targets exactly one option
# instead of relying on pandas' pattern matching of abbreviated names.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_info_columns', 200)
# Wide frames are wrapped across several lines rather than truncated
pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.max_colwidth', 1000)
# None lets pandas pick the width of the display on its own
pd.set_option('display.width', None)
# Show floats with three decimals
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Setup Repository
# The repository root is read from the first line of a per-user `repo_info.txt`.
# When that file cannot be read (or is empty) we fall back to a `polpo/` folder
# under the current working directory and register the local `bin` folder.
try:
    with open(f"/Users/{os.getlogin()}/Documents/polpo/repo_info.txt", "r") as repo_info:
        # readline() keeps the trailing newline; strip it so that it does not end up
        # embedded in every path derived below
        path_to_repo = repo_info.readline().strip()
except (OSError, UnicodeDecodeError):
    # os.getlogin() and open() report failure through OSError; anything else
    # (e.g. KeyboardInterrupt) is deliberately left to propagate
    path_to_repo = ""

if not path_to_repo:
    path_to_repo = f"{os.getcwd()}/polpo/"
    sys.path.append(f"{os.getcwd()}/.local/bin") # import the temporary path where the server installed the module

# All paths below are built by plain concatenation, so the root must end with "/"
if not path_to_repo.endswith("/"):
    path_to_repo += "/"

print(path_to_repo)

# Data folders
path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_links = f"{path_to_data}links/"
path_to_processed = f"{path_to_data}processed/"

# Output folders
path_to_figures = f"{path_to_repo}figures/"
path_to_figure_odds = f"{path_to_figures}logit_res/"
path_to_tables = f"{path_to_repo}tables/"

# Model-specific data folders
path_to_ctm = f"{path_to_data}ctm/"
path_to_lda = f"{path_to_data}lda/"
path_to_gsdmm = f"{path_to_data}gsdmm/"
path_to_author = f"{path_to_data}author/"

/Users/ADORNI/Dropbox (BFI)/LUCA/polpo/


### Load Data

In [5]:
# Author-Topic-Model export, with a macro-label attached to each topic
topics_csv = f"{path_to_author}final/resulting_topics_periods_final.csv"

# Columns kept from the export: one per political leaning, plus the grouping keys
leaning_cols = ['far_left', 'center_left', 'center', 'center_right', 'far_right']
group_keys = ['month', 'macro']

# Read, keep only the columns of interest, then sum within each (month, macro) pair
df = (
    pd.read_csv(topics_csv)
    .loc[:, leaning_cols + group_keys]
    .groupby(group_keys, as_index = False)
    .sum()
)

In [6]:
# Show the aggregated frame (one row per month/macro pair, one column per political leaning)
df

Unnamed: 0,month,macro,far_left,center_left,center,center_right,far_right
0,first_lock,Cases,0.092,0.099,0.1,0.103,0.115
1,first_lock,China/Xenophobia,0.17,0.131,0.157,0.174,0.092
2,first_lock,Economics,0.062,0.035,0.083,0.066,0.083
3,first_lock,Faults,0.046,0.053,0.058,0.052,0.096
4,first_lock,Lockdown,0.359,0.354,0.294,0.288,0.324
5,first_lock,Other,0.271,0.329,0.309,0.317,0.29
6,post_lock,Cases,0.111,0.051,0.044,0.029,0.097
7,post_lock,Economics,0.243,0.221,0.32,0.259,0.266
8,post_lock,Faults,0.091,0.127,0.144,0.157,0.101
9,post_lock,Lockdown,0.237,0.265,0.279,0.362,0.301


In [7]:
# Melt the dataframe: the five leaning columns become rows, identified by the
# 'variable' column, with their numbers in 'value'.
# NOTE: this cell reassigns `df`, so the load cell has to be re-run before this
# one can be executed a second time.
df = df.melt(id_vars = ['month', 'macro'])

# and order it: rank of each macro topic, used to stack the bar segments in a fixed order
macro_order = {
    'Lockdown': 1,
    'Cases': 2,
    'Faults': 3,
    'China/Xenophobia': 4,
    'Economics': 5,
    'Other': 6,
}
# Macro labels missing from the mapping end up as NaN; keep the column as float
df['order'] = df['macro'].map(macro_order).astype(float)

# Get everything to 100: rescale so that values sum to 1 within each (month, leaning) pair
df['value'] = df['value'] / df.groupby(['month', 'variable'])['value'].transform('sum')

# Fix the label names
# Assign the result back to the column instead of calling replace(..., inplace=True)
# on `df.month` / `df.variable`: that form mutates an intermediate Series and does
# not update `df` when pandas Copy-on-Write is active.
df['month'] = df['month'].replace({'first_lock': 'First Lockdown', 'post_lock': "End of Lockdown",
                                   'summer': 'Summer', 'second_lockdown': 'Second Lockdown'})
df['variable'] = df['variable'].replace({'far_left':'Far Left', 'center_left':"Center Left", 'center':'Center',
                                         'center_right': 'Center Right', 'far_right':"Far Right"})

### Display the topics over the four time periods

In [3]:

# Display order of the categorical fields
leaning_order = ['Far Left', 'Center Left', 'Center', 'Center Right', 'Far Right']
period_order = ['First Lockdown', 'End of Lockdown', 'Summer', 'Second Lockdown']

# One fixed colour per macro topic
topic_palette = alt.Scale(
    domain = ['Lockdown', 'Cases', 'Faults', 'China/Xenophobia', 'Economics', 'Other'],
    range = ['#1f78b4', '#a6cee3', '#b2df8a', '#fdbf6f', '#fb8072', '#d9d9d9'],
)

# Bars are grouped by political leaning on the x axis
leaning_axis = alt.X('variable:N', title=None, sort=leaning_order, axis=alt.Axis(labelAngle=-45))

# Bar height is the summed value, drawn without grid lines
share_axis = alt.Y('sum(value):Q', axis=alt.Axis(grid=False, title="Topic distribution"))

# One panel per time period
period_facet = alt.Column('month:N', title=None, sort=period_order,
                          header=alt.Header(titleFontSize=20, labelFontSize=13))

# Colour segments by macro topic
topic_colour = alt.Color('macro:N', scale=topic_palette, title="Topics")

topic_chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=leaning_axis,
        y=share_axis,
        column=period_facet,
        # stack the segments following the numeric 'order' column
        order='order',
        color=topic_colour,
    )
    .properties(width=150, height=350)
    # remove the frame drawn around each panel
    .configure_view(strokeOpacity=0)
    .configure_axis(labelFontSize=12, titleFontSize=13)
    .configure_legend(titleFontSize=13, labelFontSize=12)
)

# Last expression of the cell: renders the chart
topic_chart