In [None]:
# This file was created by Michael D. Wang and accompanies our working paper "Measuring political and economic uncertainty: a supervised computational linguistic approach".

# 1. Political uncertainty index (PUI)
import csv
import os

import numpy as np
import pandas as pd

# Observation period used to aggregate the news; the cells below give
# special treatment to 'monthly' and 'yearly'.
obs_period = 'monthly'

# Input: the annotated news records.
input_dir = "annotated/valuated_contents.csv"

# Outputs: every file name carries the observation period.
#   raw index rows, in processing order
output_dir = "data/political uncertainty index/PUI_raw_%s.csv" % (obs_period,)
#   the same rows, sorted by date
output_sort_dir = "data/political uncertainty index/PUI_%s.csv" % (obs_period,)
#   tokenised news text per period
output_token_dir = "data/matlab workfile/%s_news_token.csv" % (obs_period,)

# Create output files
# open(..., "w") does not create missing folders, so make sure the
# destination directories exist before writing the header rows.
for target_path in (output_dir, output_token_dir):
    target_folder = os.path.dirname(target_path)
    if target_folder:
        os.makedirs(target_folder, exist_ok=True)

# Raw index file: truncated here and left with only its header row.
# newline='' lets the csv module control line endings itself.
with open(output_dir, "w", newline = '',encoding = 'utf-8') as csvfile:
    w = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
    header = ['date','relation_index', 'uncertainty_index', 'keyword_1', 'keyword_2','keyword_3','keyword_4','keyword_5']
    w.writerow(header)

# Token file: one row per period holding the space-joined tokens.
with open(output_token_dir, "w", newline = '',encoding = 'utf-8') as csvfile:
    w = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
    header = ['date','tokens']
    w.writerow(header)

In [None]:
import jieba
import unicodedata
import string

# Stopword sets that were already read from disk, keyed by file path, so the
# file is parsed once per kernel session instead of once per call.
# Re-running this cell empties the cache.
_STOPWORD_CACHE = {}


def _load_stopwords(stopword_dir):
    """Return the stopwords stored one per line in ``stopword_dir`` as a set."""
    if stopword_dir not in _STOPWORD_CACHE:
        # 'utf-8-sig' drops a leading byte-order mark if the file has one;
        # the with-block closes the handle once the file has been read.
        with open(stopword_dir, "r", encoding='utf-8-sig') as f:
            _STOPWORD_CACHE[stopword_dir] = set(f.read().splitlines())
    return _STOPWORD_CACHE[stopword_dir]


def cut(text_str, stopword_dir="Stopwords_Chinese.txt"):
    """Segment ``text_str`` with jieba and return the cleaned, lower-cased tokens.

    The text is NFKC-normalised and segmented with
    ``jieba.cut(cut_all=False, HMM=True)``. A token is dropped when it is

    * empty or made of whitespace only,
    * listed in the stopword file (compared before lower-casing),
    * found in ``string.punctuation``,
    * numeric according to ``str.isnumeric``, or
    * the character ``'\\ue5e5'``.

    Parameters
    ----------
    text_str : str
        Raw text to tokenise.
    stopword_dir : str, optional
        Path of a UTF-8 text file with one stopword per line.

    Returns
    -------
    list of str
        The surviving tokens in their original order; an empty list when
        nothing survives (never ``['']``).
    """
    stopwords = _load_stopwords(stopword_dir)
    text = unicodedata.normalize('NFKC', text_str)
    seg_list = jieba.cut(text, cut_all = False, HMM = True)
    return [word.lower() for word in seg_list
            if word.strip()                     # remove empty / whitespace-only tokens
            and word not in stopwords           # remove stopwords
            and word not in string.punctuation  # remove punctuation
            and not word.isnumeric()            # remove digits
            and word != '\ue5e5']

In [None]:
from datetime import datetime

# Read file
df = pd.read_csv(input_dir,encoding='utf-8')

# strftime pattern that turns a day into its observation-period key.
# For any obs_period other than 'monthly'/'yearly' the raw date string is the key.
period_format = {'monthly': '%Y-%m', 'yearly': '%Y'}.get(obs_period)

# Create time series dictionary:
#   period key -> list of [sentiment, arousal_degree, title + content]
# Keys are inserted in order of first appearance in the file.
date_dict = {}
for raw_date, label1, label2, arousal, title, content in zip(
        df['created_at'], df['topic_country'], df['sentiment'],
        df['arousal_degree'], df['title'], df['content']):
    # Always parse the date so that a malformed value fails loudly,
    # whatever the observation period is.
    decoded_time = datetime.strptime(raw_date,'%Y-%m-%d')
    if period_format:
        created_at = datetime.strftime(decoded_time, period_format)
    else:
        created_at = raw_date

    # Every period gets an entry, even when none of its records is relevant;
    # the index computation later reports 'nan' for such periods.
    records = date_dict.setdefault(created_at, [])

    # ignore irrelevant records
    if label1 == 1:
        # An empty CSV cell is read as NaN (a float); treat it as empty text
        # so that joining title and content cannot raise a TypeError.
        text = ''.join('' if pd.isna(part) else str(part)
                       for part in (title, content))
        records.append([label2, arousal, text])

In [None]:
from collections import Counter

# Number of keywords reported per period. The header of the raw index file
# has exactly five keyword columns, so the output row is limited to five.
k = 5

for j, key in enumerate(date_dict, start=1):
    print('Current date: %s'%key)
    records = date_dict[key]
    n_records = len(records)

    if n_records:
        # compute relation index: mean of the signed arousal, where a record
        # with sentiment label 1 adds its arousal and any other subtracts it
        index1 = 0
        for record in records:
            label2 = record[0]
            arousal = record[1]
            if label2 == 1:
                index1 += arousal
            else:
                index1 -= arousal
        relation_index = index1/n_records

        # compute uncertainty index: mean squared distance between each
        # record's arousal and the relation index
        # NOTE(review): the arousal used here is unsigned while the relation
        # index is built from signed arousal - confirm this is intended.
        distance = 0
        for record in records:
            distance += (record[1] - relation_index)**2
        uncertainty_index = distance/n_records
    else:
        # a period without relevant records has no defined index
        relation_index = 'nan'
        uncertainty_index = 'nan'

    # tokenise every record of the period
    text = []
    for record in records:
        text += cut(record[2])

    with open(output_token_dir, "a", newline = '',encoding = 'utf-8') as csvfile:
        token_writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        token_writer.writerow([key,' '.join(text)])

    # Top-k keywords by frequency. Counter tallies the tokens in one pass and
    # most_common orders equal counts by first appearance, so the result is
    # reproducible between runs.
    keyword = [word for word, _ in Counter(text).most_common(k)]
    # Fewer than k distinct tokens: keep the ones found and pad with 'nan'.
    keyword += ['nan'] * (k - len(keyword))

    # save result to output file; newline='' matches the header writer and
    # prevents blank lines between rows on Windows
    with open(output_dir,"a",newline='',encoding='utf-8') as csvfile:
        index_writer = csv.writer(csvfile)
        index_writer.writerow([key, relation_index, uncertainty_index] + keyword[:5])

    if j%10 == 0:
        print('Processing %s out of %s...'%(j,len(date_dict)))

In [None]:
# Reload the raw index rows, order them ascending by the 'date' column and
# write the sorted table (without the DataFrame index) to the final output file.
df = pd.read_csv(output_dir).sort_values('date', ascending=True)
df.to_csv(output_sort_dir, index=False)