In [None]:
# This file was created by Michael D. Wang and accompanies our working paper "Measuring political and economic uncertainty: a supervised computational linguistic approach".

# Helper functions for formatting tables, and the sentiment transfer function
# Format coefficient
def formatCoef(coef, p_value='nan'):
    """Format a regression coefficient as text with significance stars.

    The number of decimals grows as the fractional part of ``abs(coef)``
    shrinks: 2 decimals when the fraction is >= 0.1, one more decimal for
    each further power of ten, up to 10 decimals below 1e-8. Only the
    fractional part is inspected, so whole numbers such as ``2.0`` are
    printed with 10 decimals.

    Args:
        coef: Coefficient value. NaN and +/-inf are rendered as ``'nan'``,
            ``'inf'`` or ``'-inf'`` instead of raising.
        p_value: p-value used to choose the stars. ``'nan'`` (the default)
            or ``None`` means "no p-value" and yields no stars.

    Returns:
        str: sign, formatted magnitude and stars
        (``***`` for p < .01, ``**`` for p < .05, ``*`` for p < .1).
    """
    # Local import: this helper cell runs before the notebook's import cell.
    import math

    # Significance stars; a NaN p-value fails every comparison and gets none.
    star = ''
    has_p = not (p_value is None or (isinstance(p_value, str) and p_value == 'nan'))
    if has_p:
        for cutoff, marks in ((.01, '***'), (.05, '**'), (.1, '*')):
            if p_value < cutoff:
                star = marks
                break

    # Non-finite coefficients cannot go through int(); render them verbatim.
    if math.isnan(coef):
        return 'nan' + star

    if coef >= 0:
        sym = ''
    else:
        sym = '-'
        coef = -coef

    if math.isinf(coef):
        return sym + 'inf' + star

    head = int(coef)
    frac = coef - head

    # Start at 2 decimals and add one for every threshold the fraction is below.
    decimals = 2
    for threshold in (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8):
        if not frac < threshold:
            break
        decimals += 1

    # '%4.*f' reads the precision from the argument tuple.
    return sym + '%4.*f' % (decimals, head + frac) + star

# Format std error
def formatFloat(fmt, val):
    """Format ``val`` with the %-style ``fmt`` and drop a leading zero.

    ``0.123`` becomes ``.123`` and ``-0.123`` becomes ``-.123``; any other
    formatted text is returned unchanged.
    """
    text = fmt % val
    # Split off an optional minus sign so both cases share one check.
    if text.startswith("-"):
        sign, digits = "-", text[1:]
    else:
        sign, digits = "", text
    if digits.startswith("0."):
        return sign + digits[1:]
    return text

In [None]:
# 1dependent var: import

import pandas as pd
import numpy as np
import re
from datetime import datetime
import warnings
from linearmodels.panel import compare
import math
from datetime import datetime
warnings.simplefilter("ignore", FutureWarning)

# Set parameter
# data_set selects the event date used further down:
# 'origion' (sic) -> 2016-05-02, 'robust' -> 2016-11-09.
data_set = 'origion'
# NOTE(review): sent_rule is not read anywhere in the visible code — confirm it is still needed.
sent_rule = 'relative'

# Output table, keyed by column name. The first column holds the row labels;
# one more column is added per region by the regression loop.
data ={'Main regressors' : ["D(T-war)",'', "EPU index",'', "Market return_t-1",'',"Population",'',"D(Suit)",'', "Constant",'', 'R-squared (overall)', 'Observations', 'Source FE', 'Time FE', 'News region', 'Pre-war avg','T-war effect']}
# Running column number used in the table's column labels.
count = 1

# Path templates / input files (all relative to the working directory)
# dir1: topic ranking, filled with (number_topics, region, target_region)
dir1 = 'tokens with index/%s topics/Top %s news about %s.csv'
# dir3: control-variable series, filled with (variable, index)
dir3 = 'data/control variables/%s/%s.csv'
# dir3a: political uncertainty index, filled with the file stem
dir3a= 'data/political uncertainty index/formalized/%s.csv'
# dir4: WTO bilateral import values
dir4 = 'data/WTO bilateral imports/WTO_CHN_USA.csv'
# dir5: WTO dispute settlement cases (provides 'consultations_date')
dir5 = 'data/WTO dispute settlement/US and China.csv'
# dir6: World Development Indicators total population
dir6 = 'data/World Development Indicators/Total population.csv'

for region in ['China']:
    # Topic-model size and counterpart country depend on the news region.
    if region == 'US':
        number_topics = 850
        target_region = 'China'
    elif region == 'China':
        number_topics = 550
        target_region = 'US'

    # Sort topic by popularity index
    topic_rank = pd.read_csv(dir1 %(number_topics, region, target_region), encoding='utf-8',index_col=0)
    topic_rank = topic_rank.sort_values(by=['pop_idx'], ascending=False).reset_index()

    # Topic ids, most popular first (not named `list`, which would shadow the builtin).
    topic_order = topic_rank['topic_index'].values.tolist()

    # Create file
    news_list = ['大公报','文汇报','环球时报','人民网','星岛日报','新京报']

    mainland = ['环球时报','人民网','新京报']

    # Template for the per-section predicted-topic files
    # (not named `dir`, which would shadow the builtin).
    topic_news_path = 'data/lda topics/%s topics/3 predicted topic/Top %s news about %s_%s.csv'

    # Same value as target_region; kept as a separate name for backward compatibility.
    target = target_region

    # Load the three news sections and keep only the outlets in news_list.
    df_list = []
    for section in ['political','economic','financial']:
        df = pd.read_csv(topic_news_path%(number_topics, region,target,section), encoding='utf-8')
        df_list.append(df)
    df = pd.concat(df_list).drop_duplicates()
    df = df.loc[df['source'].isin(news_list)]

    # Keep the articles of every ranked topic, in popularity order.
    # The pieces are collected first and concatenated once, instead of
    # re-concatenating and de-duplicating a growing frame on every iteration.
    topic_frames = [df.loc[df['lda_topic']==topic] for topic in topic_order]
    if topic_frames:
        cdf = pd.concat(topic_frames).drop_duplicates().reset_index(drop=True)
    else:
        cdf = pd.DataFrame()

    # Create variable for market average sentiment index:
    # daily mean sentiment per outlet (one column per outlet), then the
    # mean across outlets for each day.
    source_means = []
    for source in news_list:
        sub = df.loc[df['source']==source]
        sub = sub[['created_at','sentiment']].groupby('created_at').mean().reset_index().rename(columns={'sentiment': source})
        source_means.append(sub)
    # Concatenate once rather than growing the frame inside the loop.
    con = pd.concat(source_means).drop_duplicates()
    con = con.groupby('created_at').mean().reset_index()
    con['mean'] = con[news_list].mean(axis=1)

    # Join market index to panel
    df = cdf.merge(con[['created_at','mean']], on='created_at', how='left')

    # select data subset after period
    df['created_at'] = pd.to_datetime(df['created_at'])
    df = df[df['created_at'] > pd.to_datetime('2009')].drop(columns=['tokens']).drop_duplicates().reset_index(drop=True)

    # Add created year and month for matching.
    # Vectorised .dt accessors replace the row-wise apply: same values, much
    # faster, and they still return a Series when the frame is empty.
    df['created_year'] = df['created_at'].dt.year
    df['created_month'] = df['created_at'].dt.strftime('%Y-%m')

    # Add import variable
    df2 = pd.read_csv(dir4, usecols=['Year','Product/Sector Code','Value','Reporting Economy','Partner Economy'])

    if region == "US":
        importer = "United States of America"
    elif region == "China":
        importer = "China"

    def f(year):
        """Sum of 'Value' reported by `importer` in `year`, divided by 1e12."""
        # NOTE(review): the sum runs over every product code and partner in the
        # file — confirm the file holds no aggregate rows that would be double counted.
        return df2[(df2['Year'] == year) & (df2['Reporting Economy'] == importer)]['Value'].sum()/1000000000000

    # Evaluate the yearly total once per distinct year instead of once per row.
    import_by_year = {year: f(year) for year in df['created_year'].unique()}
    df['import_num'] = df['created_year'].map(import_by_year)

    # Drop articles from years with no import data (sum of nothing is 0).
    # The original computed this filter but discarded the result.
    df = df[df['import_num'] != 0].reset_index(drop = True)

    # Event date that separates the pre- and post-trade-war periods
    if data_set == 'origion':
        event = '2016-05-02'
    elif data_set == 'robust':
        event = '2016-11-09'
    else:
        # Fail here with a clear message rather than with a NameError on `event` below.
        raise ValueError("Unknown data_set %r; expected 'origion' or 'robust'" % (data_set,))

    # Add D_twar: 1 for articles dated strictly after the event date
    df['D_twar'] = (df['created_at'] > event).astype(int)

    # for US
    if region == 'US':
        # Add X_1 for political polarity
        # NOTE(review): `right` is not defined anywhere in the visible code; this
        # branch raises NameError unless another cell defines it.
        df['X_political'] = df['source'].isin(right).astype(int)

        # Add X_2 for Chinese ratio
        # (path kept in its own name so the builtin `dir` is not shadowed)
        media_location_path = 'data/American Community Survey/US news media location.csv'
        media_location = pd.read_csv(media_location_path).rename(columns={'news': 'source'})
        df = df.merge(media_location[['source','ratio']], on='source', how='left').rename(columns={'ratio': 'X_chinese'})

    # for China
    else:
        # Add X_1 for mainland sources
        df['X_mainland'] = df['source'].isin(mainland).astype(int)

    df = df.sort_values(by=['created_at']).reset_index(drop=True)

    # Add control variables

    # Add D_suit: 1 when the article's month equals the month of a
    # 'consultations_date' in the dispute-settlement file
    suit_months = pd.to_datetime(pd.read_csv(dir5)['consultations_date']).dt.strftime('%Y-%m').tolist()
    df['D_suit'] = df['created_month'].isin(suit_months).astype(int)

    # Add Population (divided by 1e9)
    df3 = pd.read_csv(dir6)
    df3 = df3[(df3['Country Code'] == 'CHN') | (df3['Country Code'] == 'USA')].reset_index(drop = True)
    # Select the country's row by code, not by position, so the result does
    # not depend on the row order of the CSV.
    country_code = 'USA' if region == 'US' else 'CHN'
    population = df3.loc[df3['Country Code'] == country_code].iloc[0]
    # Year columns are addressed by their string label, e.g. '2016'.
    df['Population'] = df['created_year'].map(lambda year: population['%s' % year])/1000000000

    # Monthly period key used to merge the monthly uncertainty index
    df['period'] = df['created_at'].dt.to_period('M')

    for variable in ['EPU index', 'Financial market index']:

        if variable == 'EPU index':
            # Monthly uncertainty index, matched on calendar month.
            cv = pd.read_csv(dir3a %('PUI_monthly'))

            cv['period'] = cv.apply(lambda x:'%d-%d' % (x['year'],x['month']),axis=1)
            cv['period'] = pd.to_datetime(cv['period']).dt.to_period('M')

            # Add EPU index
            df = df.merge(cv[['period','EPU']], on='period', how='left')
        elif variable == 'Financial market index':
            # Market index file: 'spx' for US news, '000001' for Chinese news.
            index = 'spx' if region == 'US' else '000001'
            cv = pd.read_csv(dir3 %(variable,index),usecols=['Date', 'Close', 'Return'])

            cv = cv.rename(columns={'Date': 'created_at', 'Close':'market_idx','Return':'market_return'})
            cv['created_at'] = pd.to_datetime(cv['created_at'])

            # Expand to every calendar day between the first and last row of
            # the file and carry the last observation forward on the added days.
            # NOTE(review): the original comment said missing returns are set to
            # zero, but the forward fill also carries 'market_return' forward — confirm intent.
            calendar = pd.date_range(cv.iloc[0]['created_at'], cv.iloc[-1]['created_at'])
            # .ffill() replaces the deprecated fillna(method='pad'); same result.
            cv = cv.set_index(['created_at']).reindex(calendar).ffill()
            # One-calendar-day lags of the index level and the return
            cv['market_idx-1'] = cv['market_idx'].shift(1)
            cv['market_return-1'] = cv['market_return'].shift(1)
            # The reindexed axis has no name, so reset_index() calls it 'index'.
            cv = cv.reset_index().rename(columns={'index': 'created_at'})

            # Add return of market index
            df = df.merge(cv[['created_at', 'market_return', 'market_return-1', 'market_idx', 'market_idx-1']], on='created_at', how='left')

    df = df.drop(columns='period')
    df = df.sort_values(by=['source']).reset_index(drop=True)

    # Replace missing market return as zero
    df['market_return'] = df['market_return'].fillna(0)

    # Drop rows with missing values, then rows with infinite values
    df = df.dropna()
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    df = df.reset_index(drop=True)

    # Pre-war average of the dependent variable. The cutoff is the same
    # `event` date that defines D_twar, so both data_set variants stay
    # consistent (the original hard-coded '2016-05-02' here).
    condition = df['created_at'] < pd.to_datetime(event)
    prewar_import = df[condition]['import_num'].mean()

    # Panel index: entity = source, time = created_at
    df = df.set_index(['source','created_at'])
    df = df.sort_values(['source', 'created_at'], ascending=[True, True])

    # Panel OLS

    from linearmodels.panel import PanelOLS
    import statsmodels.api as sm

    # Dependent variable and regressors (a constant column is added)
    dependent = df['import_num']
    regressors = ['D_twar','EPU','market_return-1','Population', 'D_suit']
    exog = sm.add_constant(df[regressors])

    # No entity or time effects; clustered covariance without entity clustering
    mod = PanelOLS(dependent, exog, entity_effects=None, time_effects=None, drop_absorbed=True)
    res = mod.fit(cov_type='clustered', cluster_entity=False)

    # Help:
    # res.params -> coefficients, res.std_errors -> standard errors,
    # res.pvalues -> p-values

    # T-war effect: D_twar coefficient as a percentage of the pre-war
    # average of import_num, rounded to one decimal
    coef = res.params['D_twar']
    p_value = res.pvalues['D_twar']

    # First cutoff the p-value falls under decides the stars; none otherwise
    star = next(
        (marks for cutoff, marks in ((.01, '***'), (.05, '**'), (.1, '*')) if p_value < cutoff),
        '',
    )
    twar_effect = str(round(coef/prewar_import * 100,1)) + star
    print('%s region\'s t-war effect: %s'%(region, twar_effect))

    # Customerized output
    sub_record = []
    for var in ['D_twar', 'EPU', 'market_return-1','Population', 'D_suit', 'const']:
        x = res.params[var]
        p_value = res.pvalues[var]
        # add coef
        sub_record.append(formatCoef(x, p_value))
        # add se
        se = '[%s]' % formatFloat("%.3f", res.std_errors[var])
        sub_record.append(se)

    r5 = res.rsquared_overall
    obs= f"{res.nobs:,d}" 
    x6 = region
    x7 = formatCoef(prewar_import)
    x8 = twar_effect

    record = sub_record + [r5, obs, 'No', 'No', x6, x7, x8]
    data = {**data, **{'%s (%d)' %(region, count): record}}

    # Package output
    reg2 = res
    name2 = '%s' %(region)
    count += 1

print(compare({name2:reg2},precision = 'std_errors'))

In [None]:
# Build the results table once and print it as LaTeX source.
results_table = pd.DataFrame(data)
print(results_table.to_latex(index=False))