# Import and format MP datasets for SCHOLAR

In [1]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh

# MP speeches

## load booking data and save in SCHOLAR format

In [3]:
# Resolve the raw-data directory and the SCHOLAR output directory beneath it.
if sys.platform == "darwin":
    # No macOS location is configured in this notebook. Fail here with a clear
    # message instead of a NameError when the paths are first used below.
    raise RuntimeError(
        "raw_data_path is not configured for macOS (darwin); "
        "set raw_data_path and output_dir in this cell before running."
    )
else:
    raw_data_path = "/nfs/home/maxa/data/mp_speeches/"
    output_dir = raw_data_path + "scholar/"

# Create the output directory if it is missing. An already existing directory
# is fine; any other failure is reported instead of being silently swallowed,
# while the cell still continues as it did before (best effort).
try:
    os.makedirs(output_dir, exist_ok=True)
except OSError as err:
    print("WARNING: could not create output directory:", err, file=sys.stderr)
print(raw_data_path, "\n",output_dir)

/nfs/home/maxa/data/mp_speeches/ 
 /nfs/home/maxa/data/mp_speeches/scholar/


## vocab

In [7]:
# Read the vocabulary table (the file has no header row) and keep the column
# labelled 1 as a plain Python list.
vocab_csv = raw_data_path +'/datasets_by_meet_date/alltextvocab_tokens_tfidf17.csv'
vocab_df = pd.read_csv(vocab_csv, header=None)
vocab = [token for token in vocab_df[1]]
len(vocab)

2034

In [9]:
# Preview the first 20 vocabulary entries.
vocab[0:20]

['about',
 'abroad',
 'absence',
 'absorb',
 'academic',
 'accelerate',
 'accelerated',
 'acceleration',
 'accept',
 'access',
 'accommodation',
 'accommodative',
 'accompanied',
 'accord',
 'accordingly',
 'accountability',
 'accounted',
 'accounting',
 'accounts',
 'accumulation']

In [10]:
# Write the vocabulary list to train.vocab.json inside the output directory.
vocab_json_path = output_dir + "train.vocab.json"
fh.write_to_json(vocab, vocab_json_path, indent=2, sort_keys=True)

## dtm

In [11]:
# Load the document-term matrix stored as a NumPy .npy file.
dtm_path = raw_data_path + "datasets_by_meet_date/REGALLTEXT_dtm_unigrams_alltext_tfidf17.npy"
x_bow_raw = np.load(dtm_path)

In [13]:
# Show the dimensions of the loaded array.
np.shape(x_bow_raw)

(145, 2034)

In [14]:
# Work on an independent, at-least-2-D copy of the loaded array.
# A regular ndarray is used instead of np.matrix, which NumPy no longer
# recommends; the row slicing and sparse conversion below behave identically
# on a 2-D ndarray.
x_bow = np.array(x_bow_raw, ndmin=2)

In [15]:
# Row index at which the test split starts. With a training share of 1.0,
# every row ends up in the training split and the test split is empty.
train_share = 1.0
n_rows = x_bow.shape[0]
cut = int(train_share * n_rows)
cut

145

In [16]:
# Rows before `cut` form the training split; the remaining rows form the test split.
x_bow_train, x_bow_test = x_bow[:cut], x_bow[cut:]
(x_bow_train.shape, x_bow_test.shape)

((145, 2034), (0, 2034))

In [17]:
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar", "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(raw_data_path, "scholar", "test.npz"))

## doc IDs

In [18]:
# Build one string id per row of each split: "train_0", "train_1", ... and
# "test_0", "test_1", ...
n_train, n_test = x_bow_train.shape[0], x_bow_test.shape[0]
train_ids = ["train_" + str(row) for row in range(n_train)]
test_ids = ["test_" + str(row) for row in range(n_test)]

In [19]:
# Write both id lists to JSON in the output directory, using the same
# formatting options for each file.
json_opts = dict(indent=2, sort_keys=True)
fh.write_to_json(train_ids, output_dir + "train.ids.json", **json_opts)
fh.write_to_json(test_ids, output_dir + "test.ids.json", **json_opts)

## labels & covariates

In [24]:
# Read the dataset CSV: the first row holds the column names and the first
# column is used as the index. Then preview the first three rows.
gb_csv = raw_data_path + "/datasets_by_meet_date/REGALLTEXT_gb_dataset.csv"
df = pd.read_csv(gb_csv, header=0, index_col=0)
df.head(n=3)

Unnamed: 0,meet_date,GB_date,pt,idMM,header,text,gb_date,fomc_date,yq,unemp0,...,L1cpix_fe3,L1cpix_fe4,L1rgdp_fe1,L1rgdp_fe2,L1rgdp_fe3,L1rgdp_fe4,L1pgdp_fe1,L1pgdp_fe2,L1pgdp_fe3,L1pgdp_fe4
0,1990-02-07,1990-01-31,pt2,Int.DmdEME,Developments in East European Economies,Poland and Yugoslavia have im...,1990-01-31,1990-02-07,1990-01-01,5.5,...,1.4,-0.3,-0.1,0.1,-0.2,-4.7,1.7,0.1,0.2,0.1
1,1990-03-27,1990-03-21,pt2,Fin.Govt,Municipal Securities,Gross issuance of long-term munici...,1990-03-21,1990-03-27,1990-01-01,5.3,...,-0.4,2.2,-1.3,-0.2,-4.8,-4.5,0.7,0.0,0.3,-0.1
2,1990-05-15,1990-05-09,pt1,For.Ec.Summary,Near-term Economic Conditions,Data received since the March ...,1990-05-09,1990-05-15,1990-04-01,5.4,...,-0.5,2.1,-1.3,-0.3,-4.8,-4.1,0.9,0.1,-0.1,0.1


In [25]:
# define variables
d_u = df.unemp4 - df.unemp0
L1d_u = df.L1unemp4 - df.L1unemp0

d_cpi = df.cpi4 - df.cpi0
L1d_cpi = df.L1cpi4 - df.L1cpi0

d_rgdp = df.rgdp4 - df.rgdp0
L1d_rgdp = df.L1rgdp4 - df.L1rgdp0

In [75]:
# Export, for every target variable, the de-meaned target and the de-meaned
# covariates of the train and test splits as CSV files in `output_dir`.
names = ["u","cpi","gdp"]
targets = [d_u, d_cpi, d_rgdp]

# The covariates are the same for every target, so select and de-mean them once.
# De-mean data to have a fair comparison against pure OLS.
# The pandas column-wise .mean() is used instead of np.mean(): on newer pandas
# (2.0+) np.mean(DataFrame) reduces over both axes and returns one scalar,
# which would subtract the grand mean rather than each column's own mean.
covars = df[["L1unemp0","L1cpi0","L1rgdp0"]]
covars = covars - covars.mean()

for name, target in zip(names, targets):
    y = target - target.mean()
    x = covars

    # Split positionally at `cut`. Each piece is copied before it is
    # re-indexed with the document ids so `x` and `y` keep their own index.
    train_y = y.iloc[:cut].copy()
    train_y.index = train_ids
    train_y.to_csv(output_dir + "train.target_{}.csv".format(name))

    test_y = y.iloc[cut:].copy()
    test_y.index = test_ids
    test_y.to_csv(output_dir + "test.target_{}.csv".format(name))

    train_covars = x.iloc[:cut].copy()
    train_covars.index = train_ids
    train_covars.to_csv(output_dir + "train.covars_{}.csv".format(name))

    test_covars = x.iloc[cut:].copy()
    test_covars.index = test_ids
    test_covars.to_csv(output_dir + "test.covars_{}.csv".format(name))

    print(train_y.shape, train_covars.shape, test_y.shape, test_covars.shape)

(145,) (145, 3) (0,) (0, 3)
(145,) (145, 3) (0,) (0, 3)
(145,) (145, 3) (0,) (0, 3)
