In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import torch
import spacy
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import string
from torch.utils.data import Dataset
from collections import Counter
import os
from scipy.stats import spearmanr
from nltk.stem import PorterStemmer
import pickle as pkl

# Tell the OpenMP runtime to tolerate duplicate copies of itself being loaded.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

# Seed every random number generator the notebook uses so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Preferred GPU. It is only selected when the machine really exposes it, so the
# CPU fallback below stays reachable instead of crashing in set_device().
GPU_INDEX = 3
if torch.cuda.is_available() and torch.cuda.device_count() > GPU_INDEX:
    torch.cuda.set_device(GPU_INDEX)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PAD_IDX = 0                   # vocabulary index used for padding
UNK_IDX = 1                   # vocabulary index used for out-of-vocabulary tokens
MAX_SENTENCE_LENGTH = 20000   # every sample is truncated / padded to this many tokens


In [4]:
# Report the active device: the current CUDA device index, or 'cpu' when CUDA is
# unavailable (torch.cuda.current_device() raises on CPU-only machines).
print('Using: ', torch.cuda.current_device() if torch.cuda.is_available() else 'cpu')

Using:  3


### read data

In [5]:
# read stock as dictionary
def read_stock(year='2013'):
    """
    Load the target values of one year from 'data/all.logvol/<year>.logvol.+12.txt'.

    Every non-blank line holds two whitespace-separated fields: a float value
    followed by the key it belongs to. Blank lines are skipped.

    @param year: year string used to build the file name
    @return: dict mapping each key to its float value
    """
    path = 'data/all.logvol/' + year + '.logvol.+12.txt'
    stock_dic = {}
    # the context manager closes the file even if a line fails to parse
    with open(path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            v, k = fields
            stock_dic[k] = float(v)
    return stock_dic

In [6]:
# read report and combine with stock
def read_report(year='2013'):
    """
    Load every report file of one year together with its target value.

    @param year: year string selecting the directory 'data/all.tok/<year>.tok/'
                 and the target file read by read_stock
    @return: (all_txt, all_targets), two parallel lists with exactly one text
             and one target per report file
    """
    path = 'data/all.tok/'+year+'.tok/'
    stock_dic = read_stock(year)
    all_txt = []
    all_targets = []
    for file in os.listdir(path):
        # the part of the file name before the first '.' is the key into stock_dic
        report_name = file.split('.')[0]
        with open(path+file) as f:
            lines = [line.rstrip() for line in f]
        # Keep one text per report so texts and targets stay index-aligned even
        # when a file spans several lines (a single-line file is unchanged).
        all_txt.append(' '.join(lines))
        all_targets.append(stock_dic[report_name])
    return all_txt, all_targets

In [7]:
# Accumulate the training corpus and its targets one year at a time.
train_data, train_targets = [], []
for year in ('2008', '2009', '2010'):
    all_txt, all_targets = read_report(year)
    train_data += all_txt
    train_targets += all_targets

In [8]:
# The 2011 reports are used as the validation set.
val_data, val_targets = read_report(year='2011')

### tokenize and build vocab

In [9]:
# spaCy English pipeline.
# NOTE(review): `tokenizer` is not referenced by any code visible in this notebook — confirm it is still needed.
tokenizer = spacy.load('en_core_web_sm')
# Porter stemmer used by tokenize_dataset.
stemmer = PorterStemmer()

#create n-gram datasets
def tokenize_dataset(dataset, n_gram=1):
    """
    Split every sample on whitespace, stem each word and build n-grams.

    Each word is stemmed on its own before the n-grams are joined, so every word
    of a multi-word n-gram is normalised (stemming the joined string would only
    affect its tail). For n_gram=1 the result is simply the stem of each word.

    @param dataset: iterable of strings, one per sample
    @param n_gram: number of consecutive words joined (by a space) into one token
    @return: (token_dataset, all_tokens) where token_dataset holds one token list
             per sample and all_tokens is the flat concatenation of those lists,
             used later to build the vocabulary
    """
    token_dataset = []
    # we are keeping track of all tokens in dataset
    # in order to create vocabulary later
    all_tokens = []
    # words repeat a lot, so each distinct word is stemmed only once
    stem_cache = {}

    def stem(word):
        cached = stem_cache.get(word)
        if cached is None:
            cached = stem_cache[word] = stemmer.stem(word)
        return cached

    for sample in dataset:
        stems = [stem(word) for word in sample.split()]
        n_tokens = [' '.join(stems[i:i+n_gram]) for i in range(len(stems)-n_gram+1)]
        token_dataset.append(n_tokens)
        all_tokens += n_tokens
    return token_dataset, all_tokens

def build_vocab(all_tokens, max_vocab_size = None):
    """
    Build the token <-> index mappings from the most frequent tokens.

    @param all_tokens: flat list of tokens (may be empty)
    @param max_vocab_size: keep at most this many tokens; a falsy value keeps all
    @return: (token2id, id2token)
    """
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    # save index 0 for pad and 1 for unk
    PAD_IDX = 0
    UNK_IDX = 1
    token_counter = Counter(all_tokens)
    if not max_vocab_size:
        max_vocab_size = len(all_tokens)
    # a comprehension (rather than zip(*...)) also works when there are no tokens
    vocab = [token for token, _ in token_counter.most_common(max_vocab_size)]
    id2token = ['<pad>', '<unk>'] + vocab
    # real tokens start at index 2, right after the two special tokens
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

# convert token to id in the dataset
def token2index_dataset(tokens_data):
    """
    Replace every token by its index in the global token2id mapping; tokens that
    are not in the mapping become UNK_IDX.

    @param tokens_data: list of token lists, one per sample
    @return: list of index lists with the same nesting
    """
    return [[token2id.get(token, UNK_IDX) for token in tokens]
            for tokens in tokens_data]


### build data loader

In [10]:
class Data(Dataset):
    """
    PyTorch Dataset that pairs each sample's token indices with its target.
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_list, target_list):
        """
        @param data_list: list of token-index lists, one per sample
        @param target_list: list of targets, index-aligned with data_list
        """
        # both lists are indexed with the same key, so they must be equally long
        assert (len(data_list) == len(target_list))
        self.data_list = data_list
        self.target_list = target_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: [token indices truncated to MAX_SENTENCE_LENGTH,
                  number of kept tokens, target]
        """
        tokens = self.data_list[key][:MAX_SENTENCE_LENGTH]
        return [tokens, len(tokens), self.target_list[key]]

def collate_func(batch):
    """
    Customized function for DataLoader: right-pads every sample with zeros up to
    MAX_SENTENCE_LENGTH so the whole batch can be stacked into one tensor.

    @param batch: list of [token_idx, length, label] items as produced by Data
    @return: [padded token tensor, LongTensor of lengths, DoubleTensor of labels],
             all moved to the global `device`
    """
    length_list = [datum[1] for datum in batch]
    label_list = [datum[2] for datum in batch]
    # padding
    data_list = [
        np.pad(np.array(datum[0]),
               pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])),
               mode="constant", constant_values=0)
        for datum in batch
    ]
    tokens = torch.from_numpy(np.array(data_list)).to(device)
    lengths = torch.LongTensor(length_list).to(device)
    labels = torch.DoubleTensor(label_list).to(device)
    return [tokens, lengths, labels]


In [11]:
# Tokenizing the corpus is slow, so the results are cached as pickles under data/
# and only recomputed when a cache file is missing.
# NOTE: pickle.load can execute arbitrary code from the file, so only load caches
# you produced yourself.
# NOTE(review): the cached token lists are paired by index with train_targets /
# val_targets later on; delete the caches whenever the report files change.
TRAIN_TOKENS_CACHE = "data/train_data_tokens.p"
ALL_TRAIN_TOKENS_CACHE = "data/all_train_tokens.p"
VAL_TOKENS_CACHE = "data/val_data_tokens.p"

# train set tokens
if os.path.exists(TRAIN_TOKENS_CACHE) and os.path.exists(ALL_TRAIN_TOKENS_CACHE):
    with open(TRAIN_TOKENS_CACHE, "rb") as f:
        train_data_tokens = pkl.load(f)
    with open(ALL_TRAIN_TOKENS_CACHE, "rb") as f:
        all_train_tokens = pkl.load(f)
else:
    print ("Tokenizing train data")
    train_data_tokens, all_train_tokens = tokenize_dataset(train_data)
    with open(TRAIN_TOKENS_CACHE, "wb") as f:
        pkl.dump(train_data_tokens, f)
    with open(ALL_TRAIN_TOKENS_CACHE, "wb") as f:
        pkl.dump(all_train_tokens, f)

# val set tokens
if os.path.exists(VAL_TOKENS_CACHE):
    with open(VAL_TOKENS_CACHE, "rb") as f:
        val_data_tokens = pkl.load(f)
else:
    print ("Tokenizing val data")
    val_data_tokens, _ = tokenize_dataset(val_data)
    with open(VAL_TOKENS_CACHE, "wb") as f:
        pkl.dump(val_data_tokens, f)


In [12]:
# Vocabulary is built from the training tokens only (5000 most frequent tokens
# plus the two special tokens); validation tokens outside it map to UNK_IDX.
token2id, id2token = build_vocab(all_train_tokens, max_vocab_size=5000)
train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
#test_data_indices = token2index_dataset(test_data_tokens)


BATCH_SIZE = 10
# Data asserts that the index lists and the targets have the same length.
train_dataset = Data(train_data_indices, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_func,
                                           shuffle=True)

# val_loader is read as a global by train_model for the periodic validation.
val_dataset = Data(val_data_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_func,
                                           shuffle=True)
# no need for test data right now
# test_dataset = NewsGroupDataset(test_data_indices, test_targets)
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=newsgroup_collate_func,
#                                            shuffle=False)


In [13]:
# Vocabulary size, including the two special tokens '<pad>' and '<unk>'.
len(id2token)

5002

## load word vectors

In [15]:
# Read all lines of the pretrained word-vector file; the context manager closes
# the file handle once the lines are in memory.
with open('data/syn.expand.200d.vec') as f:
    word_vecs = f.readlines()

In [16]:
# Parse each line into word -> list of its 200 vector components (kept as strings).
raw_embedding = {}
for vec in word_vecs:
    fields = vec.split()
    word, wordvec = fields[0], fields[1:]
    # only the part of the first field before the first '_' is used as the key
    word = word.split('_')[0]
    assert len(wordvec)==200, "embedding size is not right"
    raw_embedding[word] = wordvec

In [17]:
# One row per vocabulary entry. Rows start as zeros, so every token without a
# pretrained vector (other than '<unk>') simply keeps its all-zero row.
embedding_matrix = np.zeros((len(id2token), 200))
for word, i in token2id.items():
    embedding_vector = raw_embedding.get(word)
    if embedding_vector:
        # numpy converts the stored component strings to floats on assignment
        embedding_matrix[i] = embedding_vector
    elif word=='<unk>':
        embedding_matrix[i] = np.random.uniform(-0.25, 0.25, 200)

In [18]:
# Sanity check: (vocabulary size, embedding dimension).
embedding_matrix.shape

(5002, 200)

In [19]:
def train_model(loader, model, every=20, learning_rate=1e-3, num_epochs=10, predict_type='baseline'):
    """
    Train `model` with Adam on an MSE objective and save its weights afterwards.

    @param loader: iterable of (data, lengths, labels) batches that supports len()
    @param model: module called as model(data, lengths)
    @param every: run a validation pass every `every` iterations (never at i == 0)
    @param learning_rate: Adam learning rate
    @param num_epochs: number of passes over `loader`
    @param predict_type: prefix of the checkpoint file name

    Validation uses the global `val_loader`. The state dict is written to
    '<predict_type>_14epoch.pth'; the '14epoch' suffix is fixed and does not
    follow num_epochs (a later cell loads 'NTC_14epoch.pth' by that exact name).
    """
    # Criterion and Optimizer
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Train the model
    total_step = len(loader)
    for epoch in range(num_epochs):
        for i, (data, lengths, labels) in enumerate(loader):
            # test_model switches to eval mode, so re-enable training mode each step
            model.train()
            optimizer.zero_grad()
            # Forward pass
            outputs = model(data, lengths)
            loss = criterion(outputs, labels)
            # Backward and optimize
            loss.backward()
            optimizer.step()
            # validate every n iterations
            if i > 0 and i % every == 0:
                # validate
                loss_val, cor = test_model(val_loader, model)
                # the step total comes from the loader actually being iterated
                print('Epoch: [{}/{}], Step: [{}/{}], Training batch loss: {}, Validation loss: {}, Validation correlation: {}'.format(
                           epoch+1, num_epochs, i+1, total_step, loss.item(), loss_val, cor))
    torch.save(model.state_dict(), '{}_14epoch.pth'.format(predict_type))
    
    
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    criterion = torch.nn.MSELoss()
    model.eval()
    predictions = []
    targets = []
    for data, lengths, labels in loader:
        data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = model(data_batch, lengths_batch)
        predictions.extend(outputs.view(-1).tolist())
        targets.extend(label_batch.tolist())
    loss = criterion(torch.tensor(predictions), torch.tensor(targets))
    correlation = spearmanr(predictions, targets).correlation
    return loss, correlation



## CNN-baseline

In [None]:
class CNN(nn.Module):
    """
    Baseline CNN regressor: embedding -> parallel convolutions of several widths
    -> max-pool over `num_chunks` chunks of the sequence -> two linear layers that
    reduce first the filter axis and then the chunk axis to a single value.
    """

    def __init__(self, emb_size=200, num_filters=100, num_chunks=100, filter_sizes=(3, 4, 5), dropout=0.5, pretrained=False):
        """
        @param emb_size: embedding width (also the width of every conv kernel)
        @param num_filters: number of filters per kernel size
        @param num_chunks: number of pooled chunks per sequence
        @param filter_sizes: kernel heights, i.e. number of consecutive tokens covered
        @param dropout: dropout probability applied after pooling
        @param pretrained: if True, use the global embedding_matrix as frozen embeddings
        """
        super().__init__()
        if pretrained:
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix), freeze=True)
        else:
            self.embedding = nn.Embedding(len(id2token), emb_size, padding_idx = PAD_IDX)
        conv_layers = [nn.Conv2d(1, num_filters, (height, emb_size)) for height in filter_sizes]
        self.convs = nn.ModuleList(conv_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, 1)
        self.fc2 = nn.Linear(num_chunks, 1)
        self.num_chunks = num_chunks

    def chunk_maxpool(self, x, dim, k):
        """Split `x` into k chunks along `dim` and keep the maximum of each chunk."""
        pooled = [piece.max(dim=dim, keepdim=True).values
                  for piece in torch.chunk(x, chunks=k, dim=dim)]
        return torch.cat(pooled, dim=dim)

    def conv_and_pool(self, x, conv):
        """Apply one convolution + ReLU, drop the width axis and chunk-max-pool axis 2."""
        activated = F.relu(conv(x)).squeeze(3)
        return self.chunk_maxpool(activated, 2, self.num_chunks)

    def forward(self, x, lengths):
        """
        @param x: batch of token indices
        @param lengths: accepted for interface compatibility; not used here
        @return: one prediction per sample
        """
        # add the single input-channel axis expected by Conv2d
        channels = self.embedding(x).unsqueeze(1)
        pooled = torch.cat([self.conv_and_pool(channels, conv) for conv in self.convs], 1)
        # move the filter axis last so fc reduces it, leaving one value per chunk
        per_chunk = self.fc(self.dropout(pooled).permute(0,2,1)).squeeze(2)
        return self.fc2(per_chunk).squeeze(1)
    

In [20]:
# Baseline model with randomly initialised embeddings, in float64 to match the
# DoubleTensor labels produced by collate_func.
model = CNN().double().to(device)

In [21]:
# 100 chunks
# Uses the default predict_type, so the weights are saved as 'baseline_14epoch.pth'.
train_model(train_loader, model, every=200, learning_rate = 1e-3, num_epochs = 30)

## CNN-pretrained-conv-layer

In [None]:
class CnnStc(nn.Module):
    """
    CNN regressor whose convolution filters can be frozen ("static" convolutions).
    Same architecture as the baseline: embedding -> parallel convolutions ->
    max-pool over `num_chunks` chunks -> linear over filters -> linear over chunks.
    """

    def __init__(self, emb_size=200, num_filters=100, num_chunks=100, filter_sizes=(3, 4, 5), dropout=0.5, pretrained=False, conv_static=False):
        """
        @param emb_size: embedding width (also the width of every conv kernel)
        @param num_filters: number of filters per kernel size
        @param num_chunks: number of pooled chunks per sequence
        @param filter_sizes: kernel heights, i.e. number of consecutive tokens covered
        @param dropout: dropout probability applied after pooling
        @param pretrained: if True, initialise the (trainable) embeddings from the
                           global embedding_matrix
        @param conv_static: if True, exclude the convolution weights from training
        """
        super(CnnStc, self).__init__()
        if pretrained:
            self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix), freeze=False)
        else:
            self.embedding = nn.Embedding(len(id2token), emb_size, padding_idx = PAD_IDX)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (k, emb_size)) for k in filter_sizes])
        if conv_static:
            # freeze every conv layer; a comprehension variable is not visible
            # outside the comprehension, so the layers are iterated explicitly
            for conv in self.convs:
                conv.weight.requires_grad = False
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), 1)
        self.fc2 = nn.Linear(num_chunks, 1)
        self.num_chunks = num_chunks

    
    def chunk_maxpool(self, x, dim, k):
        """Split `x` into k chunks along `dim` and keep the maximum of each chunk."""
        # maxpool over chunks and shrink size to k
        chunks = torch.chunk(x, chunks=k, dim=dim)
        chunks = [torch.max(ts, dim=dim).values.unsqueeze(dim) for ts in chunks]
        return torch.cat(chunks, dim=dim)


    def conv_and_pool(self, x, conv):
        """Apply one convolution + ReLU, drop the width axis and chunk-max-pool axis 2."""
        x = F.relu(conv(x)).squeeze(3)
        #x = F.max_pool1d(x, x.size(2)).squeeze(2)
        x = self.chunk_maxpool(x, 2, self.num_chunks)
        return x
    
    def forward(self, x, lengths):
        """
        @param x: batch of token indices
        @param lengths: accepted for interface compatibility; not used here
        @return: one prediction per sample
        """
        embed = self.embedding(x)
        # add the single input-channel axis expected by Conv2d
        out = embed.unsqueeze(1)
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        #print('size after concat: ', out.size())
        # move the filter axis last so fc reduces it, leaving one value per chunk
        out = self.dropout(out).permute(0,2,1)
        out = self.fc(out).squeeze(2)
        #print ('size after fc: ', out.size())
        out = self.fc2(out).squeeze(1)
        #print ('output size: ', out.size())
        return out
    

### NTC

In [24]:
# Seed the convolution filters of a fresh CnnStc from a saved checkpoint.
pretrained_model_dict = torch.load('data/conv_nonstatic_fc_0_5_dropout.pt', map_location=device)
pretrained_model = CnnStc(pretrained=True).double().to(device)
with torch.no_grad():
    for i, conv in enumerate(pretrained_model.convs):
        saved = pretrained_model_dict['convs.{}.weight'.format(i)]
        # presumably stored as (filters, emb, k); rearranged to the Conv2d layout
        # (filters, 1, k, emb) — TODO confirm against the checkpoint
        conv.weight.copy_(saved.permute(0,2,1).unsqueeze(1))
        

In [25]:
# Fine-tune with trainable conv filters; weights are saved as 'NTC_14epoch.pth'
# (the '14epoch' suffix is fixed inside train_model).
train_model(train_loader, pretrained_model, every=200, learning_rate = 1e-3, num_epochs = 13, predict_type='NTC')

Epoch: [1/13], Step: [201/752], Training batch loss: 0.10735624238781864, Validation loss: 0.2934154272079468, Validation correlation: 0.13832066368942675
Epoch: [1/13], Step: [401/752], Training batch loss: 0.25415098939686687, Validation loss: 0.3476417660713196, Validation correlation: 0.16204965981223235
Epoch: [1/13], Step: [601/752], Training batch loss: 0.2791187567648869, Validation loss: 0.18884383141994476, Validation correlation: 0.19462689508667178
Epoch: [2/13], Step: [201/752], Training batch loss: 0.2934287824217342, Validation loss: 0.2484191507101059, Validation correlation: 0.3111630003911598
Epoch: [2/13], Step: [401/752], Training batch loss: 0.4884333977138745, Validation loss: 0.17875026166439056, Validation correlation: 0.3291538867540596
Epoch: [2/13], Step: [601/752], Training batch loss: 0.3071800188497012, Validation loss: 0.16299310326576233, Validation correlation: 0.38122499981952446
Epoch: [3/13], Step: [201/752], Training batch loss: 0.3074204030305283, 

In [26]:
# Rebuild the architecture and restore the weights saved by the NTC training run.
pretrained_model = CnnStc(pretrained=True).double().to(device)
# map_location (as used by the other torch.load calls in this notebook) keeps the
# load working when the checkpoint was saved from a different device
pretrained_model.load_state_dict(torch.load('NTC_14epoch.pth', map_location=device))
pretrained_model.eval()
# returns (validation MSE, Spearman correlation)
test_model(val_loader, pretrained_model)

(tensor(0.1695), 0.5501968749609781)

### STC

In [None]:
# Seed the convolution filters of a CnnStc with frozen conv weights from a saved checkpoint.
stc_model_dict = torch.load('data/conv_nonstatic_fc_0_5_dropout.pt', map_location=device)
stc_model = CnnStc(pretrained=True, conv_static=True).double().to(device)
with torch.no_grad():
    for i, conv in enumerate(stc_model.convs):
        saved = stc_model_dict['convs.{}.weight'.format(i)]
        # presumably stored as (filters, emb, k); rearranged to the Conv2d layout
        # (filters, 1, k, emb) — TODO confirm against the checkpoint
        conv.weight.copy_(saved.permute(0,2,1).unsqueeze(1))
        

In [None]:
# Train with the conv filters frozen; weights are saved as 'STC_14epoch.pth'
# (the '14epoch' suffix is fixed inside train_model).
train_model(train_loader, stc_model, every=200, learning_rate = 1e-3, num_epochs = 13, predict_type='STC')

## CNN-multichannel

In [20]:
class CnnMulti(nn.Module):
    """
    CNN regressor with an optional second, frozen embedding table.

    With multichannel=True the trainable and the frozen embeddings of each token
    are concatenated along the feature axis, and both the kernel width and the
    number of filters per kernel size are doubled accordingly.
    """

    def __init__(self, emb_size=200, num_filters=100, num_chunks=100, filter_sizes=(3, 4, 5), dropout=0.5, pretrained=False, conv_static=False, multichannel=False):
        """
        @param emb_size: width of one embedding table
        @param num_filters: filters per kernel size (doubled when multichannel)
        @param num_chunks: number of pooled chunks per sequence
        @param filter_sizes: kernel heights, i.e. number of consecutive tokens covered
        @param dropout: dropout probability applied after pooling
        @param pretrained: if True, initialise the trainable embeddings from the
                           global embedding_matrix
        @param conv_static: if True, exclude the convolution weights from training
        @param multichannel: if True, add the frozen copy of embedding_matrix
        """
        super().__init__()
        self.embedding = (
            nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix), freeze=False)
            if pretrained
            else nn.Embedding(len(id2token), emb_size, padding_idx = PAD_IDX)
        )

        self.multichannel = multichannel
        # every size that depends on the number of embedding tables scales by this factor
        scale = 2 if multichannel else 1
        if multichannel:
            self.embedding_multi = nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix), freeze=True)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_filters*scale, (height, emb_size*scale)) for height in filter_sizes])

        if conv_static:
            for layer in self.convs:
                layer.weight.requires_grad_(False)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters*scale*len(filter_sizes), 1)
        self.fc2 = nn.Linear(num_chunks, 1)
        self.num_chunks = num_chunks

    def chunk_maxpool(self, x, dim, k):
        """Split `x` into k chunks along `dim` and keep the maximum of each chunk."""
        pooled = [piece.max(dim=dim, keepdim=True).values
                  for piece in torch.chunk(x, chunks=k, dim=dim)]
        return torch.cat(pooled, dim=dim)

    def conv_and_pool(self, x, conv):
        """Apply one convolution + ReLU, drop the width axis and chunk-max-pool axis 2."""
        activated = F.relu(conv(x)).squeeze(3)
        return self.chunk_maxpool(activated, 2, self.num_chunks)

    def forward(self, x, lengths):
        """
        @param x: batch of token indices
        @param lengths: accepted for interface compatibility; not used here
        @return: one prediction per sample
        """
        embed = self.embedding(x)
        if self.multichannel:
            # trainable features first, frozen features second
            embed = torch.cat((embed, self.embedding_multi(x)), dim=2)
        # add the single input-channel axis expected by Conv2d
        channels = embed.unsqueeze(1)
        pooled = torch.cat([self.conv_and_pool(channels, conv) for conv in self.convs], 1)
        # move the filter axis last so fc reduces it, leaving one value per chunk
        per_chunk = self.fc(self.dropout(pooled).permute(0,2,1)).squeeze(2)
        return self.fc2(per_chunk).squeeze(1)
    
    

### NTC

In [25]:
# Seed the doubled conv layers of the multichannel model from the saved
# single-channel filters, laid out block-diagonally: filters [:100] only read
# input columns [:200] (the trainable embedding) and filters [100:] only read
# columns [200:] (the frozen embedding); every other weight starts at zero.
mul_ntc_dict = torch.load('data/conv_nonstatic_fc_0_5_dropout.pt', map_location=device)
mul_ntc_model = CnnMulti(pretrained=True, conv_static=False, multichannel=True).double().to(device)

with torch.no_grad():
    for i, conv in enumerate(mul_ntc_model.convs):
        saved = mul_ntc_dict['convs.{}.weight'.format(i)].permute(0,2,1).unsqueeze(1)
        new_weight = torch.zeros_like(conv.weight, device=device, requires_grad=False)
        new_weight[:100, :, :, :200].copy_(saved)
        new_weight[100:,:,:, 200:].copy_(saved)
        conv.weight.copy_(new_weight)


In [None]:
# Train the multichannel model with trainable conv filters; weights are saved as
# 'mul-NTC_14epoch.pth' (the '14epoch' suffix is fixed inside train_model).
train_model(train_loader, mul_ntc_model, every=200, learning_rate = 1e-3, num_epochs = 15, predict_type='mul-NTC')

### STC

In [21]:
# Seed the doubled, frozen conv layers of the multichannel model from the saved
# single-channel filters, laid out block-diagonally: filters [:100] only read
# input columns [:200] (the trainable embedding) and filters [100:] only read
# columns [200:] (the frozen embedding); every other weight starts at zero.
mul_stc_dict = torch.load('data/conv_nonstatic_fc_0_5_dropout.pt', map_location=device)
mul_stc_model = CnnMulti(pretrained=True, conv_static=True, multichannel=True).double().to(device)

with torch.no_grad():
    for i, conv in enumerate(mul_stc_model.convs):
        saved = mul_stc_dict['convs.{}.weight'.format(i)].permute(0,2,1).unsqueeze(1)
        new_weight = torch.zeros_like(conv.weight, device=device, requires_grad=False)
        new_weight[:100, :, :, :200].copy_(saved)
        new_weight[100:,:,:, 200:].copy_(saved)
        conv.weight.copy_(new_weight)


In [22]:
# Train the multichannel model with frozen conv filters; weights are saved as
# 'mul-STC_14epoch.pth' (the '14epoch' suffix is fixed inside train_model).
train_model(train_loader, mul_stc_model, every=200, learning_rate = 1e-3, num_epochs = 14, predict_type='mul-STC')

Epoch: [1/14], Step: [201/752], Training batch loss: 0.3733088737266363, Validation loss: 0.2857387065887451, Validation correlation: 0.19462321891990322
Epoch: [1/14], Step: [401/752], Training batch loss: 0.2705385034121349, Validation loss: 0.2615768611431122, Validation correlation: 0.26436904207717005
Epoch: [1/14], Step: [601/752], Training batch loss: 0.5526325370363329, Validation loss: 0.18978549540042877, Validation correlation: 0.3127660344128434
Epoch: [2/14], Step: [201/752], Training batch loss: 0.1866009646904164, Validation loss: 0.17762236297130585, Validation correlation: 0.326104090820013
Epoch: [2/14], Step: [401/752], Training batch loss: 0.30348370400488667, Validation loss: 0.24659709632396698, Validation correlation: 0.3855784459445231
Epoch: [2/14], Step: [601/752], Training batch loss: 0.10525024429989582, Validation loss: 0.24576710164546967, Validation correlation: 0.36400320961895144
Epoch: [3/14], Step: [201/752], Training batch loss: 0.2667186560227735, V