In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [2]:
import pandas as pd
import numpy as np

In [3]:
from tqdm import tqdm_notebook as tqdm

In [4]:
from torch.nn.utils.rnn import pad_sequence

In [5]:
from torch.utils.data import Dataset

In [6]:
from torch.utils.data import DataLoader

In [7]:
import torch.nn as nn

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
from transformers.optimization import AdamW

In [10]:
stock_data = pd.read_csv("stock_data.csv")

# Tokenize

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
stock_data['tokens'] = stock_data['Text'].apply(lambda x: ['[CLS]'] + tokenizer.tokenize(x) + ['[SEP]'])

In [13]:
stock_data['token_ids'] = stock_data['tokens'].apply(lambda x: tokenizer.convert_tokens_to_ids(x))

In [14]:
stock_data['Sentiment'] = stock_data['Sentiment'].replace(-1, 0)

# Split Data

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(stock_data[['token_ids']], stock_data[['Sentiment']], test_size=0.33, random_state=42)

# Load Data

In [19]:
class SentenceDataset(Dataset):   
    def __init__(self, ids, labels):
        self.instances = []
        for ids_i, label in zip(ids, labels):
            self.instances.append({"ids": ids_i, "labels": label})  
    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        sample = self.instances[idx]

        return sample

In [20]:
train_dataset = SentenceDataset(X_train['token_ids'], Y_train['Sentiment'])
test_dataset = SentenceDataset(X_test['token_ids'], Y_test['Sentiment'])

In [21]:
def collate(batch):
    padded_ids = pad_sequence([torch.tensor(instance['ids']) for instance in batch], batch_first=True)
    padded_ids = padded_ids#.to(device)
    
    labels = torch.stack([torch.tensor([instance['labels']]) for instance in batch])
    labels = labels#.to(device)
    return {'ids': padded_ids, 'labels': labels}

In [22]:
dataloader_train = DataLoader(train_dataset, batch_size=8, shuffle = True, collate_fn = collate)

# Build Model

In [23]:
class baseline(nn.Module):
    
    def __init__(self):        
        super(baseline, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(768, 1)

    def forward(self, batch):
        # batch['ids'] = (batch_size, sent_len)
        # batch['segment_ids'] = (batch_size, sent_len)
        # batch['mask_ids'] = = (batch_size, sent_len)
        # output = (batch_size, 1)
        hidden_state, pooler_output = self.bert(batch['ids'])
        linear_output = self.linear(pooler_output)
        
        return linear_output

    def loss(self, batch):
       
        loss_fn = nn.BCEWithLogitsLoss()
        output = self.forward(batch)
        target = batch['labels'].float().to(device)
        
        return loss_fn(output, target)
    
    def _predict(self, batch):
        
        output = self.forward(batch)
        scores = torch.sigmoid(output)
        scores = scores.cpu().numpy()[:,0].tolist()
        
        return scores

In [24]:
baseline_model = baseline()

In [25]:
def optim(nn, num_epochs, lr):
    param_optimizer = list(nn.bert.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_epochs = num_epochs
    num_train_optimization_steps = len(dataloader_train) * num_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    
    '''
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                     num_warmup_steps=int(
                                                         num_train_optimization_steps * 0.1),
                                                     num_training_steps=num_train_optimization_steps)
                                                     '''
    return optimizer#, scheduler

In [28]:
def train(network, data, num_epochs, lr):
    
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim(network, num_epochs, lr)
    
    #sp_golds = validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    
    for current_epoch in range(num_epochs):
        network.train()
        running_loss = 0.0
        for batch in tqdm(data):
            optimizer.zero_grad()
            current_output = network(batch)
            current_target = batch['labels'].to(dtype=torch.float)
            current_loss = loss_fn(current_output, current_target)

            current_loss.backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), 1.0)
            optimizer.step()
            running_loss += current_loss.item()
            
        #learning_rate_scalar = scheduler.get_lr()[0]
        #print('lr = %f' % learning_rate_scalar)
        avg_loss = running_loss/len(data)
        print('epoch %d train_loss: %.3f' % (current_epoch, avg_loss))

In [29]:
train(baseline_model, dataloader_train, 1, 0.00002)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=485.0), HTML(value='')))


epoch 0 train_loss: 0.575


In [36]:
torch.save(baseline_model.state_dict(), "model_1")