In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import sys 
import os
sys.path.append(os.path.abspath("/Users/13793/Desktop/aas/stock"))
import processer as processer


In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
MAX_LEN = 50

In [3]:
#BERT processing function
def prep(data):
    input_ids = []
    attention_masks = []
    
    for i in data:
        encoding = tokenizer.encode_plus(
                text=i, 
                add_special_tokens=True, # adds special chars [CLS] and [SEP] to encoding 
                padding='max_length', # pad the tweets with 0s to fit max length
                max_length = MAX_LEN, # assign max length
                truncation=True, 
                return_tensors="pt", 
                return_attention_mask=True )

        # add the encodings to the list
        input_ids.append(encoding.get('input_ids'))
        attention_masks.append(encoding.get('attention_mask'))
    
    # return the lists as tensors
    input_ids = torch.concat(input_ids)
    attention_masks = torch.concat(attention_masks)
    
    return input_ids, attention_masks

In [4]:
# Define the Bert NLP Classifier
class BertClassifier(nn.Module):
    def __init__(self, freeze=False):
        super(BertClassifier, self).__init__()
        input_layer = 768
        hidden_layer = 40
        output_layer = 2

        # Use the pretrained Bert model for first section of NN
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Define a final layer to attach to the Bert model for custom classification
        self.classifier = nn.Sequential(
            nn.Linear(input_layer, hidden_layer), 
            nn.ReLU(), 
            nn.Linear(hidden_layer, output_layer))

        # Freeze the model from updating
        if freeze:
            for i in self.bert.parameters():
                i.requires_grad = False
        
    # Return classification from Bert model 
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        layer = outputs[0][:, 0, :]
        logits = self.classifier(layer)

        return logits

In [5]:
# Check if GPU is available and assign device 
if torch.cuda.is_available():       
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = BertClassifier(freeze=False)
model.load_state_dict(torch.load('stock_sentiment_model.pt'))
model.to(device)


BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [6]:
# Get the list of stock data to convert
files = os.listdir('data/')

# for each stock files
for x in range(len(files)):
    # open the excel file on the Stream sheet
    stock = pd.read_excel('data/'+files[x] + '/export_dashboard_' + files[x], sheet_name='Stream')

    # Assign the ticker name as a column
    stock['Ticker'] = files[x].split('_')[0]
    
    # Convert string date times to datetime
    stock['Date'] = pd.to_datetime(stock['Date'])
    stock['Hour'] = stock['Hour'].apply(lambda t: pd.Timedelta(hours=int(t[:2]), minutes=int(t[3:])))
    stock['Datetime'] = stock['Date'] + stock['Hour']

    # Rename column that holds the tweets content
    stock.rename(columns = {'Tweet content':'Text'}, inplace = True)

    # Pre process the tweet content
    stock = processer.Preprocess_Tweets(stock)

    # Remove excess columns
    stock = stock[['Tweet Id', 'Ticker', 'Datetime', 'Text', 'Favs', 'RTs', 'Followers', 'Following', 'Is a RT']]
    
    # Fill NAs in Favs, RTs, Followers and Following with 0
    stock = stock.fillna(0)

    # Encode processed tweets for Bert NLP model
    stock_inputs, stock_masks = prep(stock['Text'].values)

    batch_size = 16
    # Put stock data in PyTorch dataloader for processing 
    stock_data = TensorDataset(stock_inputs, stock_masks)
    stock_sampler = RandomSampler(stock_data)
    stock_dataloader = DataLoader(stock_data, sampler=stock_sampler, batch_size=batch_size)

    # Assign model to evaluate 
    model.eval()

    predictions = []

    # For each batch
    for batch in stock_dataloader:
        # Get encoded inputs and masks 
        batch_inputs, batch_masks = batch

        # Send variables to device (GPU if available)
        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)

        # Predict classes with Bert for given inputs 
        with torch.no_grad():
            logits = model(batch_inputs, batch_masks)

        # Convert predictions to 0s and 1s
        preds = torch.argmax(logits, dim=1).flatten()
        predictions.append(preds)

    # Combine all batch predictions
    predictions = torch.cat(predictions).cpu().numpy()
    
    # Add predictions to stock dataframe
    stock['Sentiment'] = predictions
    
    # save predictions as new csv
    stock.to_csv('data/'+files[x] +'/stock_data_sentiment.csv', index=False)
    
    # Show stock names as they are completed 
    print(files[x].split('_')[0], '- completed')

  pat = re.compile(pat, flags=flags)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


aal - completed


Unnamed: 0,Tweet Id,Ticker,Datetime,Text,Favs,RTs,Followers,Following,Is a RT,Sentiment
0,743011665663295491,aal,2016-06-15 09:26:00,american airlines group inc aal novo nordisk n...,0.0,0.0,2039.0,108.0,False,1
1,742994700563558400,aal,2016-06-15 08:18:00,yesterdays top fallers anglo american aal anto...,0.0,0.0,1792.0,80.0,False,0
2,742991573181423618,aal,2016-06-15 08:06:00,saquickideas 5 large cap stocks lowest enterpr...,0.0,0.0,1589.0,1376.0,False,1
3,742991250899513345,aal,2016-06-15 08:04:00,jpmorgan chase co reiterates underweight ratin...,0.0,0.0,771.0,8.0,False,0
4,742990282380173313,aal,2016-06-15 08:01:00,dal aal 5 large cap stocks lowest enterprise m...,0.0,0.0,788.0,6.0,False,1
...,...,...,...,...,...,...,...,...,...,...
6502,707914551908552705,aal,2016-03-10 13:02:00,u vie 4 flights,1.0,0.0,248.0,156.0,False,1
6503,707908040348016640,aal,2016-03-10 12:36:00,new stock pick psnp picking up serious attenti...,0.0,0.0,12543.0,0.0,False,0
6504,707907421889495040,aal,2016-03-10 12:34:00,hot new stock alert psnp things really poised ...,0.0,0.0,19713.0,0.0,False,1
6505,707906875690512385,aal,2016-03-10 12:32:00,mineral exploration company heating up fast se...,0.0,0.0,12556.0,0.0,False,1
