Step 2: Establish a dataset with both stock and sentiment

In [2]:
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import sys 
import os
sys.path.append(os.path.abspath("/Users/13793/Desktop/aas/stock"))
import yfinance as yf
import matplotlib.pyplot as plt

# Add the project root directory to the Python path
import subprocess
result = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], universal_newlines=True)
git_root = result.strip()
sys.path.append(git_root)

import processer as processer

Prepare Sentiment Data

In [3]:
#Set up for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
MAX_LEN = 50

In [4]:
#BERT processing function setup
def prep(data):
    input_ids = []
    attention_masks = []
    
    for i in data:
        encoding = tokenizer.encode_plus(
                text=i, 
                add_special_tokens=True, 
                padding='max_length', 
                max_length = MAX_LEN, 
                truncation=True, 
                return_tensors="pt", 
                return_attention_mask=True )

        input_ids.append(encoding.get('input_ids'))
        attention_masks.append(encoding.get('attention_mask'))
    
    input_ids = torch.concat(input_ids)
    attention_masks = torch.concat(attention_masks)
    
    return input_ids, attention_masks

In [5]:
# Define the Bert NLP Classifier
class BertClassifier(nn.Module):
    def __init__(self, freeze=False):
        super(BertClassifier, self).__init__()
        input_layer = 768
        hidden_layer = 40
        output_layer = 2

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.classifier = nn.Sequential(
            nn.Linear(input_layer, hidden_layer), 
            nn.ReLU(), 
            nn.Linear(hidden_layer, output_layer))

        if freeze:
            for i in self.bert.parameters():
                i.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        layer = outputs[0][:, 0, :]
        logits = self.classifier(layer)

        return logits

In [6]:
# Check if GPU is available and assign device 
if torch.cuda.is_available():       
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = BertClassifier(freeze=False)
model.load_state_dict(torch.load('stock_sentiment_model.pt'))
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [7]:
stocks = pd.read_csv("hf://datasets/nbettencourt/SC454k/sc454k_clean.csv")

'HTTPSConnectionPool(host='cdn-lfs-us-1.huggingface.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/nbettencourt/SC454k/resolve/main/sc454k_clean.csv
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: be620d7a-5fbb-4911-99a2-d1a82a80009e)')' thrown while requesting GET https://huggingface.co/datasets/nbettencourt/SC454k/resolve/main/sc454k_clean.csv
Retrying in 2s [Retry 2/5].


In [8]:
substrings = ['AAPL', 'COST', 'AMZN', 'DIS', 'GOOGL', 'JPM', 'LLY', 'MSFT', 'META', 'NFLX', 'NVDA', 'TSLA']
pattern = '|'.join(substrings)

stock = stocks[stocks['Article'].str.contains(pattern, case=False, na=False)]
stock


Unnamed: 0.1,Unnamed: 0,Symbol,Security,URL,Date,RelatedStocksList,Article,Title,articleType,Publication,Author
1,1,AAN,"The Aaron's Company, Inc.",https://www.nasdaq.com/press-release/the-aaron...,"Apr 01, 2022 10:13 AM ET",,"ATLANTA, April 1, 2022 PRNewswire/ -- The Aar...",THE AARON'S COMPANY COMPLETES ACQUISITION OF B...,Press Release,Symbol Press Release,Symbol
2,2,AAN,"The Aaron's Company, Inc.",https://www.nasdaq.com/articles/why-is-aarons-...,"Apr 02, 2023 02:24 AM ET",Technology,It has been about a month since the last earni...,Why Is Aaron's (AAN) Down 26.3% Since Last Ear...,News,Zacks,Zacks Equity Research
3,3,AAN,"The Aaron's Company, Inc.",https://www.nasdaq.com/articles/3-retail-stock...,"Apr 04, 2019 06:08 AM ET",TGT|Markets|BBY|WMT|AMZN,"The S&P 500 has climbed nearly 15% this year, ...",3 Retail Stocks to Buy Right Now for Value & I...,News,Zacks,
4,4,AAN,"The Aaron's Company, Inc.",https://www.nasdaq.com/articles/aarons-aan-bra...,"Apr 04, 2022 03:50 PM ET",Stocks|COLM|GIL|DLA,"**The Aaron's Company, Inc.** [AAN](https://ww...",Aaron's (AAN) BrandsMart Buyout to Boost Sales...,News,Zacks,Zacks Equity Research
5,5,AAN,"The Aaron's Company, Inc.",https://www.nasdaq.com/articles/aarons-company...,"Apr 04, 2024 06:54 PM ET",Stocks,"Aaron's Company, Inc. (AAN) closed the latest ...","Aaron's Company, Inc. (AAN) Stock Moves -0.27%...",News,Zacks,Zacks Equity Research
...,...,...,...,...,...,...,...,...,...,...,...
453926,453926,ZYME,Zymeworks Inc.,https://www.nasdaq.com/articles/shareholders-h...,"Sep 17, 2021 11:33 AM ET",Public Companies,"The worst result, after buying shares in a com...",Shareholders have faith in loss-making Zymewor...,News,Simply Wall St,Simply Wall St
453927,453927,ZYME,Zymeworks Inc.,https://www.nasdaq.com/articles/best-biotech-s...,"Sep 18, 2021 06:33 AM ET",PFE|Stocks|MRNA|ABCL|JNJ,**4 Top Biotech Stocks To Watch This Upcoming ...,Best Biotech Stocks To Buy Now? 4 For Your Wat...,News,StockMarket.com,Brett David
453928,453928,ZYME,Zymeworks Inc.,https://www.nasdaq.com/press-release/independe...,"Sep 26, 2022 04:17 PM ET",,- Institutional Shareholder Services joins Gla...,Independent Proxy Advisory Firm ISS Recommends...,Press Release,Symbol Press Release,Symbol
453929,453929,ZYME,Zymeworks Inc.,https://www.nasdaq.com/articles/cancer-results...,"Sep 27, 2019 01:54 PM ET",Markets|AMGN|RHHBY,"Several times a year, leading cancer researche...",Cancer Results to Watch at This Weekend’s Euro...,News,The Motley Fool,David Haen


In [9]:
# Get the list of stock data to convert
stock['Datetime'] = pd.to_datetime(stock['Date'])

    # Rename column that holds the tweets content to aligh with previous model
stock.rename(columns = {'Article':'Text'}, inplace = True)

    # Pre process the tweet content
stock = processer.Preprocess_Tweets(stock)

    # Remove excess columns
stock = stock[['Text', 'Symbol', 'Datetime']]
    
stock = stock.fillna(0)

stock_inputs, stock_masks = prep(stock['Text'].values)

batch_size = 16
stock_data = TensorDataset(stock_inputs, stock_masks)
stock_sampler = RandomSampler(stock_data)
stock_dataloader = DataLoader(stock_data, sampler=stock_sampler, batch_size=batch_size)

model.eval()

predictions = []
    # Hidden Feature: Use the ratio of Logits to classify if it the pos/neg is strong or weak, discarded due to no significant improvement
    #log1 = []
    #log2 = []
for batch in stock_dataloader:
        # Get encoded inputs and masks 
    batch_inputs, batch_masks = batch

        # Send variables to device (GPU if available)
    batch_inputs = batch_inputs.to(device)
    batch_masks = batch_masks.to(device)

        # Predict classes with Bert for given inputs 
    with torch.no_grad():
        logits = model(batch_inputs, batch_masks)

        # Convert predictions to 0s and 1s
    preds = torch.argmax(logits, dim=1).flatten()
    predictions.append(preds)
        #log1.append(logits[:,0])
        #log2.append(logits[:,1])
    # Combine all batch predictions
predictions = torch.cat(predictions).cpu().numpy()
    #log1 = torch.cat(log1).cpu().numpy()
    #log2 = torch.cat(log2).cpu().numpy()

    # Add predictions to stock dataframe
stock['Sentiment'] = predictions
    #stock['log1'] = log1
    #stock['log2'] = log2
    
    # save predictions as new csv    
    # Show each stock completed 

  stock['Datetime'] = pd.to_datetime(stock['Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock['Datetime'] = pd.to_datetime(stock['Date'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock.rename(columns = {'Article':'Text'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Text'] = data['Text'].str.lower()
A value is trying to be set on a copy of a slice from a DataF

NameError: name 'files' is not defined

In [12]:
stock
stock.to_csv('stock_data_sentiment.csv', index=False)


Merge with stock data