## 1. Imports

In [1]:
import copy
import json
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
import wandb
import yaml
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sqlalchemy import  create_engine
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from pytorch_datasets import SentimentAnalysisDataset, DatasetType
# check size of output and shape
# save results to csv after prediction

## 2. Database configuration & Model config

In [2]:
# Read settings from config.yaml
# NOTE(review): yaml.safe_load would be sufficient if the file only holds plain
# scalars/mappings; FullLoader is kept here to leave loading behaviour unchanged.
with open("../../config.yaml", "r") as yamlconfig:
    config = yaml.load(yamlconfig, Loader=yaml.FullLoader)

# Collect the Postgres connection settings from the db_config section
postgres_username = config["db_config"]["postgres_username"]
postgres_password = config["db_config"]["postgres_password"]
postgres_address = config["db_config"]["postgres_address"]
postgres_port = config["db_config"]["postgres_port"]
postgres_dbname = config["db_config"]["postgres_dbname"]

# Build the connection URL. The credentials are percent-encoded so that reserved
# URL characters in them (e.g. "@", ":", "/", "#") cannot be mistaken for URL
# delimiters. The config file is expected to hold the raw (unencoded) values.
postgres_str = (
    f"postgresql://{quote_plus(str(postgres_username))}:{quote_plus(str(postgres_password))}"
    f"@{postgres_address}:{postgres_port}/{postgres_dbname}"
)

# create db connection with sqlalchemy
cnx = create_engine(postgres_str)

In [3]:
# Checkpoint identifier; the tokenizer and the classification model are both
# loaded from this same name so that they match each other.
model_name_huggingface = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name_huggingface)
# return_dict=True requests a structured output object, so results can be read
# by attribute (the prediction code further down reads `.logits`).
huggingface_model = AutoModelForSequenceClassification.from_pretrained(model_name_huggingface, return_dict=True)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 3. Setup model

In [4]:
class SentimentAnalysisModel(torch.nn.Module):
    """Thin ``torch.nn.Module`` wrapper that delegates to a pretrained classifier.

    The wrapped network is registered as the ``pretrained_model`` submodule, so
    state-dict keys of this wrapper are prefixed with ``pretrained_model.``.
    """

    def __init__(self, pretrained_model):
        """Store ``pretrained_model`` as the submodule that performs the forward pass."""
        super().__init__()
        self.pretrained_model = pretrained_model

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Run the wrapped model and return its output unchanged.

        ``input_ids`` is passed positionally; ``attn_mask`` is forwarded as
        ``attention_mask`` and ``token_type_ids`` under its own name.
        """
        return self.pretrained_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )

# Wrap the Hugging Face classifier loaded earlier in the notebook.
model = SentimentAnalysisModel(huggingface_model)

In [5]:
# Load the best-model checkpoint from disk.
# map_location="cpu" deserializes every tensor onto the CPU, so the file can be
# read on a machine without CUDA even if it was saved from a GPU, and no extra
# copy of the weights is allocated on the GPU. The model itself is moved to the
# selected device in a later cell.
best_model_state = torch.load(
    "../../models/sentiment_analysis/cardiffnlp_twitter-roberta-base-sentiment-latest.pth",
    map_location="cpu",
)

In [6]:
# Copy the weights stored under the checkpoint's "model_state_dict" key into the wrapper model.
model.load_state_dict(best_model_state["model_state_dict"])

<All keys matched successfully>

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
# Move the model to the selected device; as the last expression of the cell,
# the returned module is also displayed.
model.to(device)

SentimentAnalysisModel(
  (pretrained_model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
             

## 4. Loading data from DB & predicting sentiment

In [9]:
# Load the rows of r_wallstreetbets_stock_symbols whose stock_symbol array contains 'AMC'.
# `ORDER BY 2` is positional: it sorts by the second selected column, i.e. `post`.
df = pd.read_sql("""
SELECT created_at, post, stock_symbol
FROM r_wallstreetbets_stock_symbols
WHERE 'AMC' = ANY(stock_symbol)
ORDER BY 2
""", cnx)

In [10]:
# Function to predict sentiment scores for a single post
def predict_sentiment(df):
    """Return the model's class probabilities for one post.

    Parameters
    ----------
    df : mapping-like
        Despite the name, this is a single row (anything indexable with
        ``"post"``), as passed by ``DataFrame.apply(..., axis=1)``. Its
        ``"post"`` entry is the text to score.

    Returns
    -------
    pd.Series
        Softmax probabilities labelled ``Negative``, ``Neutral``, ``Positive``
        (class indices 0, 1, 2 of the model output).
        NOTE(review): assumes the checkpoint's label order is
        negative/neutral/positive — confirm against the model's id2label.
    """
    # Tokenize to a fixed length of 256: shorter posts are padded, longer ones truncated.
    tokens = tokenizer.encode_plus(
        df["post"],
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)
    token_type_ids = tokens["token_type_ids"].to(device)

    # Set model in evaluation mode before prediction
    model.eval()

    # Disable gradient calculations
    with torch.no_grad():
        outputs = model(input_ids, attention_mask, token_type_ids)
        probabilities = F.softmax(outputs.logits, dim=1)

    # A single post was encoded, so row 0 holds its probabilities. Copy them to
    # the host once instead of once per class.
    scores = probabilities[0].cpu().numpy()

    return pd.Series([scores[0], scores[1], scores[2]], index=["Negative", "Neutral", "Positive"])

In [11]:
# Register tqdm with pandas so that `progress_apply` (used in the next cell) is available.
tqdm.pandas()

In [12]:
# Score every row (one post per row) and store the three probabilities as new columns.
df[['Negative','Neutral','Positive']] = df.progress_apply(predict_sentiment, axis=1)

100%|██████████| 125876/125876 [21:56<00:00, 95.58it/s]


In [13]:
# Write the posts together with their sentiment columns to CSV; index=False leaves the row index out of the file.
df.to_csv("../../data/sentiment_analysis_predictions/AMC.csv", index = False)

In [14]:
# Read the saved predictions back from disk into a separate frame.
df_test = pd.read_csv("../../data/sentiment_analysis_predictions/AMC.csv")

In [15]:
# Display the frame that was reloaded from the CSV.
df_test

Unnamed: 0,created_at,post,stock_symbol,Negative,Neutral,Positive
0,2021-06-01 13:43:45,"''Today, we announced ***selling*** 8.5 millio...",['AMC'],0.076889,0.921566,0.001545
1,2021-06-02 21:20:19,''What's the exit strategy for AMC &amp; BB ?'...,['AMC'],0.001541,0.996695,0.001764
2,2021-06-02 21:04:52,''What's the exit strategy for AMC ?'',['AMC'],0.004683,0.994740,0.000578
3,2021-05-27 19:03:00,''What's the exit strategy for AMC ?'' &amp;#x...,['AMC'],0.001751,0.996623,0.001626
4,2021-06-02 21:07:46,''What's the exit strategy for AMC ?'' I have ...,['AMC'],0.002554,0.996377,0.001069
...,...,...,...,...,...,...
125871,2021-07-22 02:21:30,🦥🦥🦥💎 AMC,['AMC'],0.001373,0.900091,0.098537
125872,2021-06-07 11:14:43,🦧 LoL -Some on Wall Street try options trade t...,['AMC'],0.067654,0.929924,0.002422
125873,2021-05-29 22:12:14,🦧🦧 I am heaviest right now in AMC but I will b...,"['AMC', 'GME']",0.121967,0.816447,0.061587
125874,2021-06-03 23:00:11,🦧🦧🦧🦧 AMC,['AMC'],0.001693,0.989049,0.009259


In [16]:
# NOTE(review): shows the same frame as the previous cell — likely a redundant cell.
df_test

Unnamed: 0,created_at,post,stock_symbol,Negative,Neutral,Positive
0,2021-06-01 13:43:45,"''Today, we announced ***selling*** 8.5 millio...",['AMC'],0.076889,0.921566,0.001545
1,2021-06-02 21:20:19,''What's the exit strategy for AMC &amp; BB ?'...,['AMC'],0.001541,0.996695,0.001764
2,2021-06-02 21:04:52,''What's the exit strategy for AMC ?'',['AMC'],0.004683,0.994740,0.000578
3,2021-05-27 19:03:00,''What's the exit strategy for AMC ?'' &amp;#x...,['AMC'],0.001751,0.996623,0.001626
4,2021-06-02 21:07:46,''What's the exit strategy for AMC ?'' I have ...,['AMC'],0.002554,0.996377,0.001069
...,...,...,...,...,...,...
125871,2021-07-22 02:21:30,🦥🦥🦥💎 AMC,['AMC'],0.001373,0.900091,0.098537
125872,2021-06-07 11:14:43,🦧 LoL -Some on Wall Street try options trade t...,['AMC'],0.067654,0.929924,0.002422
125873,2021-05-29 22:12:14,🦧🦧 I am heaviest right now in AMC but I will b...,"['AMC', 'GME']",0.121967,0.816447,0.061587
125874,2021-06-03 23:00:11,🦧🦧🦧🦧 AMC,['AMC'],0.001693,0.989049,0.009259
