## 1. Imports

In [1]:
import wandb
import copy
import torch
import json
import tqdm
import yaml
import pandas as pd
import numpy as np
import seaborn as sns
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sqlalchemy import  create_engine
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pytorch_datasets import SentimentAnalysisDataset, DatasetType
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
# quickly check model state dict before and after loading from file
# check size of output and shape
# save results to csv after prediction

## 2. Database configuration & Model config

In [2]:
# Reading form config.yaml"
with open("../../config.yaml", "r") as yamlconfig:
    config = yaml.load(yamlconfig, Loader=yaml.FullLoader)

# Create postgres string with db-config
postgres_username = config["db_config"]["postgres_username"]
postgres_password = config["db_config"]["postgres_password"]
postgres_address = config["db_config"]["postgres_address"]
postgres_port = config["db_config"]["postgres_port"]
postgres_dbname = config["db_config"]["postgres_dbname"]

postgres_str = f"postgresql://{postgres_username}:{postgres_password}@{postgres_address}:{postgres_port}/{postgres_dbname}"

# create db connection with sqlalchemy
cnx = create_engine(postgres_str)

In [3]:
model_name_huggingface = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name_huggingface)
huggingface_model = AutoModelForSequenceClassification.from_pretrained(model_name_huggingface, return_dict=True)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 3. Setup model

In [4]:
class SentimentAnalysisModel(torch.nn.Module):
    def __init__(self, pretrained_model):
        super(SentimentAnalysisModel, self).__init__()
        self.pretrained_model = pretrained_model

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.pretrained_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        return output

model = SentimentAnalysisModel(huggingface_model)

In [5]:
# load best model
best_model_state = torch.load("../../models/sentiment_analysis/cardiffnlp_twitter-roberta-base-sentiment-latest.pth")

In [6]:
model.load_state_dict(best_model_state["model_state_dict"])

<All keys matched successfully>

In [None]:
# Set model in evaluation mode before prediction
model.eval()

In [32]:
tokens = tokenizer.encode_plus(
    "This stock goes to the moon!",
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

In [33]:
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']
token_type_ids = tokens["token_type_ids"]

In [34]:
outputs = model(input_ids, attention_mask, token_type_ids)

In [35]:
probabilities = F.softmax(outputs.logits, dim=1)

In [36]:
probabilities

tensor([[8.2851e-04, 2.6224e-04, 9.9891e-01]], grad_fn=<SoftmaxBackward0>)

In [39]:
model.training

False

## 4. Loading data from DB

In [8]:
df = pd.read_sql("""
SELECT created_at, post, stock_symbol
FROM r_wallstreetbets_stock_symbols
WHERE 'GME' = ANY(stock_symbol)
ORDER BY 2
""", cnx)

In [None]:
def predict_sentiment(post):

    # Set model in evaluation mode before prediction & disable gradient calculation
    model.eval()

    with torch.no_grad():
        pass

In [9]:
df

Unnamed: 0,created_at,post,stock_symbol
0,2022-04-19 23:08:36,'$AMC and $GME to the moon! Ryan Cohen can now...,[GME]
1,2022-03-18 03:43:36,'Do incredibly stupid things with your money l...,[GME]
2,2021-09-07 05:54:00,'Everyone felt like it was spam'. Really. All ...,[GME]
3,2021-12-31 02:54:19,"'In seeking to answer this question, staff obs...",[GME]
4,2022-01-22 13:54:42,'Injected with Hypeium' - GME Trailer,[GME]
...,...,...,...
144270,2021-05-29 22:12:14,🦧🦧 I am heaviest right now in AMC but I will b...,"[AMC, GME]"
144271,2021-08-25 09:06:33,🧂🧂🧂 only way to lose on GME was to chase crazy...,[GME]
144272,2022-01-07 14:15:34,🧙✨ A Pox upon your Stocks 🧙✨\n\n⸸⛧⛤Summoning G...,"[GME, A]"
144273,2021-07-01 03:34:19,🧱 GME 💎🙌!!!!!!!!,[GME]


In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
model = torch.load("../../models/sentiment_analysis/cardiffnlp_twitter-xlm-roberta-base-sentiment.pth")

AttributeError: 'dict' object has no attribute 'state_dict'