## 1. Imports

In [1]:
import copy
import json
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import yaml
from sqlalchemy import create_engine
from torch.utils.data import DataLoader, Dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

from pytorch_datasets import SentimentAnalysisDataset, DatasetType

In [2]:
# Checkpoint identifier: passed to `from_pretrained` when loading the model and
# tokenizer, and used as the lookup key into the hyperparameter JSON.
model_name = "ProsusAI/finbert"

## 2. Database configuration & Model config

In [3]:
# Read the project configuration from config.yaml.
# yaml.safe_load is used instead of yaml.load(..., Loader=yaml.FullLoader):
# only plain keys/values are read below, so there is no need to allow
# arbitrary YAML tags.
with open("../../config.yaml", "r", encoding="utf-8") as yamlconfig:
    config = yaml.safe_load(yamlconfig)

# Collect the Postgres connection settings from the "db_config" section
db_config = config["db_config"]
postgres_username = db_config["postgres_username"]
postgres_password = db_config["postgres_password"]
postgres_address = db_config["postgres_address"]
postgres_port = db_config["postgres_port"]
postgres_dbname = db_config["postgres_dbname"]

# URL-escape the credentials so characters such as "@", ":" or "/" inside the
# username/password cannot corrupt the connection URL.
postgres_str = (
    f"postgresql://{quote_plus(str(postgres_username))}:{quote_plus(str(postgres_password))}"
    f"@{postgres_address}:{postgres_port}/{postgres_dbname}"
)

# Create the SQLAlchemy engine for the database connection
cnx = create_engine(postgres_str)

In [4]:
# Load the JSON file holding the hyperparameters of each model.
# The encoding is given explicitly so the file is not decoded with the
# platform's default encoding.
with open('hyperparams.json', encoding='utf-8') as file:
    hyper_params = json.load(file)

In [5]:
# Set up hyperparameters for model training, looked up by checkpoint name
model_hyper_params = hyper_params[model_name]

LR: float = model_hyper_params["lr"]
# Fixed: this previously read the "lr" key, so OPTIMIZER silently held the
# learning rate. Assumes hyperparams.json stores the optimizer name under
# "optimizer" (TODO confirm); a missing key now fails loudly with a KeyError.
OPTIMIZER: str = model_hyper_params["optimizer"]
EPOCHS: int = model_hyper_params["epochs"]
BATCH_SIZE: int = model_hyper_params["batch_size"]
DROPOUT: float = model_hyper_params["dropout"]

## 3. Dataframe preparation

In [6]:
# Columns removed after the NaN filter; only "post" and the label column remain.
unused_columns = ["ben", "dliden", "doss", "jmo", "jmo2", "id"]

# Build the frame in one chain: read the sheet, drop every row that has a
# missing value in ANY column (this runs before the columns above are removed),
# discard the unused columns and rename the annotation column to "label".
df = (
    pd.read_excel("../../data/external/annotations_wsb.xlsx", sheet_name="annotations_scale")
    .dropna()
    .drop(columns=unused_columns)
    .rename(columns={"Own annotation": "label"})
)

# Note: an earlier, disabled one-hot step named the label values
# 0.0 / 1.0 / 2.0 as "Negative" / "Neutral" / "Positive".

In [7]:
# Display the prepared frame (long frames are truncated in the rendered output)
df

Unnamed: 0,post,label
0,"\n \nGentlemen, start your boners:\n\nBlackBe...",2.0
1,#It has come to my attention that $PLTR Nation...,2.0
2,#SNDL and TLRY for smooking weed on the moon 🚬...,2.0
3,$63k in on WKHS calls,2.0
4,$CLF and $MT,1.0
...,...,...
178,Woke up on the floor got up and I looked perfe...,2.0
179,"Yeah, I was going to sell my UWMC for a L, but...",0.0
180,"Yep, it’s cheap right now and we could see it ...",2.0
181,You didn’t know what a hedge fund was before G...,1.0


In [8]:
# Relative frequency of each label value
df["label"].value_counts(normalize=True)

2.0    0.519126
1.0    0.295082
0.0    0.185792
Name: label, dtype: float64

## 4. Model Loading

In [9]:
# Load the tokenizer and the pretrained sequence classifier from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    return_dict=True,
)

In [10]:
# Show the module tree of the loaded classifier
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## 5. Building PyTorch Dataset

In [20]:
# Generic sentiment-analysis dataset over the full frame; no fold selected yet
sentiment_analysis_dataset = SentimentAnalysisDataset(df=df, tokenizer=tokenizer)

In [21]:
# Derive the train and test datasets: each one is an independent deep copy of
# the generic dataset with its fold set (train first, then test).
train_dataset, test_dataset = (
    copy.deepcopy(sentiment_analysis_dataset).set_fold(fold)
    for fold in (DatasetType.TRAIN, DatasetType.TEST)
)

In [22]:
# Training loader: mini-batches of BATCH_SIZE, reshuffled on every pass,
# keeping the final partial batch.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
    drop_last=False,  # maybe change in future
)

# Test loader: the whole test fold as a single batch, in dataset order.
test_data_loader = DataLoader(
    test_dataset,
    batch_size=len(test_dataset),
    shuffle=False,
    num_workers=1,
)

In [23]:
# Number of samples in the training fold
len(train_dataset)

146

In [24]:
# Number of samples in the test fold
len(test_dataset)

37

In [27]:
# Check if train data and test data have correct batch and tensor sizes.
# The training-data check is currently disabled:
# print('TRAINING DATA:')
# for batch in train_data_loader:
#     print(batch)
#     break

print(' ')
print('TESTING DATA:')
# Print the label tensor size of every test batch
for batch in test_data_loader:
    print(batch["labels"].shape)

 
TESTING DATA:
torch.Size([37, 3])


## 6. Creating custom Model

In [18]:
# Select the compute device: CUDA when available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [19]:
# Custom model wrapper around the pretrained classifier
class SentimentAnalysisModel(torch.nn.Module):
    """Thin wrapper that delegates the forward pass to a pretrained model.

    No extra layers are added at the moment; a custom dropout + linear head
    was sketched earlier but is disabled, so the wrapped model's output is
    returned unchanged.
    """

    def __init__(self, pretrained_model):
        """Keep a reference to ``pretrained_model`` on the wrapper."""
        super().__init__()
        self.pretrained_model = pretrained_model

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Run the wrapped model and return its output as-is.

        ``attn_mask`` is forwarded under the keyword ``attention_mask`` and
        ``token_type_ids`` under its own name; ``input_ids`` is passed
        positionally.
        """
        return self.pretrained_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids,
        )

# Wrap the pretrained classifier and move it to the selected device.
# If `model` is already a wrapper (this cell was re-run), unwrap it first via
# its `pretrained_model` attribute so wrappers are never nested; on the first
# run the attribute does not exist and `model` itself is wrapped.
model = SentimentAnalysisModel(getattr(model, "pretrained_model", model))
model.to(device)

In [20]:
# Pull a single batch from the training loader and inspect its attention mask
testbatch = next(iter(train_data_loader))
testbatch["attention_mask"]

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [21]:
# Smoke-test a forward pass on one training batch.
# The model was moved to `device`, so the batch tensors are moved there too
# (otherwise a CUDA model receives CPU tensors). The module is called directly
# rather than through `.forward`, which is the standard way to run an nn.Module.
outputs = model(
    testbatch["input_ids"].to(device),
    testbatch["attention_mask"].to(device),
    testbatch["token_type_ids"].to(device),
)

In [27]:
# Inspect the raw logits returned by the forward pass (one row per sample)
outputs.logits

tensor([[-1.2355, -0.5570,  2.5822],
        [-0.6530, -1.0630,  2.1269],
        [-0.6685, -1.1261,  2.3206],
        [-1.1098, -0.5815,  2.4675],
        [-1.0767, -0.5182,  2.4575],
        [ 0.2955, -2.1854,  1.8364],
        [-0.9207, -0.9688,  2.5094],
        [-0.8299, -1.1092,  2.5065],
        [-1.4396, -0.1551,  2.4820],
        [-0.6150, -1.3285,  2.4136],
        [-0.6874, -1.5070,  2.4453],
        [ 0.6629, -2.3456,  1.6007],
        [-0.9725, -0.9465,  2.4561],
        [-0.9639, -1.1104,  2.5736],
        [-0.3281, -1.6247,  2.2820],
        [-1.0668, -0.8023,  2.5481],
        [-1.1487, -0.4460,  2.4123],
        [-1.0023, -0.6711,  2.4849],
        [-1.3020, -0.3616,  2.5146],
        [-0.7385, -1.1610,  2.4565],
        [-0.0478, -2.0138,  2.2783],
        [-0.8641, -1.2772,  2.6673],
        [-0.8386, -0.9250,  2.4675],
        [-0.4728, -1.6496,  2.4456],
        [-0.9547, -0.7532,  2.3358],
        [-1.2025, -0.6110,  2.5365],
        [-0.8917, -0.8225,  2.3144],
 

In [51]:
# Reload a fresh tokenizer and the plain (unwrapped) classifier for the
# single-sentence example below. AutoTokenizer and
# AutoModelForSequenceClassification are already imported in the imports cell,
# and the checkpoint name comes from `model_name` instead of a repeated literal.
# NOTE: this rebinds `model`, replacing the SentimentAnalysisModel wrapper
# created in the custom-model section.
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [53]:
# Tokenize one sentence with the tokenizer's default settings and show the result
tokenizer_output = tokenizer("Stocks rallied and the British pound gained.")
tokenizer_output

{'input_ids': [101, 15768, 24356, 1998, 1996, 2329, 9044, 4227, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [55]:
# Tokenize the same sentence padded/truncated to 512 tokens and returned as
# PyTorch tensors. The tokenizer is called directly because `encode_plus` is
# deprecated in favour of `__call__`; the arguments are unchanged.
tokenizer_output = tokenizer(
    "Stocks rallied and the British pound gained.",
    add_special_tokens=True,
    max_length=512,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

In [61]:
# Flatten the token-type tensor to 1-D for display
tokenizer_output["token_type_ids"].flatten()

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [71]:
# Inference only: run the forward pass without gradient tracking
with torch.no_grad():
    model_result = model(
        tokenizer_output["input_ids"],
        attention_mask=tokenizer_output["attention_mask"],
        token_type_ids=tokenizer_output["token_type_ids"],
    )
    output = model_result.logits

print(output)

tensor([[ 1.6898, -1.5706, -0.9036]])


In [69]:
from torch.nn.functional import softmax

In [72]:
# Convert the logits to probabilities along the class dimension
F.softmax(output, dim=1)

tensor([[0.8984, 0.0345, 0.0672]])

## 7. Model training

In [29]:
# Minimal BCEWithLogitsLoss example on random data.
loss = torch.nn.BCEWithLogitsLoss()
# Named `logits` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
logits = torch.randn(3, requires_grad=True)
print(logits)
# Random 0/1 targets with the same shape as the logits
target = torch.empty(3).random_(2)
print(target)
output = loss(logits, target)
print(output)
# Backpropagate; this populates logits.grad
output.backward()

tensor([ 0.2881, -1.4403, -0.7470], requires_grad=True)
tensor([1., 0., 1.])
tensor(0.6356, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


## WANDB test

In [26]:
import wandb

# Start a Weights & Biases run in project "test-project" under entity "hda_sis"
wandb.init(project="test-project", entity="hda_sis")

[34m[1mwandb[0m: Currently logged in as: [33mjan_burger[0m ([33mhda_sis[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [27]:
# Record the hyperparameters on the active run.
# `wandb.config = {...}` only rebinds the module attribute to a plain dict and
# leaves the run's config untouched; `update` writes the values into it.
wandb.config.update({
    "learning_rate": 0.001,
    "epochs": 100,
    "batch_size": 128
})

In [29]:
# Log a single dummy "loss" value to the active run
wandb.log({"loss": 2.5})