## 1. Imports

In [33]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import yaml
from sqlalchemy import create_engine
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Pipelines reference: https://huggingface.co/docs/transformers/main_classes/pipelines
# No checkpoint is named here, so the library picks its default model for the
# task (see the "No model was supplied" notice in the cell output).
classifier = pipeline(task="sentiment-analysis")
res = classifier("I love you")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [3]:
res

[{'label': 'POSITIVE', 'score': 0.9998656511306763}]

Alternative version with an explicit model declaration

In [8]:
# Name the checkpoint explicitly and load its tokenizer and model by hand,
# then hand both to the pipeline instead of relying on the task default.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline(task="sentiment-analysis", tokenizer=tokenizer, model=model)

In [13]:
# One sentence, tokenized three ways for comparison.
sample_text = "I love you my dear!"

# Manual route: split into tokens, then map each token to its vocabulary id.
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Calling the tokenizer directly returns the full encoding. Despite the
# variable name it holds both `input_ids` and `attention_mask`, and the ids in
# the recorded output carry two extra entries (101 first, 102 last) compared
# with `token_ids`.
input_ids = tokenizer(sample_text)

In [15]:
tokens

['i', 'love', 'you', 'my', 'dear', '!']

In [17]:
token_ids

[1045, 2293, 2017, 2026, 6203, 999]

In [19]:
input_ids

{'input_ids': [101, 1045, 2293, 2017, 2026, 6203, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
# Encode a one-element batch as PyTorch tensors, padding to the longest item
# and truncating anything beyond 512 tokens.
batch = tokenizer(
    ["hello world"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [23]:
batch

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [27]:
# Manual forward pass, printing each intermediate step:
# raw model output -> class probabilities -> winning class id -> label name.
with torch.no_grad():  # inference only, so gradient tracking is switched off
    outputs = model(**batch)
    print(outputs)

    # Normalise the logits across the class dimension.
    predictions = outputs.logits.softmax(dim=1)
    print(predictions)

    # Index of the highest-probability class for every item in the batch.
    label_ids = predictions.argmax(dim=1)
    print(label_ids)

    # Translate class indices into the label names stored in the model config.
    labels = [model.config.id2label[idx] for idx in label_ids.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-3.9943,  4.3083]]), hidden_states=None, attentions=None)
tensor([[2.4782e-04, 9.9975e-01]])
tensor([1])
['POSITIVE']


In [28]:
# Save the tokenizer and the model side by side in one local directory so that
# both can be reloaded from disk with `from_pretrained(save_dir)`.
save_dir = "testsave"

# The model is saved last on purpose: the cell then ends on a call with no
# displayed result, keeping the cell output empty.
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)


In [29]:
# Reload both artefacts from the local directory written above; this rebinds
# `model` and `tokenizer` to the copies read back from disk.
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

In [32]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

## Finetuning

In [32]:
# https://huggingface.co/transformers/v3.2.0/custom_datasets.html

# 1. Prepare the dataset

# 2. Load a pretrained tokenizer and call it on the dataset -> encodings

# 3. Build a PyTorch Dataset from the encodings

# 4. Load the pretrained model

# 5. a) Use the Trainer class to train it -> abstracts away most of the training loop
# 5. b) Or write a native PyTorch training loop



## Testload of reddit data from db

In [34]:
# Read the database settings from config.yaml.
# safe_load only constructs plain Python objects (dicts, lists, scalars), which
# is all a settings file needs; yaml.load with FullLoader can build richer
# objects from YAML tags and should not be used on files that could be edited
# by someone else.
with open("../../config.yaml", "r") as yamlconfig:
    config = yaml.safe_load(yamlconfig)

# Pull the individual connection settings out of the "db_config" section.
db_config = config["db_config"]
postgres_username = db_config["postgres_username"]
postgres_password = db_config["postgres_password"]
postgres_address = db_config["postgres_address"]
postgres_port = db_config["postgres_port"]
postgres_dbname = db_config["postgres_dbname"]

# Build the connection URL. The credentials are percent-encoded because
# characters such as "@", ":" or "/" in a username or password would otherwise
# be read as URL delimiters. str() keeps purely numeric YAML values working,
# and safe="" makes quote() escape "/" as well.
postgres_str = (
    f"postgresql://{quote(str(postgres_username), safe='')}"
    f":{quote(str(postgres_password), safe='')}"
    f"@{postgres_address}:{postgres_port}/{postgres_dbname}"
)

# SQLAlchemy engine for the Postgres database; later cells pass it to pandas.
# NOTE: postgres_str contains the password - do not print or display it.
cnx = create_engine(postgres_str)

In [41]:
# Pull a 100-row sample of the stock-symbol table into a DataFrame.
df = pd.read_sql(
    sql='SELECT * FROM r_wallstreetbets_stock_symbols LIMIT 100',
    con=cnx,
)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            100 non-null    int64         
 1   post          100 non-null    object        
 2   author        100 non-null    object        
 3   created_at    100 non-null    datetime64[ns]
 4   num_up_votes  100 non-null    int64         
 5   type          100 non-null    object        
 6   stock_symbol  100 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 5.6+ KB


In [43]:
df

Unnamed: 0,id,post,author,created_at,num_up_votes,type,stock_symbol
0,6,And here I am preaching the GME gospel at Than...,dulkion,2021-05-01 00:47:46,2,comment,[GME]
1,8,"Oh man, I'd be selling TSLA so fast. Everyone...",Forevergogo,2021-05-01 00:47:43,2,comment,[TSLA]
2,13,"?? BTX was going up all week, mooned Wednesday...",Eattherich1987,2021-05-01 00:47:36,1,comment,[MVIS]
3,20,Every email I get is titled “Last chance!” or ...,JadedJared,2021-05-01 00:47:23,1,comment,[SA]
4,23,Medical is part of STEM right,SnooCauliflowers4003,2021-05-01 00:47:17,1,comment,[STEM]
...,...,...,...,...,...,...,...
95,919,"""Most things mentioned on this sub as a squee...",Humblegiant2552,2021-05-01 03:59:25,1,comment,[GME]
96,922,Thank you. “Selling covered calls makes a lot ...,praetor-maximus,2021-05-01 03:59:22,1,comment,[AMD]
97,923,i said that about BIDU and $262 (about it grav...,Brawldud,2021-05-01 03:59:22,1,comment,[BIDU]
98,929,Been hearing PTON ads on all the podcasts I li...,russcatalano,2021-05-01 03:59:02,-5,comment,[PTON]
