## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import yaml
import copy
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from enum import Enum

## 2. Database configuration

In [2]:
# Reading form config.yaml"
with open("../../config.yaml", "r") as yamlconfig:
    config = yaml.load(yamlconfig, Loader=yaml.FullLoader)

# Create postgres string with db-config
postgres_username = config["db_config"]["postgres_username"]
postgres_password = config["db_config"]["postgres_password"]
postgres_address = config["db_config"]["postgres_address"]
postgres_port = config["db_config"]["postgres_port"]
postgres_dbname = config["db_config"]["postgres_dbname"]

postgres_str = f"postgresql://{postgres_username}:{postgres_password}@{postgres_address}:{postgres_port}/{postgres_dbname}"

# create db connection with sqlalchemy
cnx = create_engine(postgres_str)

In [3]:
df = pd.read_sql('SELECT * FROM r_wallstreetbets_stock_symbols LIMIT 100', cnx)

In [4]:
df['label'] = np.random.choice([1,2,3], df.shape[0])

In [5]:
df = df[["post", "label"]]

In [6]:
df["label"].value_counts(normalize=True)*100

3    35.0
1    33.0
2    32.0
Name: label, dtype: float64

In [7]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
#model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## 3. Building Pytorch Dataset

In [8]:
# Make simple Enum for code clarity
class DatasetType(Enum):
    TRAIN = 1
    TEST = 2
    # VAL = 3

In [9]:
class SentimentAnalysisDataset(Dataset):

    def __init__(self,
                 df: pd.DataFrame,
                 tokenizer: AutoTokenizer,
                 max_token_len: int = 512,
                 stratify_column_name: str = "label",
                 frac_train: float = 0.8,
                 frac_test: float = 0.2,
                 random_state = 42):

        self.df = df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.stratify_column_name = stratify_column_name
        self.frac_train = frac_train
        self.frac_test = frac_test
        self.random_state = random_state

        # Initialize dataset and labels as None
        self.dataset = None
        self.labels = None

        # Stratified train_test_split
        self.X_train, self.X_test, self.y_train, self.y_test = self.stratified_train_test_split()

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Set post and label
        post: str = str(self.dataset.iloc[idx])
        label = self.labels.iloc[idx]

        # Tokenize post
        tokens = self.tokenizer.encode_plus(
            post,
            add_special_tokens=True,
            max_length=self.max_token_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': tokens['input_ids'].flatten(),
            'attention_mask': tokens['attention_mask'].flatten(),
            'token_type_ids': tokens["token_type_ids"].flatten(),
            'labels': torch.FloatTensor(label)
        }

    def stratified_train_test_split(self):
        X = self.df # Contains all columns.
        y = self.df[[self.stratify_column_name]] # Dataframe of just the column on which to stratify.

        # Split original dataframe into train and temp dataframes.
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=(1.0 - self.frac_train),random_state=self.random_state)

        return X_train, X_test, y_train, y_test

    def set_fold(self, type: str):
        # It's important to call this method before using the dataset
        if type == DatasetType.TRAIN:
            self.dataset, self.labels = self.X_train, self.y_train
        if type == DatasetType.TEST:
            self.dataset, self.labels = self.X_test, self.y_test
        return self


In [10]:
sentiment_analysis_dataset = SentimentAnalysisDataset(
    df = df,
    tokenizer = tokenizer
)

In [11]:
%%time
train = copy.deepcopy(sentiment_analysis_dataset).set_fold(DatasetType.TRAIN)
test = copy.deepcopy(sentiment_analysis_dataset).set_fold(DatasetType.TEST)

CPU times: total: 31.2 ms
Wall time: 27.1 ms


In [None]:
for i in train:
    print(tokenizer.encode_plus(i.__getitem__(0)[0], add_special_tokens=True, return_tensors="pt", padding=512))
    print("-"*35)
    print(i.__getitem__(0)[1])
    break

In [14]:
for i in train:
    print(i.__getitem__(0))
    break

KeyError: 0

In [16]:
train.__getitem__(1)

{'input_ids': tensor([  101,  2695,  5292,  2748,  3830,  1016,  2171,  1024,  2403,  1010,
         26718, 18863,  1024,  4874,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   