In [10]:
import os

import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

In [8]:
# Tokenizer for the "cardiffnlp/twitter-roberta-base-sentiment" checkpoint.
# It lives at module level because preprocess_dataset and custom_collate_fn
# look up the global name `tokenizer`, so this cell must run before either is called.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

In [76]:
def preprocess_dataset(batch):
    """Tokenize a batch of examples and turn its numeric columns into float tensors.

    Intended for ``Dataset.map(..., batched=True)``: ``batch`` maps column names to
    per-example lists. Each entry of ``batch["text"]`` is handed to the module-level
    ``tokenizer`` in one call and padded/truncated to 256 tokens.

    Returns:
        dict with
          * ``input_ids`` / ``attention_mask``: the per-example encodings stacked
            along a new leading dimension (every example must therefore encode
            to the same shape);
          * ``prices`` / ``volumes``: ``batch["open"]`` / ``batch["volume"]`` as
            float tensors with a trailing singleton dimension;
          * ``labels``: ``batch["close"]`` as a float tensor.
    """
    # One tokenizer call per example; each call returns PyTorch tensors.
    encodings = [
        tokenizer(
            example_text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=256,
        )
        for example_text in batch["text"]
    ]
    stacked_ids = torch.stack([enc["input_ids"] for enc in encodings], dim=0)
    stacked_mask = torch.stack([enc["attention_mask"] for enc in encodings], dim=0)

    # unsqueeze(-1) appends a feature dimension of size 1 to the price/volume columns.
    open_prices = torch.tensor(batch["open"]).unsqueeze(-1).float()
    trade_volumes = torch.tensor(batch["volume"]).unsqueeze(-1).float()
    close_prices = torch.tensor(batch["close"]).float()

    return {
        "input_ids": stacked_ids,
        "attention_mask": stacked_mask,
        "prices": open_prices,
        "volumes": trade_volumes,
        "labels": close_prices,
    }

def custom_collate_fn(batch, pad_token_id=None):
    """Collate per-example dicts into a single dict of batched, padded tensors.

    Args:
        batch: Sequence of examples. Each example is a mapping with the keys
            ``input_ids``, ``attention_mask``, ``prices``, ``volumes`` and
            ``labels``; values may be (nested) lists, numbers or tensors.
        pad_token_id: Fill value for padded ``input_ids`` positions. When
            ``None`` (the default, and what ``DataLoader`` uses) the pad id of
            the module-level ``tokenizer`` is used.

    Returns:
        dict with the same five keys. Every field gets a new leading batch
        dimension. Examples whose first dimension is shorter than the longest
        one in the batch are padded along that dimension: ``input_ids`` with
        ``pad_token_id``, everything else with 0. Padded positions can be
        recognised by their all-zero ``attention_mask`` rows.

    Raises:
        RuntimeError: if the examples disagree in any dimension other than the
            first (propagated from ``pad_sequence`` / ``torch.stack``).
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    def column(key):
        # as_tensor accepts lists and tensors alike; torch.tensor() on an
        # existing tensor makes an extra copy and emits a UserWarning.
        return [torch.as_tensor(item[key]) for item in batch]

    def padded(key, fill_value):
        return torch.nn.utils.rnn.pad_sequence(
            column(key), batch_first=True, padding_value=fill_value
        )

    input_ids = padded("input_ids", pad_token_id)
    attention_mask = padded("attention_mask", 0)
    prices = padded("prices", 0)
    volumes = padded("volumes", 0)

    labels = column("labels")
    if all(label.dim() == 0 for label in labels):
        # Scalar labels have no dimension to pad along; stacking is the only option.
        labels = torch.stack(labels)
    else:
        # Pad like the other fields so a ragged batch does not fail on labels
        # after everything else was padded. Equal-length labels give the same
        # result as torch.stack.
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=0)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "prices": prices, "volumes": volumes, "labels": labels}



In [71]:
# Location of the aligned tweet/price JSON file. The default is the original
# author's local path; set the ALIGNED_DATA_PATH environment variable to run the
# notebook on another machine without editing this cell.
ALIGNED_DATA_PATH = os.environ.get(
    "ALIGNED_DATA_PATH",
    "/Users/pan/Documents/course/EECS545/Group/Stock-Market-Prediction/data/tweet_price/aligned_data.json",
)

# Load the whole file as a single "train" split and print its schema / row count.
hf_dataset = load_dataset("json", data_files={"train": ALIGNED_DATA_PATH}, split="train")
print(hf_dataset)


Found cached dataset json (/Users/pan/.cache/huggingface/datasets/json/default-b509911a125022b9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


Dataset({
    features: ['name', 'date', 'open', 'high', 'low', 'close', 'volume', 'adj_close', 'text', 'label'],
    num_rows: 19059
})


In [77]:
# Run preprocess_dataset over the dataset in batches. The result is bound back to
# `hf_dataset`, so after this cell the name refers to the dataset that also
# carries the input_ids / attention_mask / prices / volumes / labels columns.
hf_dataset = hf_dataset.map(preprocess_dataset, batched=True)
print(hf_dataset)

# Show the raw text of the first example as a sanity check.
first_example = hf_dataset[0]
print(first_example["text"])

                                                                   

Dataset({
    features: ['name', 'date', 'open', 'high', 'low', 'close', 'volume', 'adj_close', 'text', 'label', 'input_ids', 'attention_mask', 'prices', 'volumes', 'labels'],
    num_rows: 19059
})
['weekly dow stocks trend $ dis $ wmt $ hd $ gs $ v $ intc $ ibm $ utx $ vz $ unh $ t $ msft $ axp $ jpm $ mrk $ csco $ ko $ cvx @ URL $ vz - a new year means time for new dogs of the dow -> URL stock stocks stockaction', '$ vz - why t-mobile bought verizons spectrum -> URL stock stocks stockaction', '$ vz us stocks-wall st ends flat on caution before u . s . jobs data URL video accumulationdistribution in excel URL doubletop $ v $ vz $ wmt $ xom $ sco $ sqqq $ agq $ qid $ fas $ tna $ iau $ gld $ ewa', '$ vz - messaging app market still has room for massive growth -> URL stock stocks stockaction $ vz will t-mobile really take over the wireless industry ? URL']




In [78]:
# Build the training loader, then print the tensor shapes of one batch as a smoke test.
train_dataloader = DataLoader(
    hf_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=custom_collate_fn,
)
for ex in train_dataloader:
    for field in ("input_ids", "attention_mask", "prices", "volumes", "labels"):
        print(ex[field].shape)
    # Only the first batch is inspected.
    break

torch.Size([2, 4, 256])
torch.Size([2, 4, 256])
torch.Size([2, 4, 1])
torch.Size([2, 4, 1])
torch.Size([2, 4])


In [15]:
# Test of the encode -> decode round trip (done).
# NOTE(review): this rebinds `tokenizer` with the same checkpoint name that the
# earlier tokenizer cell already loads, so the call looks redundant -- confirm
# before removing.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` after passing each entry through ``preprocess``.

    Both ``preprocess`` and ``tokenizer`` are looked up as globals. When the
    tokenizer is a fast tokenizer, the returned encoding additionally gets a
    ``"word_ids"`` entry holding ``word_ids(i)`` for every encoded sequence.
    """
    cleaned_texts = list(map(preprocess, examples["text"]))
    encoding = tokenizer(cleaned_texts)
    if not tokenizer.is_fast:
        return encoding
    sequence_count = len(encoding["input_ids"])
    encoding["word_ids"] = [encoding.word_ids(idx) for idx in range(sequence_count)]
    return encoding
# Tokenize two records (indices 1 and 2) of the "train" split.
# NOTE(review): `dataset` is not defined anywhere in the visible notebook and the
# saved output of this cell is a NameError; it presumably came from a cell that
# no longer exists -- confirm and restore its definition.
result = tokenize_function(dataset["train"][1:3])

NameError: name 'dataset' is not defined

In [16]:
# Display the encoding stored in `result` (bare last expression, so the notebook renders it).
result

{'input_ids': [[0, 9713, 3263, 1215, 47955, 4819, 9, 2350, 18, 3748, 5182, 4246, 68, 10, 102, 2911, 68, 885, 3892, 282, 68, 213, 2154, 68, 784, 571, 506, 721, 2050, 3964, 1258, 1735, 17066, 3923, 6031, 27586, 918, 111, 111, 33000, 2], [0, 405, 705, 40, 2501, 15162, 33000, 68, 10, 102, 2911, 15162, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'word_ids': [[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 11, 11, 12, 13, 13, 13, 14, 15, 15, 16, 17, 17, 17, 18, 18, 18, 18, 19, 20, 20, 20, 20, 20, 21, 22, 23, None], [None, 0, 0, 1, 2, 3, 4, 5, 6, 6, 6, 7, None]]}

In [28]:
# Round-trip check: decode the first encoded sequence back to text, then show
# record 1 of the "train" split for comparison.
first_sequence_ids = result["input_ids"][0]
print(tokenizer.decode(first_sequence_ids))
dataset["train"][1]

<s>rt @user summary of yesterday's webcast featuring $ aapl $ wynn $ goog $ lgf tradereducation options hedgingstrategies - - URL</s>


{'text': ['rt',
  'AT_USER',
  'summary',
  'of',
  "yesterday's",
  'webcast',
  'featuring',
  '$',
  'aapl',
  '$',
  'wynn',
  '$',
  'goog',
  '$',
  'lgf',
  'tradereducation',
  'options',
  'hedgingstrategies',
  '-',
  '-',
  'URL'],
 'created_at': 'Wed Jan 01 03:29:29 +0000 2014',
 'user_id_str': '1933063572'}