In [2]:
import ast
import gzip
import json
import os
import random
from collections import defaultdict
from typing import *

import numpy as np
from tqdm import tqdm

In [3]:
import torch
from transformers import BertConfig,BertModel,BertForSequenceClassification,BertTokenizer
# Device string: "cuda" when torch reports an available CUDA device, otherwise "cpu".
GPU = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
from torch.utils.data import Dataset,DataLoader

In [5]:
# Number of review sentences to keep in the sampled subset
# (passed to sample_sub_spoiler_set as `sentence_cnt`).
SUBSET_SENTENCE_CNT = 100000
# Sentences shorter than this many characters are skipped during sampling.
VALID_SENTENCE_LEN_IN_CHAR_THS = 40

In [6]:
def sample_sub_spoiler_set(sentence_cnt:int,seed:int=42)->List[dict]:
    """Draw a reproducible uniform sample of review sentences from the Goodreads spoiler dump.

    The sample is cached under ``./sampled_datasets/`` (one ``repr``-ed dict per
    line, keyed by ``sentence_cnt`` and ``seed``); if that file already exists it
    is loaded and returned without touching the raw dump.

    Otherwise the raw JSON-lines file is streamed once and reservoir sampling
    keeps at most ``sentence_cnt`` sentences, each seen sentence being equally
    likely to end up in the result. Sentences shorter than
    ``VALID_SENTENCE_LEN_IN_CHAR_THS`` characters are ignored.

    Args:
        sentence_cnt: Maximum number of sentences to keep.
        seed: Seed for the global ``random`` module; also part of the cache name.

    Returns:
        A list of dicts with keys ``label``, ``review_sentence``, ``book_id``
        and ``rating``.
    """
    # Seed the global RNG on every call (cache hit included), as callers may rely on it.
    random.seed(seed)
    path = f"./sampled_datasets/review_spoiler_dataset_l{sentence_cnt}_s{seed}.txt"
    if os.path.exists(path):
        # The cache holds plain literals, so parse them as literals instead of
        # executing the file's content with eval().
        with open(path,encoding="utf-8") as fin:
            return [ast.literal_eval(line) for line in fin if line.strip()]

    res = list()
    viewed_sample_cnt = 0
    with open("./goodreads_reviews_spoiler.json/goodreads_reviews_spoiler.json",encoding="utf-8") as fin:
        # Stream the dump line by line rather than loading it all into memory.
        for line in tqdm(fin):
            if not line.strip():
                continue
            # Real JSON parsing: rewriting "true"/"false" textually and eval()-ing
            # the line would also rewrite those substrings inside the review text
            # (e.g. "untrue" -> "unTrue") and fail on `null`.
            datum = json.loads(line)
            book_id = datum['book_id']
            rating = datum['rating']
            for label,sentence in datum['review_sentences']:
                if len(sentence)<VALID_SENTENCE_LEN_IN_CHAR_THS:
                    continue
                viewed_sample_cnt+=1
                record = {
                    "label":label,
                    "review_sentence":sentence,
                    "book_id":book_id,
                    "rating":rating,
                }
                if len(res)<sentence_cnt:
                    # Reservoir not full yet: keep every eligible sentence.
                    res.append(record)
                    continue
                # Reservoir full: the k-th eligible sentence replaces a random slot
                # with probability sentence_cnt / k.
                i = random.randint(0,viewed_sample_cnt-1)
                if i<sentence_cnt:
                    res[i]=record

    # Create the cache directory so the expensive pass above is not lost to a
    # missing-folder error on write.
    os.makedirs(os.path.dirname(path),exist_ok=True)
    # Explicit UTF-8 keeps the cache independent of the platform default encoding.
    with open(path,"w",encoding="utf-8") as fout:
        fout.writelines(repr(datum)+"\n" for datum in res)
    return res
# Build the sampled subset (or load it from the on-disk cache) with the default seed.
spoiler_dataset = sample_sub_spoiler_set(SUBSET_SENTENCE_CNT)

In [24]:
# Peek at the first sampled record to check its fields.
spoiler_dataset[0]

{'label': 1,
 'review_sentence': 'The magus said, "I think if you took the time to look, you might see that over the space of a year you turned into the greatest folk hero Eddis has ever known."',
 'book_id': '40158',
 'rating': 5}

In [7]:
# Pretrained checkpoint name shared by the tokenizer and the classifier.
model_type = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_type)
# The classification head is newly initialized (see the warning this cell prints),
# so the model must be fine-tuned before its predictions mean anything.
model = BertForSequenceClassification.from_pretrained(model_type)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Smoke test: tokenize two sentences of different lengths into one padded batch of
# PyTorch tensors and run a forward pass through the (not yet fine-tuned) classifier.
sentences = ["I love you. Springboot is a java web framework.","What is this?"]
input_dict = tokenizer(sentences,padding=True,truncation=True,max_length=256,return_tensors="pt")
model(**input_dict)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0940, -0.3300],
        [-0.1193, -0.5823]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [25]:
class ListDataset(Dataset):
    """Dataset over two parallel sequences of inputs and targets.

    Item ``i`` is the pair ``(xs[i], ys[i])``.
    """

    def __init__(self,xs,ys) -> None:
        """Store the two sequences.

        Args:
            xs: Sequence of inputs.
            ys: Sequence of targets, aligned with ``xs``.

        Raises:
            ValueError: If ``xs`` and ``ys`` differ in length.
        """
        super().__init__()
        if len(xs)!=len(ys):
            # Report both lengths so a misaligned dataset is easy to diagnose.
            raise ValueError(
                f"xs and ys must have the same length, got {len(xs)} and {len(ys)}"
            )
        self.xs = xs
        self.ys = ys

    def __len__(self) -> int:
        """Return the number of (input, target) pairs."""
        return len(self.xs)

    def __getitem__(self, index) -> Any:
        """Return the ``(xs[index], ys[index])`` pair."""
        return self.xs[index],self.ys[index]

def get_spoiler_dataset(spoiler_dataset_raw:List[dict])->Dataset:
    """Wrap raw sample records in a ``ListDataset`` of (sentence, label) pairs.

    Each record contributes its ``'review_sentence'`` as the input and its
    ``'label'`` as the target, in the order the records are given.
    """
    pairs = [
        (record['review_sentence'],record['label'])
        for record in spoiler_dataset_raw
    ]
    sentences = [sentence for sentence,_ in pairs]
    labels = [label for _,label in pairs]
    return ListDataset(sentences,labels)

In [26]:
# Wrap the sampled records in a torch Dataset of (sentence, label) pairs.
spoiler_dataset_processed = get_spoiler_dataset(spoiler_dataset)

In [29]:
# Tiny shuffled batches, just to inspect what the loader yields.
data_loader = DataLoader(spoiler_dataset_processed,batch_size=2,shuffle=True)

# Print only the first batch, then stop.
for b_x,b_y in data_loader:
    print(b_x,b_y)
    break

('And it\'s taking us out of the world."', 'I wish there were in AI character.') tensor([0, 0])
