In [1]:
import gzip
from tqdm import tqdm


def parse(path):
    res = list()
    g = gzip.open(path, "r",encoding="utf-8")
    for l in g:
        res.append(eval(l))
    return res

def parse_raw(path):
    res = list()
    with open(path,encoding="utf-8") as fin:
        lines = fin.readlines()
        for line in tqdm(lines):
            line = line.replace("true","True")
            line = line.replace("false","False")
            res.append(eval(line))
    return res

def split_raw_file(path):
    # res = list()
    with open(path,encoding="utf-8") as fin:
        lines = fin.readlines()
    for i in range(10):
        dot_pos = path.rfind(".")
        save_file_path = path[:dot_pos]+str(i)+path[dot_pos:]
        with open(save_file_path,"w+",encoding="utf-8") as fout:
            fout.writelines(lines[(len(lines)*i)//10:(len(lines)*(i+1))//10])

In [2]:
def log_text_to_file(text:str)->None:
    with open("./output/log.txt","a") as fout:
        fout.write(text+"\n")

In [3]:
import numpy as np
import os
import random
from collections import defaultdict
from typing import *
from predeal_dataset import *

In [4]:
import torch
from transformers import BertConfig,BertModel,BertForSequenceClassification,BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from torch.utils.data import Dataset,DataLoader,random_split

In [6]:
BERT_EPOCH = 4
BERT_BATCH_SIZE=5
BERT_MAX_LEN = 384
BERT_LR = 1e-6
BERT_L2 = 0.0001

In [7]:
GPU = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
# total_sentence_cnt = 0
# with open("./goodreads_reviews_spoiler.json/goodreads_reviews_spoiler.json",encoding="utf-8") as fin:
#     lines = fin.readlines()
#     for line in tqdm(lines):
#         line = line.replace("true","True")
#         line = line.replace("false","False")
#         datum = eval(line)
#         for label,sentence in datum['review_sentences']:
#             if len(sentence)>=VALID_SENTENCE_LEN_IN_CHAR_THS:
#                 total_sentence_cnt+=1

In [9]:
# split_raw_file("./goodreads_books.json/goodreads_books.json")
# split_raw_file("./goodreads_reviews_spoiler.json/goodreads_reviews_spoiler.json")

In [10]:
spoiler_dataset = sample_sub_spoiler_set(SUBSET_SENTENCE_CNT)

In [11]:
pos_cnt = sum(d["label"] for d in spoiler_dataset)

In [12]:
pos_cnt

3330

In [13]:
len(spoiler_dataset)

50000

In [14]:
eval

<function eval(source, globals=None, locals=None, /)>

In [15]:
def get_correspondent_books_info(spoiler_dataset:List[dict],save_path:str)->List[dict]:
    if os.path.exists(save_path):
        res = list()
        with open(save_path,encoding="utf-8") as fin:
            lines = fin.readlines()
            for l in lines:
                res.append(eval(l))
        return res
    books_required = set()
    for d in spoiler_dataset:
        books_required.add(d["book_id"])
    res = list()
    for i in range(10):
        with open(f"./goodreads_books.json/goodreads_books{i}.json",encoding="utf-8") as fin:
            lines = fin.readlines()
            for line in tqdm(lines):
                d = eval(line)
                book_id = d["book_id"]
                if not book_id in books_required:
                    continue
                description = d["description"]
                title = d["title"]
                res.append({"book_id":book_id,"description":description,"title":title})
        print(f"altogether {len(res)} books has been found after searching {i+1} book info split files")
    try:
        with open(save_path,"w+",encoding="utf-8") as fout:
            for datum in res:
                fout.write(repr(datum)+"\n")
    except BaseException:
        pass
    return res            

books_info = get_correspondent_books_info(spoiler_dataset,"./sampled_datasets/s42_spoilers_correspondent_book.txt")

In [16]:
books_required = set()
for d in spoiler_dataset:
    books_required.add(d["book_id"])
len(books_required)

16067

In [17]:
model_type = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_type)

In [18]:
statistic_arr = [0]*101
for i in tqdm(range(len(spoiler_dataset[:len(spoiler_dataset)//10]))):
    l = len(tokenizer(spoiler_dataset[i]['review_sentence'])['input_ids'])
    if l>=1000:
        l=1000
    statistic_arr[l//10]+=1

100%|██████████| 5000/5000 [00:26<00:00, 188.78it/s]


In [19]:
statistic_arr2 = [0]*101
for i in tqdm(range(len(spoiler_dataset[:len(spoiler_dataset)//10]))):
    l = len(spoiler_dataset[i]['review_sentence'])
    if l>=5000:
        l=5000
    statistic_arr2[l//50]+=1

100%|██████████| 5000/5000 [00:00<00:00, 636600.19it/s]


In [20]:
# class ListDataset(Dataset):
#     def __init__(self,xs,ys) -> None:
#         super().__init__()
#         if len(xs)!=len(ys):
#             raise ValueError
#         self.xs = xs
#         self.ys = ys
    
#     def __len__(self):
#         return len(self.xs)
    
#     def __getitem__(self, index) -> Any:
#         return self.xs[index],self.ys[index]
    
class ListDataset(Dataset):
    def __init__(self,*lists) -> None:
        super().__init__()
        if len(lists)==0:
            raise ValueError("Expecting at least one list")
        l = len(lists[0])
        for i,li in enumerate(lists):
            if not isinstance(li,(list,tuple,np.ndarray,torch.Tensor)):
                raise ValueError(f"expecting input to be list,tuple,numpy-array or torch's tensor, actually get {type(li)} at {i}-th argument")
            if len(li)!=l:
                raise ValueError(f"length of {i}-th argument is {len(li)}, length of 0-th argument is {l}, they don't match")
        self.lists = lists
        self.l = l
    
    def __len__(self):
        return self.l
    
    def __getitem__(self, index) -> Any:
        return tuple(map(lambda l:l[index],self.lists))


def get_spoiler_dataset(spoiler_dataset_raw:List[dict])->Dataset:
    xs = list()
    ys = list()
    for datum in spoiler_dataset_raw:
        xs.append(datum['review_sentence'])
        ys.append(datum['label'])
    return ListDataset(xs,ys)

In [21]:
spoiler_dataset_processed = get_spoiler_dataset(spoiler_dataset)

In [22]:

train_dataset, valid_dataset, test_dataset = random_split(spoiler_dataset_processed, [TRAIN_SET_CNT, VALID_SET_CNT, TEST_SET_CNT])

In [23]:
len(valid_dataset)

5000

In [24]:
def to_device(d:dict,device=GPU)->dict:
    for k in d:
        d[k]=d[k].to(device)
    return d

In [25]:


weights = [pos_cnt/SUBSET_SENTENCE_CNT,1-pos_cnt/SUBSET_SENTENCE_CNT]
weights = torch.tensor(weights, dtype=torch.float).to(GPU)
loss_func = torch.nn.CrossEntropyLoss(weights)

In [26]:
# data_loader = DataLoader(train_dataset,batch_size=3,shuffle=True)
# for batch in data_loader:
#     print(batch)
#     break

In [27]:
valid_set_loader = DataLoader(valid_dataset,batch_size=BERT_BATCH_SIZE,shuffle=False)

In [28]:
def get_performance_info(y_actual,y_predict):
    y_actual = np.array(y_actual)
    y_predict = np.array(y_predict)
    y_actual = y_actual.reshape((-1,))
    y_predict = y_predict.reshape((-1,))
    TP = np.sum((y_actual == 1) & (y_predict == 1))
    FP = np.sum((y_actual == 0) & (y_predict == 1))
    TN = np.sum((y_actual == 0) & (y_predict == 0))
    FN = np.sum((y_actual == 1) & (y_predict == 0))
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    TNR = TN / (TN + FP)
    FNR = FN / (TP + FN)
    BER = 1 - (0.5 * (TPR + TNR))
    accu = np.sum(y_actual==y_predict)/len(y_actual)
    return accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER

def evaluate(model,dataloader):
    y_pred = list()
    y_label = list()
    with torch.no_grad():
        for b_x,b_y in tqdm(dataloader):
            input_dict = tokenizer(b_x,padding=True,truncation=True,max_length=BERT_MAX_LEN,return_tensors="pt")
            output = (model(**to_device(input_dict,GPU)).logits).to("cpu").detach()
            y_label.extend(b_y.numpy().tolist())
            y_pred.extend(torch.argmax(output,dim=1).numpy().tolist())
    return get_performance_info(y_label,y_pred)

e = 2.718281828

def get_best_ths_with_ber(pred_prop_with_label): 
    pred_prop_with_label.sort(reverse=True)
    valid_set_pos_cnt = sum(tup[1] for tup in pred_prop_with_label)
    valid_set_neg_cnt = len(pred_prop_with_label)-valid_set_pos_cnt
    best_ths = 1.0
    best_ber = 0.5
    curr_false_positive = 0
    curr_false_negative = valid_set_pos_cnt
    for (prob,label) in pred_prop_with_label:
        ths = prob-0.00001
        if label==1:
            curr_false_negative-=1
        else:
            curr_false_positive+=1
        ber = 0.5*(curr_false_negative/valid_set_pos_cnt+curr_false_positive/valid_set_neg_cnt)
        if ber<best_ber:
            best_ber = ber
            best_ths = ths
    return best_ths,best_ber

def evaluate_dynamic_prob_ths(model,dataloader):
    e = 2.718281828
    y_pred_logits = list()
    y_label = list()
    with torch.no_grad():
        for b_x,b_y in tqdm(dataloader):
            input_dict = tokenizer(b_x,padding=True,truncation=True,max_length=BERT_MAX_LEN,return_tensors="pt")
            output = (model(**to_device(input_dict,GPU)).logits).to("cpu").detach()
            y_label.extend(b_y.numpy().tolist())
            y_pred_logits.extend(output.numpy().tolist())
    y_pos_prob_pred = list(map(lambda logits:e**logits[1]/(e**logits[0]+e**logits[1]),y_pred_logits))
    pred_prob_with_label = list(zip(y_pos_prob_pred,y_label))
    judging_ths,ber = get_best_ths_with_ber(pred_prob_with_label)
    y_pred = list(int(p>judging_ths) for p in y_pos_prob_pred)
    return *get_performance_info(y_label,y_pred),judging_ths


In [29]:
# evaluate_dynamic_prob_ths(model,valid_set_loader)

In [30]:
model = BertForSequenceClassification.from_pretrained(model_type)
model.to(GPU)
optimizer = torch.optim.Adam(model.parameters(),BERT_LR,weight_decay=BERT_L2)
optimizer_warmup = torch.optim.Adam(model.classifier.parameters(),0.01,weight_decay=0.01)

data_loader = DataLoader(train_dataset,batch_size=BERT_BATCH_SIZE,shuffle=True)

log_text_to_file("start training BertForSequenceClassification model , dataset is review_text only")
for e in range(BERT_EPOCH):
    for step,(b_x,b_y) in enumerate(tqdm(data_loader)):
        input_dict = tokenizer(b_x,padding=True,truncation=True,max_length=BERT_MAX_LEN,return_tensors="pt")
        output = model(**to_device(input_dict,GPU)).logits
        # print(output)
        b_y = b_y.to(GPU)
        loss = loss_func(output,b_y)
        if step>=100 or e>=1:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            optimizer_warmup.zero_grad()
            loss.backward()
            optimizer_warmup.step()
    accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER,ths = evaluate_dynamic_prob_ths(model,valid_set_loader)
    print(f"ber after epoch {e+1}: {BER}")
    msg = "    %7s%7s%7s%7s%7s%7s%7s\n     %.4f %.4f %.4f %.4f %.4f %.4f %.4f "%("accu","ber","tpr","fpr","tnr","fnr","ths",accu,BER,TPR,FPR,TNR,FNR,ths)
    # print(accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER)
    print(msg)
    save_path = f"./output/review_only_bert_e{e}_b{BERT_BATCH_SIZE}_lr{BERT_LR}_l2{BERT_L2}_ber{BER:.4f}"
    torch.save(model.state_dict(),save_path)
    log_text_to_file(f"    ber after epoch {e+1}: {BER}, model params saved to {save_path}, other performance infos:{accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER}")
    log_text_to_file(msg)
    

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  7%|▋         | 590/8000 [03:27<42:11,  2.93it/s] 

In [43]:
model_path = "./output/review_only_bert_e1_b5_lr1e-06_l20.0001_ber0.3214"
model = BertForSequenceClassification.from_pretrained(model_type)
model.load_state_dict(torch.load(model_path))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [47]:
model.to(GPU)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [48]:
y_label = list()
y_logits = list()
with torch.no_grad():
    for b_x,b_y in tqdm(DataLoader(valid_dataset,batch_size=BERT_BATCH_SIZE,shuffle=False)):
        input_dict = tokenizer(b_x,padding=True,truncation=True,max_length=BERT_MAX_LEN,return_tensors="pt")
        output = model(**to_device(input_dict,GPU)).logits
        y_label.extend(b_y.numpy().tolist())
        y_logits.extend(output.to("cpu").detach().numpy().tolist())

100%|██████████| 1000/1000 [02:00<00:00,  8.30it/s]


In [55]:
e = 2.718281828
pred_prop_with_label = [(e**logit_pos/(e**logit_neg+e**logit_pos),lbl) for (logit_neg,logit_pos),lbl in zip(y_logits,y_label)]

In [59]:
pred_prop_with_label.sort(reverse=True)

In [70]:
valid_set_pos_cnt = sum(tup[1] for tup in valid_dataset)
valid_set_neg_cnt = VALID_SET_CNT - valid_set_pos_cnt

In [75]:
get_best_ths_with_ber(pred_prop_with_label)

(0.16590920771243706, 0.23056044209890364)

In [66]:
pred_prop_with_label[520:540]

[(0.4346359008445251, 0),
 (0.43437236011218483, 0),
 (0.4343054224677632, 1),
 (0.43396178141716973, 1),
 (0.433760607269941, 1),
 (0.43355051661465943, 0),
 (0.43293000668913073, 0),
 (0.4327360277277486, 0),
 (0.43263807166233326, 0),
 (0.43262990135057205, 0),
 (0.4323402543622263, 0),
 (0.4322489788072485, 0),
 (0.43189199179049115, 0),
 (0.4311957995221293, 0),
 (0.43095367614642016, 0),
 (0.43048090113385756, 0),
 (0.42984434749414746, 0),
 (0.42977580631168005, 0),
 (0.4295807477828543, 0),
 (0.4292543736454375, 0)]

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [158]:
cut_sentence_puncts = [ch for ch in string.punctuation if ch not in "'()[]{}"]
def cut_sub_sentences(sentence:str)->List[str]:
    sentence = sentence.replace("\n"," ").replace("\t"," ")
    res:List[str] = list()
    res.append(sentence)
    for pun in cut_sentence_puncts:
        res_new = list()
        for sub_sen in res:
            res_new.extend(sub_sen.split(pun))
        res = res_new
    res = list(filter(lambda string:len(string)>0,map(lambda x:x.strip(),res)))
    return res

def fetch_longest_k_sub_sentences(sentence:str,k:int=1)->List[str]:
    sentences = cut_sub_sentences(sentence)
    if len(sentences)<=k:
        return sentences
    sentences = list(enumerate(sentences))
    sentences.sort(key=lambda tup:len(tup[1]),reverse=True)
    res = sentences[:k]
    res.sort()
    return list(map(lambda tup:tup[1],res))

def connect_title_and_desc(title:str,description:str,k:int=1,*args,**kwargs)->str:
    res = title.strip()
    if len(res)==0:
        return ""
    if not res[-1] in cut_sentence_puncts:
        res+="."
    if k==0:
        return res
    if description is None:
        return res
    if len(description)==0:
        return res
    res+=" "
    longest_sub_sen = fetch_longest_k_sub_sentences(description,k)
    for desc_sub_sen in longest_sub_sen[:-1]:
        res+=desc_sub_sen+", "
    res+=longest_sub_sen[-1]+"."
    return res

In [157]:
# cut_sub_sentences(books_info[0]["description"])
connect_title_and_desc(**books_info[7])

IndexError: list index out of range

In [159]:
book_info_dict:Dict[str,str] = dict()
for book_info in tqdm(books_info):
    book_info_dict[book_info['book_id']]=connect_title_and_desc(**book_info)

100%|██████████| 16067/16067 [00:01<00:00, 11321.13it/s]


In [165]:
spoiler_dataset[0]

{'label': 0,
 'book_id': '24473763',
 'rating': 4,
 'review_sentence': 'Can we please have more nice guys in romance fantasy fiction? I am so sick and tired of Alpha assholes. Seriously, they are a grade A yawn and I\'m not frankly into the love turn hate trope. Some books do it well but for the most part, the majority of them are cringeworthy where the Hero should be reported for Abuse rather than be swoon over. So when a book touts a nice respectful guy as the hero, I\'m all ears. And this book DELIVERS. It effing delivers a swoon-worthy believable romance with a nice guy who RESPECTS and CHERISHES his woman. Plus he\'s also hot when he get mad at other people for stepping on his woman. The romance was written so well and brings to life the phrase "falling in love". Yeah, I\'m into that. Gimme more. It does have it flaws like the plot is basically non-existent but who cares, it\'s really about these two. Can\'t wait for book 2!!!! '}

In [166]:
def get_spoiler_dataset_with_book_desc(spoiler_dataset_raw:List[dict],seperator:str="[SEP]")->Dataset:
    xs = list()
    ys = list()
    for datum in spoiler_dataset_raw:
        xs.append(f"{book_info_dict[datum['book_id']]} {seperator} {datum['review_sentence']}")
        ys.append(datum['label'])
    return ListDataset(xs,ys)

spoiler_dataset_processed_with_book_desc = get_spoiler_dataset_with_book_desc(spoiler_dataset)

In [167]:
train_dataset_wd, valid_dataset_wd, test_dataset_wd = random_split(spoiler_dataset_processed, [TRAIN_SET_CNT, VALID_SET_CNT, TEST_SET_CNT])

In [168]:
valid_set_loader_wd = DataLoader(valid_dataset_wd,batch_size=BERT_BATCH_SIZE,shuffle=False)

In [172]:
model_wd = BertForSequenceClassification.from_pretrained(model_type)
model_wd.to(GPU)
optimizer = torch.optim.Adam(model_wd.parameters(),BERT_LR,weight_decay=BERT_L2)
optimizer_warmup = torch.optim.Adam(model_wd.classifier.parameters(),0.01,weight_decay=0.01)

data_loader_wd = DataLoader(train_dataset_wd,batch_size=BERT_BATCH_SIZE,shuffle=True)

log_text_to_file("start training BertForSequenceClassification model , dataset is review_text_wd")
for e in range(1,BERT_EPOCH):
    for step,(b_x,b_y) in enumerate(tqdm(data_loader_wd)):
        input_dict = tokenizer(b_x,padding=True,truncation=True,max_length=BERT_MAX_LEN,return_tensors="pt")
        output = model_wd(**to_device(input_dict,GPU)).logits
        # print(output)
        b_y = b_y.to(GPU)
        loss = loss_func(output,b_y)
        if step>=100 or e>=1:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            optimizer_warmup.zero_grad()
            loss.backward()
            optimizer_warmup.step()
    accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER,ths = evaluate_dynamic_prob_ths(model_wd,valid_set_loader_wd)
    print(f"ber after epoch {e+1}: {BER}")
    msg = "    %7s%7s%7s%7s%7s%7s%7s\n     %.4f %.4f %.4f %.4f %.4f %.4f %.4f "%("accu","ber","tpr","fpr","tnr","fnr","ths",accu,BER,TPR,FPR,TNR,FNR,ths)
    # print(accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER)
    print(msg)
    save_path = f"./output/review_wd_bert_e{e}_b{BERT_BATCH_SIZE}_lr{BERT_LR}_l2{BERT_L2}_ber{BER:.4f}"
    torch.save(model_wd.state_dict(),save_path)
    log_text_to_file(f"    ber after epoch {e+1}: {BER}, model params saved to {save_path}, other performance infos:{accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER}")
    log_text_to_file(msg)

100%|██████████| 8000/8000 [38:04<00:00,  3.50it/s]
100%|██████████| 1000/1000 [01:46<00:00,  9.43it/s]


ber after epoch 2: 0.22667584196392432
       accu    ber    tpr    fpr    tnr    fnr    ths
    0.7484 0.2267 0.8023 0.2556 0.7444 0.1977 0.2148 


100%|██████████| 8000/8000 [38:02<00:00,  3.51it/s]
100%|██████████| 1000/1000 [01:45<00:00,  9.48it/s]


ber after epoch 3: 0.22890046137288156
       accu    ber    tpr    fpr    tnr    fnr    ths
    0.7344 0.2289 0.8138 0.2716 0.7284 0.1862 0.1416 


  7%|▋         | 596/8000 [02:52<35:37,  3.46it/s]


KeyboardInterrupt: 

In [171]:
# accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER,ths = evaluate_dynamic_prob_ths(model_wd,valid_set_loader_wd)
# print(f"ber after epoch {e+1}: {BER}")
# msg = "    %7s%7s%7s%7s%7s%7s%7s\n    %.4f %.4f %.4f %.4f %.4f %.4f %.4f "%("accu","ber","tpr","fpr","tnr","fnr","ths",accu,BER,TPR,FPR,TNR,FNR,ths)
#     # print(accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER)
# print(msg)
# save_path = f"./output/review_wd_bert_e{e}_b{BERT_BATCH_SIZE}_lr{BERT_LR}_l2{BERT_L2}_ber{BER:.4f}"
# torch.save(model_wd.state_dict(),save_path)
# log_text_to_file(f"    ber after epoch {e+1}: {BER}, model params saved to {save_path}, other performance infos:{accu,TP,FP,TN,FN,TPR, FPR, TNR, FNR, BER}")
# log_text_to_file(msg)

100%|██████████| 1000/1000 [01:45<00:00,  9.46it/s]


ber after epoch 3.718281828: 0.24025427566182578
       accu    ber    tpr    fpr    tnr    fnr    ths
    0.733000.240250.790830.271340.728660.209170.28194
