In [7]:
# Project root; the remaining paths are all derived from it.
maindir = "/mnt/md0/data/avo727/PromptTuning"
# Data directory and predictions directory (the latter is not used in the cells shown here).
datadir = maindir + "/CWNdata"
preddir = maindir + "/model_predictions"
# CSV file that the loading cell reads into a DataFrame.
datafile = datadir + "/refined_dot_type_5.0.csv"

In [8]:
import pandas as pd

# Read the annotation table from `datafile` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=datafile)
# Target model input layout: [CLS]instance[SEP]label[SEP]
# Show the first five rows as a sanity check.
df.head(n=5)

Unnamed: 0,type_class,eng word,word,pos,instance,src,dot_type_2,dot_type_1,label,zh_type_class,zh_dot_type,zh_dot_type_2,zh_dot_gloss,is_2choice,is_one_ans
0,"act,proposition",allegation,聲稱,nom,法國希望美國相信他們的<聲稱>，至少他們也能從伊拉克石油業受益。,CWN2,proposition,proposition,1,"行為,命題",命題,命題,邏輯學上指表達判斷的語句。通常以直陳語句或假定句表達。,True,True
1,"act,proposition",allegation,聲稱,nom,新澤西肯尼迪健康系統傳染病項目主管駁斥香港文匯報﹕「這是完全錯誤的<聲稱>。去年二月死去的女...,CWN2,proposition,proposition,1,"行為,命題",命題,命題,邏輯學上指表達判斷的語句。通常以直陳語句或假定句表達。,True,True
2,"act,proposition",allegation,理論,Na,你這個<理論>很奇怪，是不是朋友，你自已要分清楚這跟借不借錢沒什麼關係吧！,CWN2,proposition,proposition,1,"行為,命題",命題,命題,邏輯學上指表達判斷的語句。通常以直陳語句或假定句表達。,True,True
3,"act,proposition",allegation,理論,Na,我真的很喜歡在言語上和人爭辯，腦子裡自然而然湧出數百個理由支持自己的論點，有時候還會加點無厘...,CWN2,act,act,0,"行為,命題",行為,行為,基於個人的意志而具體表現於外的舉止動作。,True,True
4,"act,proposition",allegation,理論,Na,目前躁鬱症的成因有許多<理論>，其中以生物學病理病因較受重視。,CWN2,proposition,proposition,1,"行為,命題",命題,命題,邏輯學上指表達判斷的語句。通常以直陳語句或假定句表達。,True,True


In [None]:
# Gather every distinct class name: each `zh_type_class` cell holds a
# comma-separated list, so split it and pool the pieces into one set.
type_class = {
    class_name
    for cell in df['zh_type_class']
    for class_name in cell.split(',')
}
print(type_class)

{'有形的', '表演', '人類', '資訊', '物質', '過程', '狀態', '命題', '聲音', '音樂', '機構', '作者', '價值', '作品', '結果', '行為', '事件', '液體', '屬性'}


In [None]:
import pickle

# Sort before freezing into a tuple: iterating a set of strings follows hash
# order, which Python randomizes per process, so an unsorted tuple would assign
# each class a different index after every kernel restart.
zh_type_class = tuple(sorted(type_class))

# Write the ordered class inventory to disk.
filepath = f'{datadir}/dot_type_lexicon/zh_type_classes.pkl'
with open(filepath, 'wb') as f:
    pickle.dump(zh_type_class, f)

In [None]:
# Inverse lookup: class name -> its position in `zh_type_class`.
reverse_classdict = {
    class_name: position
    for position, class_name in enumerate(zh_type_class)
}

In [None]:
def make_class_selector(xes):
    """Turn a comma-separated string of class names into class indices.

    Args:
        xes: Class names joined by ``','``; each must be a key of
            ``reverse_classdict``.

    Returns:
        A list with one single-element list per class name, holding that
        class's index from ``reverse_classdict`` (e.g. ``[[3], [7]]``).

    Raises:
        KeyError: If a name is not present in ``reverse_classdict``.
    """
    selector = []
    for class_name in xes.split(','):
        # Each index is wrapped in its own list, matching the original layout.
        selector.append([reverse_classdict[class_name]])
    return selector
# Add a per-row column of class indices derived from `zh_type_class`.
df['class_selector'] = df['zh_type_class'].apply(make_class_selector)

## PyTorch datasets

In [None]:
!pip -q install Datasets
!pip -q install transformers

In [None]:
import os
import gc
import copy
import numpy as np
import transformers
from transformers import BertTokenizerFast,BertForSequenceClassification, BertModel, BertConfig

from datasets import Dataset, load_metric
import datasets

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

In [None]:
# Load the fast BERT tokenizer for the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-chinese',
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [None]:
# Preprocessing settings read by preprocess_function:
#   max_len   - value read as the maximum sequence length
#   numchoice - number of candidate classes paired with each instance
config = dict(
    max_len=512,
    numchoice=2,
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch as (instance, candidate class) sentence pairs.

    Every instance is paired with each comma-separated class in its
    ``zh_type_class`` entry, giving inputs laid out as
    ``[CLS]instance[SEP]label[SEP]``. All pairs are tokenized in one flat call
    and then regrouped so that each example owns ``config['numchoice']``
    encodings.

    Args:
        examples: Batch mapping with at least the keys ``"instance"`` and
            ``"zh_type_class"``, each a list with one entry per example.

    Returns:
        A dict mapping every key produced by the tokenizer to a list with one
        entry per example; each entry is a list of ``numchoice`` encodings.

    Raises:
        ValueError: If an example does not list exactly ``numchoice`` classes,
            which would otherwise misalign contexts and candidates.
    """
    numchoice = config['numchoice']
    max_length = config['max_len']  # reserve positions for n_tokens

    # Build the flat, aligned sentence-pair lists directly instead of creating
    # nested lists and flattening them with the quadratic sum(list, []).
    first_sentences = []
    second_sentences = []
    for context, tpclass in zip(examples["instance"], examples["zh_type_class"]):
        candidates = tpclass.split(',')
        if len(candidates) != numchoice:
            raise ValueError(
                f"expected {numchoice} candidate classes but got "
                f"{len(candidates)}: {tpclass!r}"
            )
        # Repeat the context once per candidate so the two lists stay aligned.
        first_sentences.extend([context] * numchoice)
        second_sentences.extend(candidates)

    # Tokenize; max_length is now actually forwarded so config['max_len']
    # controls truncation instead of being silently ignored.
    tokenized_examples = tokenizer(first_sentences,
                                   second_sentences,
                                   padding=False,
                                   truncation=True,
                                   max_length=max_length,
                                   return_token_type_ids=True)

    # Un-flatten: regroup every run of `numchoice` encodings under its example.
    return {k: [v[i:i + numchoice] for i in range(0, len(v), numchoice)]
            for k, v in tokenized_examples.items()}

In [None]:
# Keep only the rows whose `is_2choice` flag is True.
df_ = df[df["is_2choice"].eq(True)]
TESTSIZE = 0.2
SEED = 42  # fixed seed so the train/test partition is the same on every run
dataset = Dataset.from_pandas(df_)
# Select the splits by key rather than relying on the order of .values().
split = dataset.train_test_split(test_size=TESTSIZE, seed=SEED)
trainset, testset = split["train"], split["test"]
final_dataset = datasets.DatasetDict({"train": trainset, "test": testset})

In [None]:
# Spot-check: encode the first 300 training rows, then decode every candidate
# pairing of one example back into text.
numchoice = config['numchoice']
examples = final_dataset["train"][:300]
features = preprocess_function(examples)
idx = 111
choice_ids = features["input_ids"][idx]
[tokenizer.decode(choice_ids[k]) for k in range(numchoice)]

['[CLS] 二 十 五 歲 時 讀 < 柏 拉 圖 > ， 與 五 十 二 歲 時 讀 < 柏 拉 圖 > ， 感 受 一 定 不 同 。 [SEP] 作 者 [SEP]',
 '[CLS] 二 十 五 歲 時 讀 < 柏 拉 圖 > ， 與 五 十 二 歲 時 讀 < 柏 拉 圖 > ， 感 受 一 定 不 同 。 [SEP] 作 品 [SEP]']

## Encoding & saving

In [None]:
# Run preprocess_function over both splits in batched mode.
encoded_dataset = final_dataset.map(preprocess_function, batched=True)

# Pickle the encoded splits ("PT" in the file name stands for prompt-tuning).
fpath = f"{datadir}/PT2_encoded_dataset.pkl"
with open(fpath, 'wb') as out_file:
    pickle.dump(encoded_dataset, out_file)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]