# Download Files

In [None]:
# 因為在Colab上進行運作，所以必須要先將必要的檔案下載至Colab環境中，若在本地端執行可以跳過此區段

In [None]:

! wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1eNfsS5igdw6nKgLeKmX8t7qvGVLXpBc8' -O 'Batch_answers - train_data (no-blank).csv'

# Initialize and Parameters

In [None]:
! pip3 install transformers

import csv
import re
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification

# Regex used to cut text into clauses: each match is a run of characters that
# are not ? ! , . : ; followed by one or more of those punctuation marks.
sep_sentence_regex = '([^?!,.:;]*[?!,.:;]+)'

"""
檔案名稱
"""
# Raw data sets: the labelled training file and the unlabelled test file.
trainingset_name = 'Batch_answers - train_data (no-blank).csv'
testingset_name = 'Batch_answers - test_data(no_label).csv'

# Filtered q and r rows that are fed into training.
trainingset_Q_name = 'training_set_Q.csv'
trainingset_R_name = 'training_set_R.csv'

# BERT parameters: pretrained checkpoint name and the file the model is saved to.
bert_pretrained_model = 'bert-base-uncased'
torch_model_output = 'model.dat'

# Util

In [None]:
# 句子拆分
def getSubsentences(paragraph: str, reg: str = "([^?!,.:;]*[?!,.:;]+)") -> list:
  """Split a paragraph into stripped, non-empty clauses.

  Args:
    paragraph: Text to split. Anything whose type is not exactly ``str``
      (for example a NaN float coming out of pandas) yields ``[]``.
    reg: Regex handed to ``re.split``; the default keeps each clause together
      with its trailing punctuation.

  Returns:
    The clauses in order of appearance. Paragraphs of length 0 or 1 yield ``[]``.
  """
  if type(paragraph) != str or len(paragraph) <= 1:
    return []
  stripped = (chunk.strip() for chunk in re.split(reg, paragraph))
  return [chunk for chunk in stripped if len(chunk) > 0]

# 句子長度計算
def getWords(sentence: str) -> list:
  """Return the space-separated tokens of ``sentence``, dropping blank ones.

  Only the single space character is a separator; tokens that consist solely
  of whitespace (for example a lone tab) are discarded.
  """
  return [token for token in re.split(" ", sentence) if token.strip()]


# Preprocessing

In [None]:
# 資料前處理
def _strip_spaced_urls(text):
    """Remove space-tokenised URLs of the form ``http : //host/...`` from text."""
    tokens = re.split(' ', text)
    skipped = set()
    for i in range(len(tokens)):
        # The i+2 bound keeps a trailing "http" token from indexing past the end.
        if (i + 2 < len(tokens) and tokens[i] == "http"
                and tokens[i+1] == ":" and tokens[i+2][0:2] == "//"):
            # A "..." or "?" right after the URL is dropped together with the
            # token that follows it.
            if i+3 < len(tokens) and tokens[i+3] in ["...", "?"]:
                skipped.update((i+3, i+4))
            skipped.update((i, i+1, i+2))
    cleaned = ""
    for i, token in enumerate(tokens):
        if i not in skipped:
            # No separator is emitted while the result is still empty, so
            # leading blanks are dropped (this is not a plain " ".join).
            if cleaned != "":
                cleaned += ' '
            cleaned += token
    return cleaned


def _merge_clauses(splitrow):
    """Trim edge symbols from each clause and re-join clauses split by mistake."""
    edge_symbols = ["`", "-", "(", ")"]
    lone_marks = ["?", "!", ":", ";", ".", ","]
    realtext = []
    i = 0
    while i < len(splitrow):
        clause = splitrow[i]
        # Strip unwanted leading/trailing symbols. startswith/endswith are safe
        # on the empty string, unlike indexing [0] / [-1].
        while clause != '' and clause[0] in edge_symbols:
            clause = clause[1:]
            if clause.startswith(" "):
                clause = clause[1:]
        while clause != '' and clause[-1] in edge_symbols:
            clause = clause[:-1]
            if clause.endswith(" "):
                clause = clause[:-1]

        has_next = i+1 < len(splitrow)
        if clause == '':
            # 0. nothing left after trimming
            pass
        elif has_next and len(clause) > 1 and clause[-2:] == "..":
            # 1. an ellipsis split the sentence: glue the next clause back on
            realtext.append(clause + " " + splitrow[i+1].strip())
            i += 1
        elif has_next and clause[-1] in [".", ","] and splitrow[i+1][:1].isdigit():
            # 2. decimal point or thousands separator inside a number
            realtext.append(clause + splitrow[i+1].strip())
            i += 1
        elif has_next and splitrow[i+1].strip() in lone_marks:
            # 3. repeated punctuation separated by spaces
            merged = clause
            while i+1 < len(splitrow) and splitrow[i+1].strip() in lone_marks:
                merged += splitrow[i+1]
                i += 1
            realtext.append(merged)
        else:
            realtext.append(clause)
        i += 1
    return realtext


def _trim_to_word_budget(segments, limit=300):
    """Drop the longest segment (by characters) until at most ``limit`` words remain."""
    while sum(len(re.split(" ", segment)) for segment in segments) > limit:
        segments.remove(max(segments, key=len))
    return segments


# Training-set preprocessing
def preprocessTrainSet(mode):
    """Filter the raw training CSV into ``training_set_Q.csv`` / ``training_set_R.csv``.

    Reads ``trainingset_name`` and, for every data row, cleans the text in
    column 1 (mode "q") or column 2 (mode "r"): the surrounding quote characters
    are cut off, space-tokenised URLs are removed, and the text is split with
    ``sep_sentence_regex`` and repaired. Rows that end up with a single clause,
    with two clauses totalling fewer than 15 words, or with no clause longer
    than 6 words are skipped. For the remaining rows the cleaned text (trimmed
    to at most 300 words) is written with the row id and the labelled answer
    from column 4 ("q") or column 5 ("r").

    Args:
        mode: "q" or "r".

    Raises:
        ValueError: if ``mode`` is neither "q" nor "r".
    """
    if mode == "q":
        writeFile = trainingset_Q_name
        title1 = 1
        title2 = 4
    elif mode == "r":
        writeFile = trainingset_R_name
        title1 = 2
        title2 = 5
    else:
        raise ValueError("mode must be 'q' or 'r', got %r" % (mode,))

    # Both files are closed by the with-statement; no explicit close() needed.
    with open(trainingset_name, 'r', encoding='utf-8') as f, \
            open(writeFile, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(f)
        writer = csv.writer(outfile)

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            if row[0] == "id":
                writer.writerow([row[0], 'sentence', 'sub_sentence'])
                continue

            # Cut off the surrounding double quotes, then remove every URL.
            text = _strip_spaced_urls(row[title1][1:-1])

            # Split into clauses and drop the empty strings the split produces.
            splitrow = [piece for piece in re.split(sep_sentence_regex, text) if piece != '']
            realtext = _merge_clauses(splitrow)

            # A single clause: nothing to select from.
            if len(realtext) == 1:
                continue
            # Two clauses but a very short text: skipped as well.
            if len(realtext) == 2 and len(re.split(' ', realtext[0] + realtext[1])) < 15:
                continue
            # Only rows that contain at least one clause longer than 6 words
            # become training data.
            if not any(len(re.split(' ', clause)) > 6 for clause in realtext):
                continue

            # Re-split on sentence-level punctuation only (no comma) and shorten
            # over-long texts.
            temp = [piece for piece in re.split('([^?!.:;]*[?!.:;]+)', "".join(realtext))
                    if piece != '']
            ans = "".join(_trim_to_word_budget(temp))

            # Write the training row.
            writer.writerow([row[0], ans, row[title2][1:-1]])

# Build the filtered training CSVs, once for the q column and once for the r column.
preprocessTrainSet("q")
preprocessTrainSet("r")

In [None]:
def training_set_pre_classification(filename):
  """Turn a filtered training CSV into a (sentence, clause, class) TSV.

  The first three columns of ``filename`` are used as id, sentence and labelled
  answer. Rows with a missing id or a missing / empty / '0' sentence are
  dropped, as are sentences with more than 15 clauses; 60% of the remainder is
  sampled with a fixed seed. For every id, each clause of the sentence is
  counted once per labelled answer that contains it, and the count is divided
  by ``count`` (which starts at 1 and grows by one per hit). That ratio is
  bucketed into classes 0-5. The result is written next to the input as
  ``<name>_classification.tsv`` and its first 10 rows are printed.

  Args:
    filename: CSV produced by ``preprocessTrainSet``.
  """
  df_train = pd.read_csv(filename)
  # Column labels come from the frame itself rather than from iloc[0], which
  # raises on a file that holds only a header.
  header = list(df_train.columns)
  id_col, sent_col, sub_col = header[0], header[1], header[2]

  empty_title = (df_train[id_col].isnull()
                 | df_train[sent_col].isnull()
                 | (df_train[sent_col] == '')
                 | (df_train[sent_col] == '0'))
  df_train = df_train[~empty_title]

  # Filter by number of clauses.
  MAX_LENGTH = 15
  clause_counts = df_train[sent_col].apply(lambda x: len(getSubsentences(x)))
  df_train = df_train[~(clause_counts > MAX_LENGTH)]

  # Random sample used as training data.
  SAMPLE_FRAC = 0.6
  df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=5180)

  # TrainMap layout:
  #   { rowID: { 'sent': str, 'count': int, 'subsent': { clause: hits, ... } } }
  TrainMap = dict()
  # Iterate the three columns directly instead of materialising a row Series
  # with iloc on every access.
  for rowID, sent, labelled in zip(df_train[id_col], df_train[sent_col], df_train[sub_col]):
    entry = TrainMap.get(rowID)
    if entry is None:
      entry = {
          'sent': sent,
          'subsent': {sub: 0 for sub in getSubsentences(sent)},
          'count': 1,
      }
      TrainMap[rowID] = entry
    for sub in getSubsentences(labelled):
      if sub in entry['subsent']:
        entry['subsent'][sub] += 1
        entry['count'] += 1

  # Shorten sentences that, together with one of their clauses, exceed 300
  # words: keep the first 50 and the last 50 words.
  for entry in TrainMap.values():
    words = getWords(entry['sent'])
    for sub in entry['subsent']:
      if len(words) + len(getWords(sub)) > 300:
        entry['sent'] = " ".join(words[:50] + words[-50:])
        words = getWords(entry['sent'])

  # Output columns: sent, subsent, freq.
  df2_data = {"sent": [], "subsent": [], "freq": []}

  for entry in TrainMap.values():
    # Ids whose clauses never appear in a labelled answer contribute nothing.
    if sum(entry['subsent'].values()) == 0:
      continue
    for subq, hits in entry['subsent'].items():
      df2_data["sent"].append(entry["sent"])
      df2_data["subsent"].append(subq)
      freq = float(hits / entry["count"])
      # Bucket the 0-1.0 ratio into classes 0-5.
      if freq >= 0.75:
          freq = 5
      elif freq >= 0.50:
          freq = 4
      elif freq >= 0.25:
          freq = 3
      elif freq >= 0.10:
          freq = 2
      elif freq > 0:
          freq = 1
      else:
          freq = 0
      df2_data["freq"].append(freq)

  df2 = pd.DataFrame(df2_data)
  df2.to_csv(filename.split(".")[0] + "_classification.tsv", sep="\t", index=False)
  print(df2.head(10))

  

# Produce training_set_Q_classification.tsv and training_set_R_classification.tsv.
training_set_pre_classification(trainingset_Q_name)
training_set_pre_classification(trainingset_R_name)

# Bert fine tune training



In [None]:
# 

In [None]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：兩個句子合併後的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""
# Shared WordPiece tokenizer matching the pretrained checkpoint named in
# `bert_pretrained_model`.
tokenizer = BertTokenizer.from_pretrained(bert_pretrained_model) 
    
class SentenceSimilarityDataset(Dataset):  # subclass of torch's Dataset
    """Dataset that turns one TSV row (sentence, clause[, class]) into BERT inputs.

    Each item is a 3-tuple:
      - tokens_tensor: ids of ``[CLS] sentence [SEP] clause [SEP]``
      - segments_tensor: 0 for the first segment (including its [SEP]), 1 for
        the second
      - label tensor built from the third column in "train" mode, ``None`` in
        "test" mode
    """

    def __init__(self, mode, filename, max_length=512):
        """Load ``filename`` (tab-separated) and remember the module-level tokenizer.

        Args:
            mode: "train" (rows carry a class in the third column) or "test".
            filename: Path of the TSV file; missing cells are replaced by "".
            max_length: Upper bound on the token sequence length including the
                three special tokens; ``None`` disables truncation. The default
                is 512, the position-embedding size of bert-base checkpoints —
                longer inputs would fail inside the model.
        """
        assert mode in ["train", "test"]  # any other mode is a programming error
        self.mode = mode
        self.df = pd.read_csv(filename, sep="\t").fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_or_None)`` for row ``idx``."""
        if self.mode == "test":
            text_sentence, text_subs = self.df.iloc[idx, :2].values
            tensor_freq_subs = None
        else:
            text_sentence, text_subs, freq_subs = self.df.iloc[idx, :].values
            tensor_freq_subs = torch.tensor(freq_subs)

        # str() guards against cells that pandas parsed as numbers.
        tokens_sentence = self.tokenizer.tokenize(str(text_sentence))
        tokens_subs = self.tokenizer.tokenize(str(text_subs))

        # Keep the pair within max_length. The clause is the thing being
        # classified, so it is kept first and the context sentence is cut.
        if self.max_length is not None:
            room = max(self.max_length - 3, 0)  # [CLS] + 2 x [SEP]
            tokens_subs = tokens_subs[:room]
            tokens_sentence = tokens_sentence[:room - len(tokens_subs)]

        # [CLS] + first sentence + [SEP]
        first_segment = ["[CLS]"] + tokens_sentence + ["[SEP]"]
        # second sentence + [SEP]
        second_segment = tokens_subs + ["[SEP]"]

        # Convert the whole token sequence into vocabulary ids.
        ids = self.tokenizer.convert_tokens_to_ids(first_segment + second_segment)
        tokens_tensor = torch.tensor(ids)

        # 0 marks the first segment (with its [SEP]), 1 marks the second.
        segments_tensor = torch.tensor(
            [0] * len(first_segment) + [1] * len(second_segment), dtype=torch.long)

        return (tokens_tensor, segments_tensor, tensor_freq_subs)

    def __len__(self):
        """Number of rows in the TSV file."""
        return self.len
    
    
# Training Dataset built from the q-column classification TSV; text is
# tokenized with the tokenizer loaded for `bert_pretrained_model`.
trainset = SentenceSimilarityDataset("train", trainingset_Q_name.split(".")[0] + "_classification.tsv")

In [None]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `SentenceSimilarityDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `SentenceSimilarityDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors

def create_mini_batch(samples):
    """Collate dataset samples into one zero-padded mini-batch.

    Args:
        samples: List of ``(tokens_tensor, segments_tensor, label_or_None)``
            tuples.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``. The
        first three are padded to the longest sequence in the batch;
        ``masks_tensors`` is 1 wherever the token id is non-zero and 0 on
        padding. ``label_ids`` is a stacked tensor, or ``None`` when the first
        sample carries no label.
    """
    token_list, segment_list, label_list = [], [], []
    for tokens, segments, label in samples:
        token_list.append(tokens)
        segment_list.append(segments)
        label_list.append(label)

    # Only labelled samples (the first sample decides) produce a label tensor.
    label_ids = None if samples[0][2] is None else torch.stack(label_list)

    # Zero-pad every sequence to the batch maximum.
    tokens_tensors = pad_sequence(token_list, batch_first=True)
    segments_tensors = pad_sequence(segment_list, batch_first=True)

    # Attention mask: 1 on real tokens, 0 on padding (token id 0).
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# DataLoader that yields BATCH_SIZE (32) training samples per batch.
# `collate_fn` is what merges a list of samples into one mini-batch.
BATCH_SIZE = 32
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [None]:
# Pretrained BERT with a sequence-classification head; the 6 labels match the
# frequency classes 0-5 written to the classification TSV.
model = BertForSequenceClassification.from_pretrained(
    bert_pretrained_model, num_labels = 6) 

# High-level listing of the modules inside the model: the children of "bert"
# are printed by name only, every other child is printed in full.
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))

In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and return the predicted class per sample.

    The model is put into evaluation mode for the duration of the call, so
    dropout does not perturb the predictions, and its previous train/eval mode
    is restored afterwards. That matters because this function is also called
    between training epochs.

    Args:
        model: Module called as ``model(input_ids=, token_type_ids=,
            attention_mask=)`` whose first output holds the logits.
        dataloader: Iterable of batches ``(tokens, segments, masks, labels)``;
            ``labels`` may be ``None`` when ``compute_acc`` is False.
        compute_acc: When True, also compare predictions with the labels.

    Returns:
        ``predictions`` — a 1-D tensor of class indices, or ``None`` if the
        loader yielded no batch — or ``(predictions, acc)`` when
        ``compute_acc`` is True. ``acc`` is 0.0 for an empty loader.
    """
    predictions = None
    correct = 0
    total = 0

    # Send every batch to wherever the model's parameters live.
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in dataloader:
                data = [t.to(device) for t in data if t is not None]

                # The first 3 tensors are tokens, segments and masks. The mask
                # is passed as (batch, seq_len), exactly as the training loop
                # passes it.
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]
                _, pred = torch.max(logits.data, 1)

                # Accuracy bookkeeping.
                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                # Accumulate the predictions of this batch.
                if predictions is None:
                    predictions = pred
                else:
                    predictions = torch.cat((predictions, pred))
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions
    
# Move the model to the GPU when one is available and measure the
# classification accuracy on the training set before any fine-tuning.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

In [None]:
def get_learnable_params(module):
    """Return the parameters of ``module`` that have ``requires_grad`` set."""
    return list(filter(lambda param: param.requires_grad, module.parameters()))
     
# Trainable parameters of the whole model and of the linear classification head.
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

# Prints the two element counts (labels in the output are in Chinese).
print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")

In [None]:
%%time

# Switch to training mode.
model.train()

# Adam updates the parameters of the whole classification model.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Number of passes over the training set (an earlier run used 14).
EPOCHS = 5  # "lucky number"
for epoch in range(EPOCHS):
    
    # Sum of the batch losses of this epoch (not averaged).
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # Reset the parameter gradients.
        optimizer.zero_grad()
        # Forward pass; passing `labels` makes the model return the loss first.
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        loss = outputs[0]
        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()


        # Record the loss of the current batch.
        running_loss += loss.item()
        
    # Classification accuracy on the training set after this epoch.
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))
    

## epoch 14 batch 32
"""
[epoch 1] loss: 260.749, acc: 0.341
[epoch 2] loss: 219.299, acc: 0.374
[epoch 3] loss: 204.052, acc: 0.430
[epoch 4] loss: 189.664, acc: 0.498
[epoch 5] loss: 174.917, acc: 0.542
[epoch 6] loss: 159.116, acc: 0.587
[epoch 7] loss: 143.521, acc: 0.656
[epoch 8] loss: 130.123, acc: 0.612
[epoch 9] loss: 118.352, acc: 0.693
[epoch 10] loss: 108.212, acc: 0.791
[epoch 11] loss: 94.720, acc: 0.797
[epoch 12] loss: 80.216, acc: 0.837
[epoch 13] loss: 70.120, acc: 0.869
[epoch 14] loss: 59.383, acc: 0.895
CPU times: user 13min 15s, sys: 4min 29s, total: 17min 44s
Wall time: 18min 2s



_____________batch 32___________________________
[epoch 1] loss: 949.114, acc: 0.513
[epoch 2] loss: 858.487, acc: 0.538
[epoch 3] loss: 791.536, acc: 0.565
[epoch 4] loss: 694.887, acc: 0.624
[epoch 5] loss: 583.935, acc: 0.666
"""

In [None]:
# Pickle the whole fine-tuned model object to `torch_model_output` ('model.dat').
# NOTE(review): testing() loads './model_q.dat' and './model_r.dat'; this cell
# never writes files under those names — presumably the saved file is renamed
# by hand after training on each column. Confirm.
torch.save(model, torch_model_output)

# Answer Preprocessing

In [None]:
# 要輸出答案時才需要執行這個區段

In [None]:
def deleteHTTP(row):
    """Remove space-tokenised URLs (``http : //host/...``) from ``row``.

    The text is split on single spaces. A URL is the token triple ``http``,
    ``:`` and a token starting with ``//``; when the token right after it is
    ``...`` or ``?``, that token and the one following it are removed too.

    Args:
        row: Text to clean.

    Returns:
        The remaining tokens joined by single spaces. Empty tokens at the very
        start are dropped, so leading spaces do not survive.
    """
    spaceSplit = re.split(' ', row)
    ignored = set()
    for i in range(len(spaceSplit)):
        # The i+2 bound keeps an "http" token near the end of the text from
        # indexing past the list.
        if (i + 2 < len(spaceSplit) and spaceSplit[i] == "http"
                and spaceSplit[i+1] == ":" and spaceSplit[i+2][0:2] == "//"):
            if i+3 < len(spaceSplit) and spaceSplit[i+3] in ["...", "?"]:
                ignored.update((i+3, i+4))
            ignored.update((i, i+1, i+2))

    ans = ""
    for i, token in enumerate(spaceSplit):
        if i not in ignored:
            # No separator while the result is still empty.
            if ans != "":
                ans += ' '
            ans += token

    return ans


def generateAns(row):
    """Pick a rule-based answer out of ``row``.

    The text is cut into clauses on ``? ! , . : ;``, clauses are trimmed and
    wrongly split ones are re-joined. A single clause, or two clauses totalling
    fewer than 15 words, are returned whole. Otherwise clauses are collected in
    order until the first one that counts as a full sentence, which ends the
    answer.

    Args:
        row: Text with URLs already removed.

    Returns:
        ``[ans, writerow]``: ``ans`` is the selected answer with unwanted
        leading/trailing symbols and digits removed; ``writerow`` is the
        re-joined clause text (reduced to 10 clauses when there were 11-19),
        or "" if no terminating clause was found.
    """
    # Everything up to and including one of these markers is discarded.
    superKeys = ["eg . "]
    for key in superKeys:
        findIdx = row.find(key)
        if findIdx != -1:
            row = row[findIdx+len(key):]

    Sentence_Min_length = 5
    edge_symbols = ["`", "-", "(", ")", "#"]
    closers = ["?", "!", "."]

    # Split into clauses and drop the empty strings the split produces.
    splitrow = [piece for piece in re.split('([^?!,.:;]*[?!,.:;]+)', row) if piece != '']

    ans = ""
    writerow = ""
    realtext = list()

    i = 0
    while i < len(splitrow):
        clause = splitrow[i]
        # Strip unwanted leading/trailing symbols. startswith/endswith are safe
        # when the clause has just become empty, unlike indexing [0] / [-1].
        while clause != '' and clause[0] in edge_symbols:
            clause = clause[1:]
            if clause.startswith(" "):
                clause = clause[1:]
        while clause != '' and clause[-1] in edge_symbols:
            clause = clause[:-1]
            if clause.endswith(" "):
                clause = clause[:-1]

        has_next = i+1 < len(splitrow)
        if clause == '':
            # 0. nothing left after trimming
            pass
        elif has_next and len(clause) > 1 and clause[-2:] == "..":
            # 1. an ellipsis split the sentence: glue the next clause back on
            realtext.append(clause + " " + splitrow[i+1].strip())
            i += 1
        elif has_next and clause[-1] in [".", ","] and splitrow[i+1][:1].isdigit():
            # 2. decimal point or thousands separator inside a number
            realtext.append(clause + splitrow[i+1].strip())
            i += 1
        elif has_next and splitrow[i+1].strip() in closers:
            # 3. repeated punctuation separated by spaces
            merged = clause
            while i+1 < len(splitrow) and splitrow[i+1].strip() in closers:
                merged += splitrow[i+1]
                i += 1
            realtext.append(merged)
        else:
            realtext.append(clause)
        i += 1

    if len(realtext) == 1:
        # A single clause: the whole text is the answer.
        ans = "".join(realtext)
        writerow = "".join(realtext)
    elif len(realtext) == 2 and len(re.split(' ', realtext[0] + realtext[1])) < 15:
        # Two clauses but a very short text: the whole text is the answer.
        ans = ''.join(realtext)
        writerow = "".join(realtext)
    else:
        for sentence in realtext:
            n_words = len(re.split(' ', sentence))
            last = sentence[-1]

            if n_words <= Sentence_Min_length:
                # Short phrase: dropped when it ends in a pause mark, otherwise
                # appended without a separating space.
                if last not in [",", ":", ";"]:
                    ans += sentence.strip()
                continue

            if ans != "":
                ans += " "
            ans += sentence.strip()

            # Medium-length clauses that end in ! ? : or , keep the answer open.
            keeps_going = ((n_words <= 7 and last == "!")
                           or (n_words <= 10 and last in ["?", ":"])
                           or (last == "," and n_words <= 17))
            if keeps_going:
                continue

            # A full sentence ends the answer.
            if len(realtext) > 10 and len(realtext) < 20:
                # Reduce to 10 clauses by repeatedly dropping the clause with
                # the most words (the first one on ties).
                while len(realtext) > 10:
                    longest = max(range(len(realtext)),
                                  key=lambda k: len(re.split(' ', realtext[k])))
                    del realtext[longest]
            writerow = "".join(realtext)
            break

    # Remove unwanted leading symbols and numbering.
    while ans != "" and ans[0] in [".", "Â", "`", "-", "(", ")", "#", "‘", "’", "“", "”", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"]:
        ans = ans[1:].strip()
    # Remove unwanted trailing symbols and digits; the full stop must stay.
    while ans != "" and ans[-1] in ["Â", "`", "-", "(", ")", "#", "‘", "’", "“", "”", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"]:
        ans = ans[:-1].strip()

    return [ans, writerow]



# 區分資料
def generatePreProcessDF(fileDatafram):
  """Write the rule-based answers and the model inputs for a test set.

  Two CSV files with the header ``id,q,r,s`` are produced in the working
  directory:

  * ``ans_output.csv`` — for each id, the answers chosen by ``generateAns`` for
    q and r (falling back to the unquoted original text when the answer is
    empty).
  * ``test_output.csv`` — for each id, the unquoted q and r, cut to their first
    and last 50 characters when they contain more than 100 space-separated
    tokens.

  A row whose id equals the id of the row directly before it is skipped.

  Args:
    fileDatafram: DataFrame with the columns ``id``, ``q``, ``r`` and ``s``.
  """
  def _unquote(value):
    # Drop the surrounding quote characters; a missing cell (NaN) counts as empty.
    return value[1:-1] if isinstance(value, str) else ""

  test_data = fileDatafram
  # Both files are closed by the with-statement.
  with open('ans_output.csv', 'w', encoding='utf-8', newline='') as outfile, \
       open('test_output.csv', 'w', encoding='utf-8', newline='') as outfile2:
      writer = csv.writer(outfile)
      writer2 = csv.writer(outfile2)

      writer.writerow(["id", 'q', 'r', 's'])
      writer2.writerow(["id", 'q', 'r', 's'])

      # None (rather than 0) as the start value, so a real first id of 0 is
      # not mistaken for a repeat.
      previous_id = None
      for rowID, rawQ, rawR, rowS in zip(test_data["id"], test_data["q"],
                                         test_data["r"], test_data["s"]):
          # Avoid handling the same id twice in a row.
          if previous_id is not None and previous_id == rowID:
              continue
          previous_id = rowID

          # Remove the surrounding double quotes.
          rowQ = _unquote(rawQ)
          rowR = _unquote(rawR)

          # Remove URLs, then pick the rule-based answers. The second value
          # returned by generateAns is not used here.
          ansQ, _ = generateAns(deleteHTTP(rowQ))
          ansR, _ = generateAns(deleteHTTP(rowR))

          # Never emit an empty answer: fall back to the unquoted text.
          if ansQ == "":
              ansQ = rowQ
          if ansR == "":
              ansR = rowR

          writer.writerow([rowID, ansQ, ansR, rowS])

          # NOTE(review): the condition counts words but the slices cut
          # characters (first 50 + last 50, joined without a space). Kept
          # as-is because the model inputs depend on it — confirm the intent.
          if len(rowQ.split(" ")) > 100:
            rowQ = rowQ[:50] + rowQ[-50:]
          if len(rowR.split(" ")) > 100:
            rowR = rowR[:50] + rowR[-50:]
          writer2.writerow([rowID, rowQ, rowR, rowS])

# Predicting

In [None]:
def getAnwser() -> dict:
  """Load the labelled answers of the training file, grouped by id.

  Rows with a missing id or a missing / empty / '0' ``q`` are ignored. The
  first and last character of every ``q'`` and ``r'`` value are cut off.

  Returns:
    ``{rowID: [{"q'": str, "r'": str}, ...]}`` with one list entry per
    remaining row of that id, in file order.
  """
  df = pd.read_csv("./Batch_answers - train_data (no-blank).csv")
  unusable = (df['id'].isnull()
              | df['q'].isnull()
              | (df['q'] == '')
              | (df['q'] == '0'))
  df = df[~unusable]

  AnwserMap = dict()
  columns = zip(df['id'].to_numpy(), df["q'"].to_numpy(), df["r'"].to_numpy())
  for rowID, gold_q, gold_r in columns:
    cases = AnwserMap.setdefault(rowID, list())
    cases.append({"q'": gold_q[1:-1], "r'": gold_r[1:-1]})

  return AnwserMap


def lcs(X, Y):
  """Return the length of the longest common subsequence of ``X`` and ``Y``.

  Row-by-row dynamic programme: ``previous[j]`` holds the LCS length of the
  part of ``X`` handled so far and the first ``j`` items of ``Y``.
  """
  width = len(Y)
  previous = [0] * (width + 1)
  for x_item in X:
    current = [0]
    for j in range(1, width + 1):
      if x_item == Y[j-1]:
        current.append(previous[j-1] + 1)
      else:
        current.append(max(previous[j], current[j-1]))
    previous = current
  return previous[width]


def Scoring(ans, id, q, r) -> float:
  """Score a predicted (q, r) pair against the labelled answers of one id.

  For each labelled case the score is the mean of two ratios
  ``LCS / (len(reference) + len(prediction) - LCS)``, one for q and one for r;
  the best score over all cases is returned.

  Args:
    ans: Mapping ``{id: [{"q'": str, "r'": str}, ...]}``.
    id: Id to look up in ``ans``.
    q: Predicted q answer.
    r: Predicted r answer.

  Returns:
    The best score in [0, 1], or 0 if ``id`` has no labelled cases.
  """
  def _overlap(reference, predicted):
    common = lcs(reference, predicted)
    union = len(reference) + len(predicted) - common
    # Both strings empty: they are identical, and dividing would raise
    # ZeroDivisionError.
    if union == 0:
      return 1.0
    return common / union

  if not id in ans.keys():
    return 0
  maxScore = 0
  for case in ans[id]:
    score = 0.5 * (_overlap(case["q'"], q) + _overlap(case["r'"], r))
    if score > maxScore:
      maxScore = score
  return maxScore
# Labelled answers grouped by id; used by testandscore() to score predictions.
ans_dict = getAnwser()


In [None]:
def _predict_subsentences(df, column, model_path, tsv_path):
  """Classify every clause of ``df[column]`` with the model stored at ``model_path``.

  The (text, clause) pairs are written to ``tsv_path`` and read back through
  ``SentenceSimilarityDataset`` in "test" mode.

  Returns:
    ``(ids, clauses, predictions)`` — three parallel lists.
  """
  ids, texts, clauses = [], [], []
  for row_id, text in zip(df["id"].to_numpy(), df[column].to_numpy()):
    # getSubsentences returns [] for non-string cells (e.g. NaN).
    for clause in getSubsentences(text):
      ids.append(row_id)
      texts.append(text)
      clauses.append(clause)
  pd.DataFrame({column: texts, column + "'": clauses}).to_csv(tsv_path, sep="\t", index=False)

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  model = torch.load(model_path, map_location=device)
  testset = SentenceSimilarityDataset("test", tsv_path)
  testloader = DataLoader(testset, batch_size=10, collate_fn=create_mini_batch)
  predictions = get_predictions(model, testloader)
  torch.cuda.empty_cache()
  # get_predictions returns None when there was nothing to predict.
  return ids, clauses, ([] if predictions is None else predictions.tolist())


def _assemble_answer(row_id, raw_text, base_answer, positions, clauses, predictions):
  """Build the quoted answer for one row from the rule-based answer and the predictions."""
  # An empty answer comes back from the CSV as NaN.
  answer = base_answer if isinstance(base_answer, str) else ""
  hits = positions.get(row_id)
  if hits:
    taken = 0
    for j in hits:
      # Only clauses predicted above class 4 are appended, at most 4 of them.
      # They are appended directly after the rule-based answer, without a
      # separator.
      if predictions[j] > 4:
        answer += clauses[j] + " "
        taken += 1
        if taken > 3:
          break
  elif answer == "":
    # No clause was classified for this id: fall back to the leading clauses
    # until roughly 50 characters are collected.
    for clause in getSubsentences(raw_text):
      if len(answer) < 50:
        answer += clause.strip() + " "
      else:
        break
  # str.strip() returns a new string; its result has to be used.
  return '"' + answer.strip() + '"'


def testing():
  """Predict q' and r' for ``test_output.csv`` and write them to ``ans.csv``.

  Clauses of q are classified with ``./model_q.dat`` and clauses of r with
  ``./model_r.dat``. For every row the rule-based answer from
  ``ans_output.csv`` is extended with the clauses predicted above class 4.
  ``ans.csv`` has the columns ``id``, ``q``, ``r``; each answer is wrapped in
  double quotes.
  """
  df = pd.read_csv("test_output.csv").loc[:, ["id", "q", "r", 's']]

  ## predict testcases q' and r'
  ids_q, clauses_q, predictions_q = _predict_subsentences(df, "q", './model_q.dat', "test_q.tsv")
  ids_r, clauses_r, predictions_r = _predict_subsentences(df, "r", './model_r.dat', "test_r.tsv")

  # id -> positions of its clauses, so each row does not rescan the whole list
  positions_q, positions_r = {}, {}
  for j, row_id in enumerate(ids_q):
    positions_q.setdefault(row_id, []).append(j)
  for j, row_id in enumerate(ids_r):
    positions_r.setdefault(row_id, []).append(j)

  ## rule-based answers
  pre_ans = pd.read_csv("ans_output.csv")
  pre_ans_dict = dict()
  for row_id, pre_q, pre_r in zip(pre_ans["id"].to_numpy(), pre_ans["q"].to_numpy(),
                                  pre_ans["r"].to_numpy()):
    pre_ans_dict[row_id] = {"q": pre_q, "r": pre_r}

  ## merge id q r
  ans_id = list()
  ans_q  = list()
  ans_r  = list()
  for row_id, raw_q, raw_r in zip(df["id"].to_numpy(), df["q"].to_numpy(), df["r"].to_numpy()):
    base = pre_ans_dict.get(row_id, {})
    ans_id.append(row_id)
    ans_q.append(_assemble_answer(row_id, raw_q, base.get("q", ""),
                                  positions_q, clauses_q, predictions_q))
    ans_r.append(_assemble_answer(row_id, raw_r, base.get("r", ""),
                                  positions_r, clauses_r, predictions_r))

  dfans = pd.DataFrame(dict({"id":ans_id, "q":ans_q, "r":ans_r}))
  dfans.to_csv("ans.csv", sep=",", index=False)


In [None]:
import random
def testandscore(mode):
  """Run the full prediction pipeline and, unless ``mode`` is "test", score it.

  The test file is preprocessed with ``generatePreProcessDF`` and predicted
  with ``testing``. In any mode other than "test" the answers in ``ans.csv``
  and ``ans_output.csv`` are scored against ``ans_dict``, the two mean scores
  are printed, and the per-row scores of ``ans_output.csv`` are written to
  ``ansout_with_score_<case number>.csv``.

  Args:
    mode: "test" to stop after ``ans.csv`` has been written; anything else
      also scores the result.
  """
  for case_no in range(1, 2):
    filename = "./Batch_answers - test_data(no_label).csv"
    dataframe = pd.read_csv(filename)
    generatePreProcessDF(dataframe)
    testing()

    if mode == "test":
      return
    print("Test Cases", case_no, ":", end="  \n")

    # Model-assisted answers: stored with surrounding quotes, hence [1:-1].
    output = pd.read_csv("ans.csv")
    total = 0.0
    for row_id, q, r in zip(output["id"], output["q"], output["r"]):
      total += Scoring(ans_dict, int(row_id), str(q[1:-1]), str(r[1:-1]))
    n_rows = output.shape[0]
    print("\tans:", total / n_rows if n_rows else 0.0, end="  \n")

    # Rule-based answers: stored without surrounding quotes.
    output = pd.read_csv("ans_output.csv")
    outfile_score = list()
    for row_id, q, r in zip(output["id"], output["q"], output["r"]):
      outfile_score.append(Scoring(ans_dict, int(row_id), str(q), str(r)))
    n_rows = output.shape[0]
    print("\tans_output:", sum(outfile_score, 0.0) / n_rows if n_rows else 0.0, end="  \n")

    df_out = pd.DataFrame(dict({
        "id": list(output["id"]),
        "q": list(output["q"]),
        "r": list(output["r"]),
        "score": outfile_score
    }))
    # The file is named after the test-case number. The row loops use their
    # own variables so they cannot overwrite it.
    df_out.to_csv("ansout_with_score_" + str(case_no) + ".csv", sep=",", index=False)

# "test" mode: write ans.csv only and skip the scoring against the labelled answers.
testandscore("test")