# Data Preprocessing for the Fine-Tuning Datasets

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os
import gc
from typing import Tuple, Dict
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import datasets
import json
import shutil
from collections import Counter
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm
import torchsummary
import Levenshtein
import numpy as np
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# To force CPU execution, uncomment the following line:
# DEVICE = 'cpu'
print(DEVICE)

cuda


# Tokenizer

In [3]:
import re
import os
from collections import Counter

class UpgradeTokenizer2:
    """Lower-casing word/punctuation tokenizer backed by a ``dict`` vocabulary.

    ``vocab`` maps token strings to integer ids.  It starts with the five
    special tokens ``[PAD]``, ``[UNK]``, ``[CLS]``, ``[SEP]`` and ``[MASK]``
    (ids 0-4) and can be extended with :meth:`build_vocab` or replaced
    wholesale by assigning to ``self.vocab``.

    Args:
        max_vocab_size: Upper bound on ``len(self.vocab)`` enforced by
            :meth:`build_vocab` (special tokens included).
        punctuations: Tokens that :meth:`convert_tokens_to_string` attaches
            to the preceding text without a leading space.  ``None`` selects
            the default set ``. , ! ? : ; - ( )``.
    """

    # A word (optionally containing one apostrophe, e.g. "levi's"), or any
    # single character that is neither a word character nor whitespace.
    _TOKEN_PATTERN = re.compile(r"\b\w+'?\w*|[^\w\s]")
    _DEFAULT_PUNCTUATIONS = ('.', ',', '!', '?', ':', ';', '-', '(', ')')

    def __init__(self, max_vocab_size, punctuations=None):
        self.vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}
        self.mask_token = '[MASK]'

        self.max_vocab_size = max_vocab_size
        # A fresh list per instance: a list literal in the signature would be
        # shared (and mutable) across every tokenizer using the default.
        if punctuations is None:
            punctuations = list(self._DEFAULT_PUNCTUATIONS)
        self.punctuations = punctuations

    def custom_tokenize(self, text):
        """Lower-case *text* and split it into word and punctuation strings."""
        return self._TOKEN_PATTERN.findall(text.lower())

    def build_vocab(self, corpus):
        """Add the most frequent words of *corpus* until the vocab is full.

        Args:
            corpus: Iterable of strings.

        Words are added in descending frequency order (ties keep first-seen
        order) until ``len(self.vocab)`` reaches ``max_vocab_size``.  Words
        already present keep their existing id, so calling this method more
        than once never produces two words with the same id.
        """
        word_counts = Counter(
            word for sentence in corpus for word in self.custom_tokenize(sentence)
        )
        for word, _ in word_counts.most_common():
            if len(self.vocab) >= self.max_vocab_size:
                break
            if word not in self.vocab:
                self.vocab[word] = len(self.vocab)

    def tokenize(self, text):
        """Return the vocab ids of *text*; unknown words map to ``[UNK]``."""
        return [self.vocab.get(word, self.vocab['[UNK]']) for word in self.custom_tokenize(text)]

    def convert_tokens_to_string(self, tokens):
        """Turn a sequence of vocab ids back into a readable string.

        Args:
            tokens: Iterable of ids present in ``self.vocab``.

        Returns:
            The words joined by single spaces, except that entries of
            ``self.punctuations`` are appended without a preceding space.

        Raises:
            ValueError: If an id does not occur in ``self.vocab``.
        """
        # Build the reverse mapping once per call instead of a linear search
        # per token.  When several words share an id, the first one in the
        # vocab wins, which matches a front-to-back search of the values.
        id_to_word = {}
        for word, token_id in self.vocab.items():
            id_to_word.setdefault(token_id, word)

        try:
            words = [id_to_word[token] for token in tokens]
        except (KeyError, TypeError) as err:
            raise ValueError(f"token id is not in the vocabulary: {err}") from err

        pieces = []
        tail = ''  # last non-empty piece written; '' while the output is empty
        for word in words:
            if word not in self.punctuations and tail and not tail.endswith(' '):
                pieces.append(' ')
                tail = ' '
            pieces.append(word)
            if word:
                tail = word
        return ''.join(pieces)

# Shared tokenizer instance used by the preprocessing cells below; its vocab
# is later replaced by the vocabulary loaded from the JSON file.
tokenizer = UpgradeTokenizer2(max_vocab_size=60010)

def read_corpus(folder_path):
    """Read every ``.txt`` file directly inside *folder_path* as UTF-8 text.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with the full contents of each ``.txt`` file, ordered by file
        name so that repeated runs yield the same corpus order.
    """
    corpus = []
    # os.listdir() returns names in arbitrary order; sort them so anything
    # derived from the corpus order is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        # Skip directories that merely happen to be named "*.txt".
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus.append(file.read())
    return corpus

### Load the pre-trained VOCAB

In [4]:
# Load the token -> id mapping from JSON.
vocab_file = 'vocab60000-latest.json'
with open(vocab_file, 'r') as f:
    VOCAB = json.load(f)

# The tokenizer now shares this dict, so later additions to VOCAB are visible
# through tokenizer.vocab as well.
tokenizer.vocab = VOCAB

### Add new VOCAB

In [45]:
# Task-specific special tokens for fine-tuning. Every token needs its own id:
# two tokens sharing an id cannot be told apart after encoding.
VOCAB['[ARTICLE]'] = 60000
VOCAB['[SUMMARY]'] = 60001
VOCAB['[CONTEXT]'] = 60002
VOCAB['[QUESTION]'] = 60003
VOCAB['[ANSWER]'] = 60004
VOCAB['[UNANSWERABLE]'] = 60005  # was 60004, which collided with [ANSWER]


## Write the new VOCAB to a JSON file

In [14]:
# Persist the extended vocabulary (including the special tokens added above)
# under a separate file name so the original vocab file is left untouched.
vocab_file_fine = 'vocab60000-latest_fine.json'
with open(vocab_file_fine, 'w') as f:
    json.dump(VOCAB, f)

### Check VOCAB

In [6]:
# Look the token up by its id. Enumerating the dict would yield the insertion
# position of each key, which equals the id only if the ids are contiguous and
# stored in order.
for token, token_id in tokenizer.vocab.items():
    if token_id == 60002:
        print(token_id, token)
        break

60002 [QUESTION]


# Load the datasets from disk (previously downloaded from Hugging Face)

In [43]:
# Load the train and validation QA splits previously saved to local disk.
train_qa_dataset = datasets.load_from_disk("/root/autodl-tmp/squad/train")
val_qa_dataset = datasets.load_from_disk("/root/autodl-tmp/squad/val")

### For QA dataset

In [64]:
def tokenize_and_convert_to_ids(text):
    """Encode one QA example as token ids plus one (start, end) pair per answer.

    The token sequence is laid out as::

        [CLS] [CONTEXT] <context> [QUESTION] <question> ([ANSWER] <answer>)* [SEP]

    When the example has no answers, the answer part is
    ``[ANSWER] [UNANSWERABLE]`` and the only pair is ``(-1, -1)``.

    Args:
        text: Mapping with ``"context"``, ``"question"`` and ``"answers"``;
            the latter holds parallel ``'text'`` and ``'answer_start'``
            sequences.

    Returns:
        ``(token_ids, start_end)`` where ``token_ids`` is a list of vocab ids
        (unknown tokens map to ``[UNK]``) and ``start_end`` is a list of
        ``(start, end)`` tuples, one per answer.
    """
    sequence = ['[CLS]', '[CONTEXT]']
    sequence.extend(tokenizer.custom_tokenize(text["context"]))
    sequence.append('[QUESTION]')
    sequence.extend(tokenizer.custom_tokenize(text["question"]))

    answers = text["answers"]
    answer_texts = answers['text']
    spans = []
    if len(answer_texts) > 0:
        for idx, answer in enumerate(answer_texts):
            answer_tokens = tokenizer.custom_tokenize(answer)
            sequence.append('[ANSWER]')
            sequence.extend(answer_tokens)
            # NOTE(review): 'answer_start' is presumably the SQuAD character
            # offset into the context, yet it is combined here with token
            # counts (2 leading special tokens, idx + 1, answer length in
            # tokens). Confirm what the consumer of these spans expects.
            first = answers['answer_start'][idx] + 2 + (idx + 1)
            last = first + len(answer_tokens) + (-1)
            spans.append((first, last))
    else:
        sequence.extend(['[ANSWER]', '[UNANSWERABLE]'])
        spans.append((-1, -1))
    sequence.append('[SEP]')

    # Map token strings to ids, falling back to [UNK] for unseen tokens.
    vocab = tokenizer.vocab
    unk_id = vocab['[UNK]']
    ids = [vocab.get(piece, unk_id) for piece in sequence]
    return ids, spans

def process_files(dataset):
    """Encode every example of a QA dataset.

    Args:
        dataset: Iterable of examples accepted by
            ``tokenize_and_convert_to_ids``.

    Returns:
        ``(id_arrays, span_arrays)``: two parallel lists of integer NumPy
        arrays, one entry per example, holding the token ids and the
        (start, end) answer pairs respectively.
    """
    encoded = [tokenize_and_convert_to_ids(example) for example in dataset]
    id_arrays = [np.array(ids, dtype=int) for ids, _ in encoded]
    span_arrays = [np.array(spans, dtype=int) for _, spans in encoded]
    return id_arrays, span_arrays

    # Save all arrays into a single .npy file

# Encode the validation split and store ids and answer positions as two
# object arrays (one variable-length entry per example).
# NOTE(review): np.array(list_of_arrays, dtype=object) only yields a 1-D array
# of arrays when the per-example arrays differ in shape; if they all had the
# same shape NumPy would build a multi-dimensional object array instead.
# Confirm the loaders of these files tolerate that case.
tokenized_arrays, pos = process_files(val_qa_dataset)
np.save('/root/sq_val_tokenized_60000.npy', np.array(tokenized_arrays, dtype=object))
np.save('/root/sq_val_pos_60000.npy', np.array(pos, dtype=object))

#### Check the npy file

In [66]:
# Reload the saved ids and decode the first example next to its raw record as
# a sanity check. allow_pickle=True is required for object arrays; only use it
# on files you created yourself, since unpickling can execute arbitrary code.
val_qa_dataset_npy     = np.load('/root/sq_val_tokenized_60000.npy', allow_pickle=True)
print(val_qa_dataset[0])
tokenizer.convert_tokens_to_string(val_qa_dataset_npy[0])


{'id': '56be4db0acb8001400a502ec', 'title': 'Super_Bowl_50', 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'question': 'Which NFL team represented the AFC at Super Bowl 50?', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'ans

'[CLS] [CONTEXT] super bowl 50 was an american football game to determine the champion of the national football league( nfl) for the 2015 season. the american football conference( afc) champion denver broncos defeated the national football conference( nfc) champion carolina panthers 24 – 10 to earn their third super bowl title. the game was played on february 7, 2016, at levi\'s stadium in the san francisco bay area at santa clara, california. as this was the 50th super bowl, the league emphasized the " golden anniversary " with various gold- themed initiatives, as well as temporarily suspending the tradition of naming each super bowl game with roman numerals( under which the game would have been known as " super bowl l "), so that the logo could prominently feature the arabic numerals 50. [QUESTION] which nfl team represented the afc at super bowl 50? [ANSWER] denver broncos [ANSWER] denver broncos [ANSWER] denver broncos [SEP]'

### For CNN dataset

In [72]:
def tokenize_and_convert_to_ids(text):
    """Encode one summarisation example as a list of vocab ids.

    The token sequence is laid out as::

        [CLS] [ARTICLE] <article> [SUMMARY] <highlights> [SEP]

    Args:
        text: Mapping with ``"article"`` and ``"highlights"`` strings.

    Returns:
        List of token ids; tokens missing from the vocab map to ``[UNK]``.
    """
    article_tokens = tokenizer.custom_tokenize(text["article"])
    summary_tokens = tokenizer.custom_tokenize(text["highlights"])

    sequence = ['[CLS]', '[ARTICLE]', *article_tokens,
                '[SUMMARY]', *summary_tokens, '[SEP]']

    vocab = tokenizer.vocab
    unk_id = vocab['[UNK]']
    return [vocab.get(piece, unk_id) for piece in sequence]

def process_files(dataset):
    """Encode every example of a summarisation dataset.

    Args:
        dataset: Iterable of examples accepted by
            ``tokenize_and_convert_to_ids``.

    Returns:
        List of integer NumPy arrays, one per example, holding the token ids.
    """
    return [
        np.array(tokenize_and_convert_to_ids(example), dtype=int)
        for example in dataset
    ]

    # Save all arrays into a single .npy file

# Encode the CNN test split and store it as one object array (one
# variable-length id array per example).
# NOTE(review): test_CNN_dataset is not defined in the visible part of this
# notebook; presumably it is loaded in another cell. Confirm before running.
all_tokenized_arrays = process_files(test_CNN_dataset)
np.save('/root/CNN_test_tokenized_60000.npy', np.array(all_tokenized_arrays, dtype=object))

#### Check npy file

In [73]:
# Reload the tokenized CNN train/val files and display the validation array.
# NOTE(review): only the test split is written in the visible cell above;
# presumably the train/val files come from earlier runs of that cell with the
# other splits. Confirm they exist.
# allow_pickle=True is required for object arrays; only load trusted files.
train_cnn_dataset     = np.load('/root/CNN_train_tokenized_60000.npy', allow_pickle=True)
val_cnn_dataset     = np.load('/root/CNN_val_tokenized_60000.npy', allow_pickle=True)
val_cnn_dataset