<a href="https://colab.research.google.com/github/KimDaeUng/final_meta_transfer/blob/master/001_ARSC_TASKEMB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
from os import path
from google.colab import drive

drive.mount('/content/drive')

In [None]:
!git clone https://github.com/Gorov/DiverseFewShot_Amazon.git

In [None]:
!git clone https://github.com/huggingface/transformers
%cd transformers
!pip install .
!pip install -r ./examples/requirements.txt
%cd ..

In [None]:
!pip install pyarrow==1.0

In [None]:
%cd '/content/transformers/examples/'

In [None]:
!git clone https://github.com/KimDaeUng/final_meta_transfer.git

In [None]:
import os
import torch
import csv
from torch.utils.data import Dataset
import numpy as np
import collections
import random
import json, pickle
import tqdm
from torch.utils.data import TensorDataset


# Preprocessing

## Load dataset

In [None]:
filelist = "/content/DiverseFewShot_Amazon/Amazon_few_shot/workspace.filtered.list"

In [None]:
def load_train_test_files(listfilename, test_suffix='.test'):
    """Build (train, dev, test) file-name triples for every task in a list file.

    Each non-blank line of ``listfilename`` is split on tabs and its first
    field is taken as the domain name. For each of the task suffixes ``t2``,
    ``t4`` and ``t5`` one triple is produced:
    ``(<domain>.t<k>.train, <domain>.t<k>.dev, <domain>.t<k><test_suffix>)``.

    Args:
        listfilename: Path of the text file that lists one domain per line.
        test_suffix: Suffix appended to the test-split file names.

    Returns:
        A list of ``(trainfile, devfile, testfile)`` tuples in file order.
    """
    file_tuples = []
    # The context manager closes the handle even if parsing raises.
    with open(listfilename, 'r') as filein:
        for line in filein:
            # Parse the domain once per line instead of once per suffix.
            domain = line.strip().split('\t')[0]
            if not domain:
                # A blank line would otherwise yield names such as ".t2.train".
                continue
            for k in ('2', '4', '5'):
                prefix = "{}.t{}".format(domain, k)
                file_tuples.append(
                    (prefix + '.train', prefix + '.dev', prefix + test_suffix)
                )
    return file_tuples

In [None]:
# Empty placeholders; `datasets` is rebound to a dict by the loading loop
# in a later cell.
datasets, list_dataset = [], []
# One (train, dev, test) file-name triple per task listed in `filelist`.
file_tuples = load_train_test_files(filelist)

In [None]:
pwd

In [None]:
cd "/content/DiverseFewShot_Amazon/"

In [None]:
workingdir = 'Amazon_few_shot'

In [None]:
def read_tsv(input_file):
    """Read a ``text<TAB>label`` file into two parallel lists.

    The label string ``'1'`` is mapped to ``1``; every other label string is
    mapped to ``0``.

    Args:
        input_file: Path of the tab-separated file to read.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    texts, labels = [], []
    with open(input_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no example; unpacking them would raise.
                continue
            # Split on the LAST tab only, so a tab inside the text does not
            # break the two-field unpacking.
            text, label = line.rsplit('\t', 1)
            texts.append(text)
            labels.append(1 if label == '1' else 0)
    return texts, labels

In [None]:
# Nested layout: datasets[<domain>.t<k>][<split>] -> {'text': [...], 'label': [...]}
datasets = {}
for split_files in tqdm.tqdm(file_tuples):
    # The task key is the train file name with its ".train" suffix removed.
    task_key = split_files[0].split(".train")[0]
    task_data = {}
    # split_files is ordered (train, dev, test), matching the names below.
    for split_name, file_name in zip(('train', 'dev', 'test'), split_files):
        split_texts, split_labels = read_tsv(os.path.join(workingdir, file_name))
        task_data[split_name] = {'text' : split_texts, 'label' : split_labels}
    datasets[task_key] = task_data

In [None]:
datasets.keys()

In [None]:
datasets['apparel.t2']['train']['text'][1], datasets['apparel.t2']['train']['label'][1]

## View Stats 

In [None]:
# Column-oriented record of how many examples each task has per split.
df_stat = {'domain' : [], 'train' : [], 'dev' : [], 'test' : []}
for task_key, task_splits in datasets.items():
    df_stat['domain'].append(task_key)
    for split_name in task_splits:
        df_stat[split_name].append(len(task_splits[split_name]['text']))

In [None]:
import pandas as pd

pd.set_option('display.max_rows', 69)

df_s = pd.DataFrame(df_stat)
# A task belongs to meta-test when its name contains one of these domain names;
# every other task is meta-train.
df_s['meta_split'] = df_s['domain'].map(
    lambda name: 'meta-test'
    if any(held_out in name for held_out in ['books', 'dvd', 'electronics', 'kitchen_housewares'])
    else 'meta-train'
)
# meta-train rows first (descending on meta_split), then alphabetical by domain.
df_s = df_s.sort_values(by=['meta_split', 'domain'], ascending=[False, True])

In [None]:
# Import seaborn library
import seaborn as sns

# Green sequential colormap used to shade the table cells
cm = sns.light_palette("green", as_cmap=True)

# Visualizing the DataFrame with set precision
print("\nStatistics of the Dataset:")
styled_stats = df_s.style.background_gradient(cmap=cm)
try:
    # Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
    # Styler.format(precision=...) is its replacement.
    styled_stats = styled_stats.format(precision=2)
except TypeError:
    # Older pandas: Styler.format has no `precision` keyword yet.
    styled_stats = styled_stats.set_precision(2)
styled_stats


# Embedding and Visualizing

### Tokenization

In [None]:
!mkdir "/content/transformers/examples/final_meta_transfer/data/model"

In [None]:
# Set BERT tokenizer
from transformers import BertTokenizer

# Directory passed to the tokenizer as its cache_dir
path_model = "/content/transformers/examples/final_meta_transfer/data/model"
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
    cache_dir=path_model,
)

#### nlp library 

In [None]:
import nlp
from datasets import Dataset

In [None]:
df_nlp = {}
for key, value in datasets.items():
    df_nlp_inner = {}
    for split, textnlabel_dict in value.items():
        df_nlp_inner[split] = Dataset.from_dict(textnlabel_dict)
    df_nlp[key] = df_nlp_inner

In [None]:
df_nlp['apparel.t2']['train']['label'][0]

#### Basic Method

In [None]:
# Create dataset  
def create_feature_set(examples, tokenizer, max_seq_length=512):
    """Tokenize texts and pack them into fixed-width tensors.

    Args:
        examples: Mapping with parallel sequences under the keys ``'text'``
            and ``'label'``.
        tokenizer: Object exposing
            ``encode(text, max_length=..., truncation=True)`` that returns a
            list of integer token ids.
        max_seq_length: Width of every row; shorter encodings are padded
            with 0 on the right.

    Returns:
        A ``TensorDataset`` whose tensors are, in order:
        ``(input_ids, attention_mask, segment_ids, lengths, label_ids)``.
        ``lengths`` holds the number of non-padding tokens per row.
    """
    texts, labels = examples['text'], examples['label']
    input_len = len(texts)

    # Zero-filled buffers double as padding: the pad id, the masked-out
    # attention value and the segment id are all 0.
    all_input_ids      = torch.zeros(input_len, max_seq_length, dtype=torch.long)
    all_attention_mask = torch.zeros(input_len, max_seq_length, dtype=torch.long)
    all_segment_ids    = torch.zeros(input_len, max_seq_length, dtype=torch.long)
    all_lengths        = torch.zeros(input_len, dtype=torch.long)
    all_label_ids      = torch.zeros(input_len, dtype=torch.long)

    for id_, (text, label_id) in enumerate(zip(texts, labels)):
        input_ids = tokenizer.encode(text, max_length=max_seq_length, truncation=True)
        if len(input_ids) > max_seq_length:
            # Safety net for tokenizers that ignore `truncation`: keep the
            # head and preserve the final token.
            print("input_ids exceeds max_seq_length : {} > {}".format(len(input_ids), max_seq_length))
            input_ids = input_ids[:max_seq_length-1] + [input_ids[-1]]

        seq_len = len(input_ids)
        # Build the row as int64 directly; torch.Tensor(...) would go through
        # float32, which cannot represent integers above 2**24 exactly.
        all_input_ids[id_, :seq_len] = torch.tensor(input_ids, dtype=torch.long)
        all_attention_mask[id_, :seq_len] = 1
        all_lengths[id_] = seq_len
        all_label_ids[id_] = int(label_id)

    tensor_set = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_lengths, all_label_ids)
    return tensor_set

In [None]:
apparel_t2 = create_feature_set(datasets['apparel.t2']['train'], tokenizer)

In [None]:
apparel_t2[0]

In [None]:
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

In [None]:
data_loader = DataLoader(apparel_t2,  batch_size=128)

In [None]:
!mkdir "/content/drive/My Drive/Colab Notebooks/Final/meta_transfer/data/amazon_emb"

In [None]:
# get path to save embedding tensor file(*.pt)
emb_path = "/content/drive/My Drive/Colab Notebooks/Final/meta_transfer/data/amazon_emb"

In [None]:
# Load pretrained model
from transformers import BertModel

# output_hidden_states=True is requested because the embedding helpers below
# read the per-layer hidden states from the model output.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)
model.to('cuda')

In [None]:
def save_emb(model, data_loader, task_name, split):
    """Encode every batch and save its mean-pooled sentence vectors.

    One file is written per batch into the module-level ``emb_path``
    directory, named ``{task_name}_{split}_{n}.pt`` where ``n`` is the
    zero-padded number of examples consumed up to and including that batch.
    A batch whose file already exists is skipped, so an interrupted run can
    be resumed.

    Args:
        model: Encoder called as
            ``model(input_ids, attention_mask=..., token_type_ids=...)``;
            element 2 of its output is read as the sequence of per-layer
            hidden states.
        data_loader: Loader yielding
            ``(input_ids, attention_mask, segment_ids, lengths, labels)``,
            the tensor order produced by ``create_feature_set``.
        task_name: Task identifier used in the file name.
        split: Split identifier used in the file name.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    num_seen = 0
    for batch in tqdm.tqdm(data_loader, total=len(data_loader)):
        inp, att, seg, leng, label = batch
        # A running count gives the right index for a short final batch; the
        # previous formula used `len(dataset) % len(data_loader)` and so
        # mislabelled the last file.
        num_seen += inp.size(0)

        save_path = os.path.join(emb_path,"{}_{}_{}.pt".format(task_name, split, str(num_seen).zfill(5)))
        print(save_path)
        if os.path.isfile(save_path):
            continue

        with torch.no_grad():
            hidden = model(
                inp.to(device),
                attention_mask=att.to(device),
                token_type_ids=seg.to(device),
            )[2]
            token = hidden[-1].to("cpu")
            # NOTE(review): the mean runs over every position, padding
            # included; `length` is saved alongside - confirm whether a
            # masked mean was intended.
            sentence = torch.mean(token, dim=1)
        bat = {"sentence" : sentence, "length":leng, 'label' : label}
        torch.save(bat, save_path)

In [None]:
def get_taskemb(model, data_loader, task_name, split):
    """Compute one normalised embedding vector for a whole task.

    Every example is encoded, its last-layer token states are mean-pooled
    into a sentence vector, and the sentence vectors are summed. The sum is
    divided by the square root of the total squared magnitude of all
    sentence vectors.

    Args:
        model: Encoder called as
            ``model(input_ids, attention_mask=..., token_type_ids=...)``;
            element 2 of its output is read as the sequence of per-layer
            hidden states.
        data_loader: Loader yielding
            ``(input_ids, attention_mask, segment_ids, lengths, labels)``,
            the tensor order produced by ``create_feature_set``.
        task_name: Task identifier, used only in progress output.
        split: Split identifier, used only in progress output.

    Returns:
        A 1-D CPU tensor whose size is the model's hidden dimension.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    # Accumulators must start at zero: torch.empty returns uninitialised
    # memory, so adding onto it gives arbitrary results.
    task_emb = None  # allocated from the first batch, so the hidden size is not hard-coded
    task_norm = torch.zeros(1)
    num_seen = 0
    for batch in tqdm.tqdm(data_loader, total=len(data_loader)):
        inp, att, seg, leng, label = batch
        # Running example count; stays correct for a short final batch.
        num_seen += inp.size(0)

        print("{}_{}_{}".format(task_name, split, str(num_seen).zfill(3)))
        with torch.no_grad():
            hidden = model(
                inp.to(device),
                attention_mask=att.to(device),
                token_type_ids=seg.to(device),
            )[2]
            hx = hidden[-1].to("cpu")
            # NOTE(review): the mean runs over every position, padding
            # included - confirm whether a masked mean was intended.
            hx = torch.mean(hx, dim=1)

            task_norm += torch.sum(torch.square(hx))
            # Batch-wise summation
            batch_sum = torch.sum(hx, dim=0)
            task_emb = batch_sum if task_emb is None else task_emb + batch_sum

    if task_emb is None:
        raise ValueError("data_loader yielded no batches; cannot build a task embedding")

    return task_emb / torch.sqrt(task_norm)

### Get TASKEMBs

In [None]:
# TODO: also embed the dev splits
task_emb_dic = {}
separator = "-"*50
for task_name, task_splits in datasets.items():
    print(separator)
    print("Task : ", task_name)
    # Only the training split of each task contributes to its embedding.
    train_features = create_feature_set(task_splits['train'], tokenizer)
    train_loader = DataLoader(train_features,  batch_size=128)
    task_emb_dic[task_name] = get_taskemb(model, train_loader, task_name, 'train')


In [None]:
# Save Task Embedding
task_emb_path = '/content/drive/My Drive/Colab Notebooks/Final/meta_transfer/data/task_emb_dic.pt'
torch.save(task_emb_dic, task_emb_path)

In [None]:
task_emb_dic

# Code Below Is for Meta-Training Data Preprocessing

In [None]:
# # Preprocess the whole dataset (not recommended)
# df_nlp_process = {}
# for key, value in df_nlp.items():
#     print("Task : ", key)
#     df_nlp_process_inner = {}
#     for split, dset in value.items():
#         print("\t", split)
#         df_nlp_process_inner[split] = dset.map(
#         lambda x: tokenizer(x['text'], padding=True,
#                             return_length=True),batched=True)
#     df_nlp_process[key] = df_nlp_process_inner

In [None]:
# Two-way lookup: label name -> id and id -> label name.
# NOTE(review): because both directions share one dict, looking up an integer
# label returns a string, which create_feature_set below would then try to
# store in a long tensor - confirm which label type the examples carry.
LABEL_MAP  = {'positive':0, 'negative':1, 0:'positive', 1:'negative'}

class MetaTask(Dataset):
    """Collection of few-shot tasks, each a (support set, query set) pair.

    Tasks are sampled once, at construction time, by ``create_batch``;
    indexing tokenizes the selected examples and returns two TensorDatasets.

    NOTE(review): `Dataset` is expected to be torch.utils.data.Dataset here;
    confirm that no later import has rebound the name.
    """
    
    def __init__(self, examples, num_task, k_support, k_query, tokenizer):
        """
        :param examples: pool of examples to draw tasks from.
            NOTE(review): this method shuffles it in place with random.shuffle
            (sequence semantics) while create_batch calls .keys()/.items() on it
            (mapping semantics) - the expected container type must be confirmed.
        :param num_task: number of training tasks.
        :param k_support: number of support sample per task
        :param k_query: number of query sample per task
        :param tokenizer: object whose ``encode(text)`` returns a list of token ids
        """
        self.examples = examples
        random.shuffle(self.examples)
        
        self.num_task = num_task
        self.k_support = k_support
        self.k_query = k_query
        self.tokenizer = tokenizer
        # Fixed row width used by create_feature_set.
        self.max_seq_length = 512
        self.create_batch(self.num_task)
    
    def create_batch(self, num_task):
        """Sample ``num_task`` tasks and store them in self.supports / self.queries."""
        self.supports = []  # support set
        self.queries = []  # query set
        
        for b in range(num_task):  # for each task
            # 1. select domain randomly
            # NOTE(review): random.choice needs an indexable sequence; a dict
            # keys view is not indexable in Python 3, so this likely needs
            # list(...) around it.
            domain = random.choice(self.examples.keys())
            # Keep the (key, value) pairs whose key contains the chosen domain.
            domainExamples = [e for e in self.examples.items() if domain in e[0] ]
            
            # 2. select k_support + k_query examples from domain randomly
            # (random.sample draws without replacement and raises ValueError
            # when fewer than k_support + k_query candidates are available)
            selected_examples = random.sample(domainExamples,self.k_support + self.k_query)
            random.shuffle(selected_examples)
            # First k_support examples form the support set, the rest the query set.
            exam_train = selected_examples[:self.k_support]
            exam_test  = selected_examples[self.k_support:]
            
            self.supports.append(exam_train)
            self.queries.append(exam_test)

    def create_feature_set(self,examples):
        """Tokenize ``examples`` into a TensorDataset of fixed-width rows.

        Each example is indexed with the keys 'text' and 'label'. The returned
        tensors are, in order: (input_ids, attention_mask, segment_ids, label_ids).
        """
        all_input_ids      = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_attention_mask = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_segment_ids    = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_label_ids      = torch.empty(len(examples), dtype = torch.long)

        for id_,example in enumerate(examples):
            # NOTE(review): encode is called without max_length/truncation, so
            # an encoding longer than max_seq_length would not fit the
            # preallocated row below.
            input_ids = self.tokenizer.encode(example['text'])
            attention_mask = [1] * len(input_ids)
            segment_ids    = [0] * len(input_ids)

            # Right-pad all three lists with 0 up to max_seq_length.
            while len(input_ids) < self.max_seq_length:
                input_ids.append(0)
                attention_mask.append(0)
                segment_ids.append(0)

            label_id = LABEL_MAP[example['label']]
            all_input_ids[id_] = torch.Tensor(input_ids).to(torch.long)
            all_attention_mask[id_] = torch.Tensor(attention_mask).to(torch.long)
            all_segment_ids[id_] = torch.Tensor(segment_ids).to(torch.long)
            all_label_ids[id_] = torch.Tensor([label_id]).to(torch.long)

        tensor_set = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)  
        return tensor_set
    
    def __getitem__(self, index):
        """Return (support_set, query_set) TensorDatasets for task ``index``."""
        support_set = self.create_feature_set(self.supports[index])
        query_set   = self.create_feature_set(self.queries[index])
        return support_set, query_set

    def __len__(self):
        # The dataset length is the number of tasks sampled at construction time.
        return self.num_task

In [None]:
def get_train_examples(self, data_dir):
    """Read ``data_dir`` and build the training examples from it.

    Args:
        self: Object providing ``_read_csv(path)`` and
            ``_create_examples(lines, type)``.
        data_dir: Location handed to ``self._read_csv``.

    Returns:
        Whatever ``self._create_examples(lines, "train")`` returns.
    """
    # Local import: no module-level `logger` is defined in the notebook, so
    # the original `logger.info` call would raise NameError.
    import logging

    logging.getLogger(__name__).info("LOOKING AT {} train".format(data_dir))
    # The original line had an unbalanced ")" that made the cell a SyntaxError;
    # "train" is the second argument of _create_examples.
    return self._create_examples(self._read_csv(data_dir), "train")

In [None]:
def _read_tsv(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        return list(csv.reader(f, delimiter='\t'))

In [None]:
# NOTE(review): the only `path` bound in the visible notebook is the `os.path`
# module (`from os import path` in the setup cell), which cannot be opened as a
# file - confirm which TSV file path was intended here.
data_kit = _read_tsv(path)

In [None]:
data_kit[:2]

In [None]:
def _create_examples(lines, type):
    """Convert parsed rows into a list of ``InputExample`` objects.

    For ``type == "train"`` the last field of the first row must be the
    string ``"label"``; otherwise a ``ValueError`` is raised. Every row in
    ``lines``, the first one included, is converted, and its position in
    ``lines`` becomes the example id.
    """
    if type == "train":
        if lines[0][-1] != "label":
            raise ValueError("For training, the input file must contain a label column.")

    examples = []
    for idx, row in enumerate(lines):
        examples.append(
            InputExample(
                example_id=idx,
                question=row[5],
                # The same context (column 4) is repeated once per ending.
                contexts=[row[4]] * 4,
                endings=[row[col] for col in (7, 8, 9, 10)],
                label=row[11],
            )
        )

    return examples

In [None]:
cd DiverseFewShot_Amazon/