In [1]:
# Environment and RNG seeding; every seed below uses the same value, 2019.
import os
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning
# it here does not change str hashing in this already-running kernel; presumably
# it is intended for child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(2019)
# Cache location for Hugging Face transformers; assigned before `transformers`
# is imported further down in this cell.
os.environ['TRANSFORMERS_CACHE'] = 'D:\\python_pkg_data\\huggingface\\transformers'

import json
from tqdm import tqdm_notebook

import numpy as np 
# Seed NumPy's global random number generator.
np.random.seed(2019)
import random
# Seed Python's built-in `random` module.
random.seed(2019)

import torch
# Seed PyTorch, then explicitly seed every CUDA device.
torch.manual_seed(2019)
torch.cuda.manual_seed_all(2019)
# Turn off cuDNN auto-tuning and request deterministic cuDNN algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

from sklearn.utils import shuffle
import transformers



from datasets import load_metric,load_dataset,Value
import csv


import nltk
nltk.data.path.append('D:\\python_pkg_data\\nltk_data')
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

import ast
import glob
import shutil

import importlib
from torch.utils.data import DataLoader
from torch.nn import Softmax
from termcolor import colored
from itertools import groupby
from operator import itemgetter
import html
from IPython.core.display import display, HTML
import more_itertools as mit
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sanity check: report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [2]:
# Experiment configuration read by the cells below.
args = dict(
    # tab-separated file holding the original IMDb training split
    ori_train_dir='./datasets/IMDb/orig/train.tsv',
    # CUDA device index the model and batches are moved to
    gpu_device=0,
    # tokenizer loaded from the 'roberta-base' checkpoint
    tokenizer=transformers.AutoTokenizer.from_pretrained('roberta-base'),
    # local directory for datasets
    dataset_cache_dir="D:\\python_pkg_data\\huggingface\\Datasets",
    # random seed for subsampling the training set
    train_random_seed=2019,
    # number of examples per class in the initial training set
    num_per_class=25,
    # directory prefix for saved models
    save_dir='./AL_results/IMDb_step0_al_trainer',
    # number of newly labelled examples per active-learning step
    num_per_step=50,
    # NOTE(review): meaning not evident from this section; here it only
    # appears as a component of the run-directory name — confirm.
    num_per_example=7,
)

In [3]:

class CustomerDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-example encodings with labels.

    ``encodings`` is a mapping from field name to a sequence indexable by
    example position; ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [4]:
 ## import training set
# Build {example key: {'text', 'label'}} from the training TSV.
# Column 0 is the sentiment string, column 1 the review text, column 2 the key.
IMDb_data = {}

with open(args['ori_train_dir'],errors='ignore') as tsv_handle:
    for fields in csv.reader(tsv_handle, delimiter="\t"):
        if not fields:
            # empty rows carry no example
            continue
        # 'Negative' maps to 0; any other sentiment value maps to 1
        sentiment_id = 0 if fields[0] == 'Negative' else 1
        IMDb_data[fields[2]] = {'text':fields[1],'label':sentiment_id}

In [5]:
# Split the loaded records into two lookups that share the same keys:
# key -> review text and key -> integer label.
imdb_texts = {example_id: record['text'] for example_id, record in IMDb_data.items()}
imdb_labels = {example_id: record['label'] for example_id, record in IMDb_data.items()}

In [6]:
# Show the distinct label values and how many examples carry each one.
all_label_values = list(imdb_labels.values())
print(np.unique(all_label_values), np.bincount(all_label_values))

[0 1] [850 857]


# Most uncertain examples

In [7]:
# Score every unlabelled example with the previous step's model and pick the
# `num_per_step` examples whose two class probabilities are closest together.

# Directory of the previous active-learning step: holds keys.txt and checkpoint*.
run_dir = f"{args['save_dir']}_{args['train_random_seed']}_{args['num_per_class']}_{args['num_per_example']}"

# Keys already in the labelled training set, one per line.
train_keys = []
with open(f"{run_dir}/keys.txt") as f:
    for line in f:
        # Strip only the line terminator: slicing off the last character would
        # truncate the final key when the file does not end with a newline.
        key = line.rstrip("\n")
        if key:
            train_keys.append(key)

# Keep the dataset's own key order. A set difference would yield an order that
# depends on per-process string hashing and so varies between kernel restarts.
labelled_key_set = set(train_keys)
unlabelled_keys = [key for key in imdb_labels if key not in labelled_key_set]
if not unlabelled_keys:
    raise ValueError("No unlabelled examples left to score")

unlabelled_texts = [imdb_texts[key] for key in unlabelled_keys]
unlabelled_labels = [imdb_labels[key] for key in unlabelled_keys]
unlabelled_encodings = args['tokenizer'](unlabelled_texts, truncation=True, padding=True)

print('IMDb unlabelled data statistics -----------------------')
print(np.unique(unlabelled_labels),np.bincount(unlabelled_labels))

unlabelled_dataset = CustomerDataset(unlabelled_encodings, unlabelled_labels)
# shuffle=False keeps predictions aligned with `unlabelled_keys`.
unlabelled_dataloader = torch.utils.data.DataLoader(unlabelled_dataset, batch_size=32,shuffle=False)

# glob returns matches in arbitrary order; sort so the chosen checkpoint is
# deterministic, and fail with a clear message when none exists.
# NOTE(review): with several checkpoints this takes the lexicographically first
# one — confirm that is the intended checkpoint.
checkpoint_dirs = sorted(glob.glob(f"{run_dir}/checkpoint*"))
if not checkpoint_dirs:
    raise FileNotFoundError(f"No checkpoint* directory found in {run_dir}")
model_dir = checkpoint_dirs[0]
print(f"previous model {model_dir}")
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2).cuda(args['gpu_device'])

model.eval()
# Despite the name (kept for later cells), this holds softmax probabilities:
# one [p_class0, p_class1] pair per unlabelled example.
logits_list = []
sm = torch.nn.Softmax(dim=1)
for batch in tqdm(unlabelled_dataloader):
    batch = {k: v.cuda(args['gpu_device']) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    probs = sm(outputs.logits)
    # extend in place instead of rebuilding the list on every batch
    logits_list.extend(probs.tolist())

# Margin between the two class probabilities; a small margin means uncertain.
probs_array = np.asarray(logits_list)
difference = np.abs(probs_array[:, 0] - probs_array[:, 1])

# Smallest margins first. To sample the most *certain* examples instead, use
# difference.argsort()[-args['num_per_step']:].
uncertainty_index = difference.argsort()[:args['num_per_step']]

new_keys = np.array(unlabelled_keys)[uncertainty_index]
train_keys = np.append(train_keys, new_keys)

# Persist the newly selected keys, one per line.
with open(f"{run_dir}/new_keys.txt", "w") as fp:
    fp.writelines(f"{k}\n" for k in new_keys)

IMDb unlabelled data statistics -----------------------
[0 1] [825 832]
previous model ./AL_results/IMDb_step0_al_trainer_2019_25_7\checkpoint-13


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/52 [00:00<?, ?it/s]