<a href="https://colab.research.google.com/github/M1F1/MasterThesis/blob/master/SemiSupervisedComposableFramework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup env 

In [1]:
from google.colab import drive
from pathlib import Path
import importlib
import pkg_resources

# Mount Google Drive; the project/dataset paths configured further down live under it.
drive.mount('/content/gdrive')

# Install each third-party package only when it is not importable yet,
# so re-running this cell in a warm runtime skips the pip calls.
if importlib.util.find_spec('neptune') is None:
  !pip install neptune-client

if importlib.util.find_spec('pytorch_lightning') is None:
  !pip install pytorch-lightning

if importlib.util.find_spec('logzero') is None:
  !pip install logzero 

if importlib.util.find_spec('tensorboardX') is None:
  !pip install tensorboardX 

if importlib.util.find_spec('lineflow') is None:
  !pip install lineflow

if importlib.util.find_spec('optuna') is None:
  !pip install optuna

# NOTE: the guard below is commented out, so the pinned gdown version is
# (re)installed unconditionally on every run of this cell.
#if importlib.util.find_spec('gdown') is None:
!pip install gdown==3.11.0
  
if importlib.util.find_spec('transformers') is None:
  !pip install transformers 
  
if importlib.util.find_spec('nlpaug') is None:
  !pip install nlpaug 

import gdown
import contextlib
import glob
import shutil
import os
from functools import partial
from collections import OrderedDict
from typing import Dict
import re
import time

import lineflow as lf
import lineflow.datasets as lfds

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, RandomSampler, Dataset, sampler
import torch.nn.functional as F

import sklearn
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn import preprocessing

import pytorch_lightning as pl
if pkg_resources.parse_version(pl.__version__) < pkg_resources.parse_version("0.7.1"):
  raise RuntimeError("PyTorch Lightning>=0.7.1 is required for this code.")
from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.logging.neptune import NeptuneLogger 
from pytorch_lightning import Callback

from gensim.utils import tokenize as gensim_tokenizer
import gensim
from gensim.models.fasttext import FastText as FT_gensim

from transformers import BertModel, BertTokenizer, RobertaTokenizer, RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup 
import random
import numpy as np
import pandas as pd
import spacy
import nltk
import toolz
from nltk.corpus import stopwords
import optuna
from optuna.integration import PyTorchLightningPruningCallback

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.model.word_stats as nmw
import nlpaug.flow as nafc
from nlpaug.util import Action

# Fetch NLTK resources. 'stopwords' is read by preprocess_NLUHD; 'wordnet' and the
# POS tagger are presumably needed by the WordNet-based nlpaug augmenters (TODO confirm).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Snapshot the installed package versions into ./requirements.txt.
!pip freeze > requirements.txt

# Locations are published through environment variables so later cells can read them.
# All paths are relative to the current working directory (Path()).
os.environ['PROJECT_PATH'] = str(Path()/'gdrive'/'My Drive'/'praca_magisterska'/'pytorch_lightning')
os.environ['DATASETS_PATH'] = str(Path()/'gdrive'/'My Drive'/'praca_magisterska'/'pytorch_lightning'/'datasets')
os.environ['REQUIREMENTS_PATH'] = str(Path()/'requirements.txt')
# Watch out for this path: unlike the others it is NOT under the mounted Drive,
# so whatever is written here lives only in the local working directory.
os.environ['RESULT_PATH'] = str(Path()/'result')
os.environ['SPELLING_PATH'] = str(Path()/'gdrive'/'My Drive'/'praca_magisterska'/'pytorch_lightning'/'nlpaug'/'spelling_en.txt') 
os.environ['NLPAUG_PATH'] = str(Path()/'gdrive'/'My Drive'/'praca_magisterska'/'pytorch_lightning'/'nlpaug')
artefacts_temp_dir = os.path.join(os.environ['PROJECT_PATH'], 'parametrized_nbs')

# The Neptune API token is kept in a text file on Drive instead of in the notebook.
neptune_api_token_key_file = str(Path()/'gdrive'/'My Drive'/'praca_magisterska'/'neptune_api_token.txt')
with open(neptune_api_token_key_file, 'r') as f:
  # Only the first line is the token; strip it so the trailing newline
  # (and any stray whitespace) does not become part of the credential.
  os.environ['NEPTUNE_API_TOKEN'] = f.readline().strip()

# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(artefacts_temp_dir, exist_ok=True)
os.makedirs(os.environ['RESULT_PATH'], exist_ok=True)




Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Dataset statistics

#### IMDB

In [2]:
# train = lfds.Imdb('train')
# test = lfds.Imdb('test')
# dataset = train + test
# ds = dataset.map(lambda x: {'text': x[0], 'label':x[1], 'tokens_len': len(list(gensim_tokenizer(x[0])))})
# df = pd.DataFrame(ds)
# df['tokens_len'].describe()

### Embedders

In [3]:
def create_ft_embeder():
  my_file = Path("./cc.en.300.bin")
  # my_file = Path("./wiki-news-300d-1M.vec.zip")
  if not my_file.is_file():
    !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
    # !https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
    !gunzip cc.en.300.bin.gz
  print('Loading fastText model into memory, it can take a while...')
  start = time.time()
  # ft = gensim.models.fasttext.load_text_format()  # Original fasttext embeddings from https://fasttext.cc/
  ft = gensim.models.FastText.load_fasttext_format("./cc.en.300.bin")
  # ft = FT_gensim.load("./cc.en.300.bin")
  # ft = fasttext.load_model("./cc.en.300.bin")
  end = time.time()
  duration =  end - start
  print(f'Loading took: {duration} s')
  return ft

def create_spacy_nlp_embeder():
  my_file = Path("./crawl-300d-2M.vec.zip")
  if not my_file.is_file():
    !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip 
  my_file = Path("./en_vectors_wiki_lg")
  if not my_file.is_file():
    !python -m spacy init-model en /en_vectors_wiki_lg --vectors-loc crawl-300d-2M.vec.zip
  print('Loading fastText vectors into spaCy model into memory, it can take a while...')
  start = time.time()
  nlp = spacy.load("/en_vectors_wiki_lg")
  end = time.time()
  duration =  end - start
  print(f'Loading took: {duration} s')
  return nlp 


In [4]:
# Build/load the spaCy vector model once at notebook level; the cell output below
# shows that reading the vectors and loading the model takes several minutes.
nlp = create_spacy_nlp_embeder()

✔ Successfully created model
⠙ Reading vectors from crawl-300d-2M.vec.ziptcmalloc: large alloc 2400002048 bytes == 0x33bc000 @  0x7faf38ea0001 0x7faf36a04765 0x7faf36a68bb0 0x7faf36a6aa4f 0x7faf36b01048 0x50a635 0x50cd96 0x509758 0x50a48d 0x50bfb4 0x507d64 0x509a90 0x50a48d 0x50bfb4 0x507d64 0x588d41 0x59fc4e 0x50d356 0x507d64 0x509a90 0x50a48d 0x50bfb4 0x507d64 0x509a90 0x50a48d 0x50bfb4 0x507d64 0x516345 0x50a2bf 0x50bfb4 0x507d64
1999995it [03:10, 10499.89it/s]
✔ Loaded vectors from crawl-300d-2M.vec.zip
✔ Sucessfully compiled vocab
2000191 entries, 1999995 vectors
Loading fastText vectors into spaCy model into memory, it can take a while...
Loading took: 25.01153588294983 s


### VAT


In [5]:
@contextlib.contextmanager
def _disable_tracking_bn_stats(model):

    def switch_attr(m):
        if hasattr(m, 'track_running_stats'):
            m.track_running_stats ^= True
    model.apply(switch_attr)
    yield
    model.apply(switch_attr)

class VATLoss(nn.Module):
    """Virtual adversarial training loss.

    Finds a perturbation direction that changes the model's prediction the most
    (estimated with ``ip`` gradient steps) and returns the KL divergence between
    the clean prediction and the prediction on the perturbed input.
    """

    def __init__(self, xi=10.0, eps=1.0, ip=1):
        """VAT loss
        :param xi: scale of the probe perturbation used while searching (default: 10.0)
        :param eps: scale of the final adversarial perturbation (default: 1.0)
        :param ip: iteration times of computing adv noise (default: 1)
        """
        super().__init__()
        self.xi, self.eps, self.ip = xi, eps, ip

    @staticmethod
    def _l2_normalize(d):
        """Scale each sample of ``d`` to unit L2 norm, in place, and return it."""
        # Flatten everything after the batch axis, keeping trailing singleton
        # axes so the norm broadcasts back over ``d``.
        flat = d.view(d.shape[0], -1, *(1 for _ in range(d.dim() - 2)))
        d /= torch.norm(flat, dim=1, keepdim=True) + 1e-8
        return d

    def forward(self, model, x):
        """Return the VAT term for ``model`` evaluated on batch ``x``.

        :param model: callable mapping ``x`` to logits (softmax is taken over dim 1)
        :param x: input batch; the perturbation has the same shape
        :return: scalar tensor, KL(clean prediction || perturbed prediction)
        """
        # The clean prediction is a fixed target: no gradient flows through it.
        with torch.no_grad():
            pred = F.softmax(model(x), dim=1)

        # Random starting direction, centred on zero and normalised per sample.
        d = self._l2_normalize(torch.rand(x.shape).sub(0.5).to(x.device))

        with _disable_tracking_bn_stats(model):
            # Power-iteration style search for the adversarial direction.
            for _ in range(self.ip):
                d.requires_grad_()
                logp_hat = F.log_softmax(model(x + self.xi * d), dim=1)
                adv_distance = F.kl_div(logp_hat, pred, reduction='batchmean')
                adv_distance.backward(retain_graph=True)
                d = self._l2_normalize(d.grad)
                # Drop the parameter gradients produced by the probe pass.
                model.zero_grad()

        # Local distributional smoothness at the adversarial point.
        logp_hat = F.log_softmax(model(x + d * self.eps), dim=1)
        return F.kl_div(logp_hat, pred, reduction='batchmean')

### FixMatch Augmentation

In [6]:
# ---------------------------------------------    
# augmentations on char level 
# ---------------------------------------------    
def print_augmentations(func, text):
  """Apply ``func`` to ``text`` and print the function name, input and output.

  Returns:
    Whatever ``func(text)`` returned.
  """
  print("Augmentation function: ", func.__name__)
  print("Original: ")
  print(text)
  result = func(text)
  print("Augmention result: ")
  print(result)
  return result

def substitute_character_by_keyboard_distance(text):
  """Return ``text`` augmented by a default-configured ``nac.KeyboardAug``."""
  return nac.KeyboardAug().augment(text)

def insert_character_randomly(text):
  """Return ``text`` augmented by ``nac.RandomCharAug`` with action "insert"."""
  return nac.RandomCharAug(action="insert").augment(text)


def substitute_character_randomly(text):
  """Return ``text`` augmented by ``nac.RandomCharAug`` with action "substitute"."""
  return nac.RandomCharAug(action="substitute").augment(text)


def delete_char_randomly(text):
  """Return ``text`` augmented by ``nac.RandomCharAug`` with action "delete"."""
  return nac.RandomCharAug(action="delete").augment(text)


def swap_character_randomly(text):
  """Return ``text`` augmented by ``nac.RandomCharAug`` with action "swap"."""
  return nac.RandomCharAug(action="swap").augment(text)

# ---------------------------------------------    
# augmentations on word level 
# ---------------------------------------------    
# models - spelling_en.txt
# model_dir with fasttext or word2vec or glove 
# model dir with tf-idf

# disabled: it consumes too much RAM
# def insert_word_randomly_by_word_embeddings_similarity(text):
#   # model_type: word2vec, glove or fasttext
#   aug = naw.WordEmbsAug(
#       model_type='word2vec', model_path=os.environ('WORD2VEC_MODEL_PATH'),
#       action="insert")
#   augmented_text = aug.augment(text)
#   print("Original:")
#   print(text)
#   print("Augmented Text:")
#   print(augmented_text)
#   return augmented_text


def insert_word_by_tf_idf_similarity(text):
  """Return ``text`` augmented by ``naw.TfIdfAug`` (action "insert").

  The TF-IDF model is read from the directory named by the ``NLPAUG_PATH``
  environment variable.
  """
  tfidf_augmenter = naw.TfIdfAug(model_path=os.environ['NLPAUG_PATH'], action="insert")
  return tfidf_augmenter.augment(text)



def split_word_to_two_tokens_randomly(text):
  """Return ``text`` augmented by a default-configured ``naw.SplitAug``."""
  return naw.SplitAug().augment(text)


def swap_word_randomly(text):
  """Return ``text`` augmented by ``naw.RandomWordAug`` with action "swap"."""
  return naw.RandomWordAug(action="swap").augment(text)


def substitute_word_by_antonym(text):
  """Return ``text`` augmented by a default-configured ``naw.AntonymAug``."""
  return naw.AntonymAug().augment(text)


def substitute_word_by_spelling_mistake_words_dictionary(text):
  """Return ``text`` augmented by ``naw.SpellingAug``.

  The misspelling dictionary is read from the file named by the
  ``SPELLING_PATH`` environment variable; one augmented variant is requested.
  """
  spelling_augmenter = naw.SpellingAug(os.environ['SPELLING_PATH'])
  return spelling_augmenter.augment(text, n=1)


def insert_word_by_contextual_word_embeddings(text):
  """Return ``text`` augmented by ``naw.ContextualWordEmbsAug`` (bert-base-uncased, "insert")."""
  contextual_augmenter = naw.ContextualWordEmbsAug(
      action="insert", model_path='bert-base-uncased')
  return contextual_augmenter.augment(text)


def subtitute_word_by_contextual_word_embeddings(text):
  """Return ``text`` augmented by ``naw.ContextualWordEmbsAug`` (bert-base-uncased, "substitute")."""
  contextual_augmenter = naw.ContextualWordEmbsAug(
      action="substitute", model_path='bert-base-uncased')
  return contextual_augmenter.augment(text)


def substitute_word_by_WordNets_synonym(text):
  """Return ``text`` augmented by ``naw.SynonymAug`` using the "wordnet" source."""
  return naw.SynonymAug(aug_src='wordnet').augment(text)
  
def fixmatch_weak_augment_pool():
    """Return a fresh list of the augmentation functions used for weak augmentation.

    Each entry is a callable taking a text and returning the augmented text.
    The order is: character-level operations first, then word-level ones.
    """
    char_level = (
        substitute_character_by_keyboard_distance,
        insert_character_randomly,
        substitute_character_randomly,
        delete_char_randomly,
        swap_character_randomly,
    )
    # insert_word_randomly_by_word_embeddings_similarity is intentionally absent
    # (its definition is commented out in this notebook).
    word_level = (
        insert_word_by_tf_idf_similarity,
        split_word_to_two_tokens_randomly,
        swap_word_randomly,
        substitute_word_by_antonym,
        substitute_word_by_spelling_mistake_words_dictionary,
        insert_word_by_contextual_word_embeddings,
        subtitute_word_by_contextual_word_embeddings,
        substitute_word_by_WordNets_synonym,
    )
    return [*char_level, *word_level]

# def fixmatch_strong_augment_pool():
#     augs = [
#             insert_word_by_contextual_word_embeddings,
#             subtitute_word_by_contextual_word_embeddings,
#             substitute_word_by_WordNets_synonym,
#            ]

#     return augs


class WeakRandAugment(object):
  """Apply ``n`` augmentations drawn at random (with replacement) from the weak pool."""

  def __init__(self, n, show=False):
    """
    Parameters:
    n (int): number of operations; 0 turns the transform into the identity
    show (bool): when True each operation is run through print_augmentations

    """
    assert n >= 0
    self.augment_pool = fixmatch_weak_augment_pool()
    self.n = n
    self.show = show

  def __call__(self, text):
    """Return ``text`` after chaining the sampled operations one after another."""
    if self.n > 0:
      for op in random.choices(self.augment_pool, k=self.n):
        # random.random() is always below 1., so no operation is ever skipped;
        # the draw is kept so the random stream is consumed as before.
        if not random.random() < 1.:
          continue
        text = print_augmentations(op, text) if self.show else op(text)
    return text

# not necessary
# class StrongRandAugment(object):
#   def __init__(self, n, show=False):
#     assert n >= 1
#     self.n = n
#     self.augment_pool = fixmatch_strong_augment_pool()
#     self.show= show

#   def __call__(self, text):
#     ops = random.choices(self.augment_pool, k=self.n)
#     for op in ops:
#       if random.random() < 1.:
#         if self.show:
#           text = print_augmentations(op, text)
#         else:
#           text = op(text)
#     return text 


class TransformFix(object):
  """Callable wrapper that applies the weak random augmentation to a text.

  Only the weak branch is active; the strong-augmentation variant is
  commented out elsewhere in this notebook.
  """

  def __init__(self, n_weak=3, show=False):
    """Create the weak augmenter with ``n_weak`` operations; ``show`` enables printing."""
    self.weak = WeakRandAugment(show=show, n=n_weak)

  def __call__(self, x):
    """Return the weakly augmented version of ``x``."""
    return self.weak(x)



### Datasets 

In [7]:
class TwoInOneDataset(Dataset):
    """Zip several datasets of different sizes into one dataset of tuples.

    The combined length is that of the longest dataset. Shorter datasets are
    cycled through in shuffled order via per-dataset index maps, which are
    extended every time ``len()`` is called.
    """

    def __init__(self, datasets):
        """:param datasets: sequence of datasets supporting ``len`` and integer indexing."""
        self.datasets = datasets

        # One index map per dataset: position in the combined dataset -> index in that dataset.
        self.map_indexes = [[] for _ in self.datasets]

        self.min_length = min(len(d) for d in self.datasets)
        self.max_length = max(len(d) for d in self.datasets)

    def __getitem__(self, i):
        """Return a tuple holding the ``i``-th mapped sample of every dataset."""
        # The maps are normally built by __len__; build them on demand so that
        # indexing before the first len() call does not fail on an empty map.
        if any(len(m) == 0 for m in self.map_indexes):
            self.construct_map_index()
        return tuple(d[m[i]] for d, m in zip(self.datasets, self.map_indexes))

    def construct_map_index(self):
        """Drop the indices consumed by the previous pass and refill every map."""
        def update_indices(original_indexes, target_len, max_len):
            # map max_len to target_len (large to small)

            # return: a list, which maps the range(max_len) to the valid index in the dataset

            original_indexes = original_indexes[max_len:] # remove used indices
            fill_num = max_len - len(original_indexes)
            batch = fill_num // target_len

            if fill_num % target_len != 0:
                # to let the fill_num + len(original_indexes) greater than max_len
                batch += 1

            # Whole shuffled copies of range(target_len) keep every sample equally represented.
            additional_indexes = list(range(target_len)) * batch
            random.shuffle(additional_indexes)

            original_indexes += additional_indexes

            assert len(original_indexes) >= max_len, "the length of matcing indexes is too small"

            return original_indexes

        self.map_indexes = [update_indices(m, len(d), self.max_length)
            for m, d in zip(self.map_indexes, self.datasets)]

    def __len__(self):
        """Return the longest dataset's length; also refreshes the index maps."""
        # will be called every epoch
        self.construct_map_index()
        return self.max_length
        
class SimpleTextDataset(Dataset):
    """Dataset over parallel text / label sequences with an optional per-sample transform."""

    def __init__(self, x, y, transform=None):
        """Keep the texts ``x``, labels ``y`` and the optional ``transform`` callable."""
        self.x, self.y, self.transform = x, y, transform

    def __getitem__(self, index):
        """Return ``transform({'text', 'label'})`` or, without a transform, ``(text, label)``."""
        # special dict convention for f: process_NLUHD
        sample = dict(text=self.x[index], label=self.y[index])
        if self.transform is None:
            return tuple(sample.values())
        return self.transform(sample)

    def __len__(self):
        """Number of samples, taken from ``x``."""
        return len(self.x)

class FixMatchAugmentedTextDataset(Dataset):
    """Dataset yielding a (weak, strong) pair of views for every sample.

    The weak view is ``fix_match_augmentation(x[i])`` (or ``x[i]`` unchanged when
    no augmentation is configured); the strong view is the precomputed
    paraphrase ``x_paraphrases[i]``. Both views carry the label ``y[i]``.
    """

    def __init__(self, x, x_paraphrases, y,
                 model_preprocessing = None,
                 fix_match_augmentation = None, show=False):
        """
        :param x: original texts
        :param x_paraphrases: strong-view texts, aligned with ``x``
        :param y: labels, aligned with ``x``
        :param model_preprocessing: optional callable applied to each ``{'text', 'label'}`` dict
        :param fix_match_augmentation: optional callable producing the weak view from a text
        :param show: print the original text and its paraphrase on every access
        """
        self.x = x
        self.x_paraphrases = x_paraphrases
        self.y = y
        self.model_preprocessing = model_preprocessing
        self.fix_match_augmentation = fix_match_augmentation
        self.show = show

    def __getitem__(self, index):
        """Return ``(weak_view, strong_view)`` for sample ``index``."""
        text = self.x[index]
        strong_augmented = self.x_paraphrases[index]

        # Without an augmentation the weak view falls back to the raw text
        # instead of leaving the variable undefined.
        if self.fix_match_augmentation is not None:
            weak_augmented = self.fix_match_augmentation(text)
        else:
            weak_augmented = text

        if self.show:
            def back_translation(_original):
                return strong_augmented
            # Show the original text next to its paraphrase.
            print_augmentations(back_translation, text)

        # special dict convention for f: process_NLUHD
        weak_aug_data_dict = {'text': weak_augmented, 'label': self.y[index]}
        strong_aug_data_dict = {'text': strong_augmented, 'label': self.y[index]}

        if self.model_preprocessing is not None:
            return (self.model_preprocessing(weak_aug_data_dict),
                    self.model_preprocessing(strong_aug_data_dict))

        return tuple(weak_aug_data_dict.values()), tuple(strong_aug_data_dict.values())

    def __len__(self):
        """Number of samples, taken from ``x``."""
        return len(self.x)

### Datasets related processing

In [8]:
def prepare_NLUHD(comment, nlp,  ner_abstract_tag: bool=True):
  """Replace every ``[tag : value]`` annotation in ``comment`` by its tag or its value.

  Args:
    comment: parsed document exposing ``.text`` and ``.char_span(start, end)``.
    nlp: callable turning a string into a document with a ``.text`` attribute.
    ner_abstract_tag: True keeps the tag (text between ``[`` and ``:``),
      False keeps the value (text between ``:`` and ``]``).

  Returns:
    ``nlp`` applied to the rewritten text. Bracketed spans that do not align
    with token boundaries, or that contain no ``:``, are left untouched.
  """
  bracket_pattern = r"\[.*?\]"
  inner_pattern = r"\[.*?\:" if ner_abstract_tag else r"\:.*?\]"
  text = comment.text

  replacements = []
  for match in re.finditer(bracket_pattern, text):
    start, end = match.span()
    # This is a Span object or None if match doesn't map to valid token sequence
    span = comment.char_span(start, end)
    if span is None:
      continue
    annotation = nlp(span.text).text
    inner = re.search(inner_pattern, annotation)
    if inner is None:
      # Brackets without a "tag : value" separator are not annotations.
      continue
    inner_start, inner_end = inner.span()
    # Drop the delimiter on each side ("[" / ":" or ":" / "]") and trim spaces.
    replacement = annotation[inner_start + 1:inner_end - 1].strip()
    replacements.append(((start, end), replacement))

  # Stitch the untouched text between annotations together with the replacements.
  pieces = []
  cursor = 0
  for (start, end), replacement in replacements:
    pieces.append(text[cursor:start])
    pieces.append(replacement)
    cursor = end
  pieces.append(text[cursor:])
  return nlp("".join(pieces))

def preprocess_NLUHD(lowercase,
                     remove_stopwords,
                     with_ner_tags,
                     nlp,
                     label_encoder,
                     sample):
  """Normalise one NLU-HD sample and encode its label.

  Args:
    lowercase: lower-case the text before parsing.
    remove_stopwords: when truthy, the output text is the space-joined lemmas
      with English stop words removed; otherwise the text is kept as is.
    with_ner_tags: True replaces ``[tag : value]`` annotations by the tag,
      anything else replaces them by the value.
    nlp: callable turning a string into a document (see prepare_NLUHD).
    label_encoder: object whose ``transform([label])`` returns the encoded label.
    sample: dict with ``'text'`` and ``'label'`` keys.

  Returns:
    ``{"text": str, "label": 0-dim integer tensor}``.
  """
  comment = sample['text']
  if lowercase:
      comment = comment.lower()
  comment = prepare_NLUHD(nlp(comment),
                          ner_abstract_tag=with_ner_tags is True,
                          nlp=nlp)

  if remove_stopwords:
    # Honour the flag: previously the filtered text was built and then
    # unconditionally overwritten by the unfiltered one.
    stops = set(stopwords.words("english"))
    lemmas = (word.lemma_.strip() for word in comment)
    processed_text = " ".join(lemma for lemma in lemmas
                              if lemma and lemma not in stops)
  else:
    processed_text = comment.text

  # transform() returns a 1-element array; index it rather than calling int()
  # on the array, which NumPy has deprecated.
  encoded_label = torch.tensor(int(label_encoder.transform([sample["label"]])[0]))
  return {"text": processed_text,
          "label": encoded_label}

def preprocess_IMDB(label_encoder, sample: Dict):
  """Strip ``<...>`` markup from an IMDB review, lower-case it and encode its label.

  Args:
    label_encoder: object whose ``transform([label])`` returns the encoded label.
    sample: dict with ``'text'`` and ``'label'`` keys.

  Returns:
    ``{"text": str, "label": 0-dim integer tensor}``.
  """
  text = re.sub(r'<.*?>', '', sample['text']).lower()
  # transform() returns a 1-element array; take the element explicitly because
  # int() on a non-scalar array is deprecated in NumPy.
  encoded = label_encoder.transform([sample["label"]])
  encoded_label = torch.tensor(int(encoded[0]))
  return {"text": text,
          "label": encoded_label}
          
def preprocess_MR(label_encoder, sample: Dict):
  """Strip ``<...>`` markup from an MR sample, lower-case it and encode its label.

  Args:
    label_encoder: object whose ``transform([label])`` returns the encoded label.
    sample: dict with ``'text'`` and ``'label'`` keys.

  Returns:
    ``{"text": str, "label": 0-dim integer tensor}``.
  """
  text = re.sub(r'<.*?>', '', sample['text']).lower()
  # transform() returns a 1-element array; take the element explicitly because
  # int() on a non-scalar array is deprecated in NumPy.
  encoded = label_encoder.transform([sample["label"]])
  encoded_label = torch.tensor(int(encoded[0]))
  return {"text": text,
          "label": encoded_label}

### Model related processing

In [9]:
def transformer_preprocessing(model_type: str,
                              MAX_LEN: int,
                              tokenizer: "BertTokenizer",
                              sample:Dict,) -> Dict:
    """Tokenize one sample and pad it to ``MAX_LEN`` for a transformer model.

    Args:
      model_type: model identifier; for ``'roberta-base'`` the segment ids are
        all zeros, otherwise they are taken from the tokenizer output.
      MAX_LEN: length every returned sequence is padded to.
      tokenizer: tokenizer providing ``encode_plus`` and ``pad_token_id``.
      sample: dict with ``'text'`` and ``'label'`` keys.

    Returns:
      ``{"label": sample['label'], "embedding": {"input_ids", "attention_mask",
      "token_type_ids"}}`` where the three values are 1-D tensors of length ``MAX_LEN``.

    Raises:
      AssertionError: if a sequence does not end up exactly ``MAX_LEN`` long
        (e.g. the tokenizer returned more than ``MAX_LEN`` ids).
    """
    # Output of `tokenizer.encode_plus` is a dictionary.
    inputs = tokenizer.encode_plus(
            sample["text"],
            add_special_tokens=True,
            max_length=MAX_LEN,
            )
    # input_ids are needed for every model type, not only for the BERT branch.
    input_ids = inputs["input_ids"]
    if model_type == 'roberta-base':
      # One (zero) segment id per token so the padded length matches MAX_LEN.
      token_type_ids = [0] * len(input_ids)
    else:
      token_type_ids = inputs["token_type_ids"]

    # Real tokens are attended to (1), padding is masked out (0).
    attention_mask = [1] * len(input_ids)
    padding_length = MAX_LEN - len(input_ids)
    pad_id = tokenizer.pad_token_id
    # Segment ids are padded with the tokenizer's padding segment id (0 when the
    # tokenizer does not define one), not with the vocabulary pad token id.
    pad_type_id = getattr(tokenizer, "pad_token_type_id", 0)
    input_ids = input_ids + ([pad_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([pad_type_id] * padding_length)

    assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
    assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
    assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

    # The returned dict is a single instance; batching happens downstream.
    return {
            "label": sample['label'],
            "embedding": {
                          "input_ids": torch.tensor(input_ids),
                          "attention_mask": torch.tensor(attention_mask),
                          "token_type_ids": torch.tensor(token_type_ids)
                         }
            }

def generate_embeddings(
                         hparams,
                         tokenizer,
                         embeder,
                         sample):
  """Embed a sample's text token by token and pad/truncate to a fixed length.

  Args:
    hparams: dict providing ``'max_sentence_len'`` and ``'embed_dim'``.
    tokenizer: unused; kept so existing ``partial(...)`` call sites keep working.
    embeder: callable mapping a string to an iterable of tokens with a ``.vector``.
    sample: dict with ``'text'`` and ``'label'`` keys.

  Returns:
    ``{'label': sample['label'], 'embedding': float32 tensor of shape
    (max_sentence_len, embed_dim)}``; short texts are zero-padded at the end.
  """
  max_len = hparams['max_sentence_len']
  vectors = [token.vector for token in embeder(sample["text"])]

  if vectors:
    # Stack in NumPy first: building a tensor straight from a list of arrays
    # converts element by element and is much slower.
    embedding = torch.from_numpy(np.asarray(vectors, dtype=np.float32))
  else:
    # Keep an explicit 2-D shape for empty text.
    embedding = torch.zeros((0, hparams['embed_dim']))

  if embedding.size(0) >= max_len:
    embedding = torch.narrow(embedding, 0, 0, max_len)
  else:
    padding_vectors = torch.zeros((max_len - embedding.size(0), hparams['embed_dim']))
    embedding = torch.cat((embedding, padding_vectors))

  return {'label': sample['label'],
          'embedding': embedding}

In [10]:
# datasets_path = str(Path()/'gdrive'/'My Drive'/'praca_magisterska'/'pytorch_lightning'/'datasets')
# NLU_HD_path = os.path.join(datasets_path,'NLU-Data-Home-Domain-Annotated-All.csv')
# print(NLU_HD_path)
# # df = pd.read_csv(str(NLU_HD_path), delimiter=';')[['intent', 'answer_annotation']]
# df = pd.read_csv(str(NLU_HD_path), delimiter=';')[['intent', 'answer_annotation', 'scenario']]
# df['intent'] = df[['scenario', 'intent']].agg('-'.join, axis=1) 
# del df['scenario']
# df = df[df['answer_annotation'].notna()]
# df = df.rename(columns={"answer_annotation": "text"})
# nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])

# df['text'] = df['text'].apply(normalize,
#                               lowercase=True,
#                               remove_stopwords=False,
#                               with_ner_tags=False,
#                               nlp=nlp) 
# # df.to_csv(os.path.join(datasets_path,'NLU-Data-Home-Domain-preprocessed-without-ner_no-scenario.csv'))
# df.to_csv(os.path.join(datasets_path,'NLU-Data-Home-Domain-preprocessed-without-ner.csv'))
# df['intent'].value_counts().plot(kind="bar", figsize= (21,20))

In [11]:
# pd.set_option('display.max_rows', None)
# df['intent'].value_counts()

In [12]:
# df.head(10)

In [13]:
# len(df['intent'].unique())

### Utils

In [14]:
def compute_global_metric(outputs, metric):
    """Average ``out[metric]`` over all entries of ``outputs``.

    ``outputs`` is a sequence of mappings; an empty sequence raises
    ``ZeroDivisionError``.
    """
    values = [entry[metric] for entry in outputs]
    total = sum(values)
    return total / len(outputs)

def set_seed(seed):
    """Seed the ``random``, NumPy and PyTorch (CPU and all CUDA devices) generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

def tfidf_tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    """Return every match of ``token_pattern`` in ``text``.

    The default pattern matches runs of two or more word characters.
    """
    return re.findall(token_pattern, text)

def create_tfidf_model(df: pd.DataFrame):  
  """Train an nlpaug TF-IDF model on ``df['text']`` and save it under ``NLPAUG_PATH``.

  Side effects: writes the model files into the ``NLPAUG_PATH`` directory and
  sets the ``TFIDF_MODEL_PATH`` environment variable to the
  ``tfidfaug_w2tfidf.txt`` file inside it. Returns ``None``.
  """
  tokenized_texts = [tfidf_tokenizer(text) for text in df['text'].values]

  model = nmw.TfIdf()
  model.train(tokenized_texts)

  output_dir = os.environ['NLPAUG_PATH']
  model.save(output_dir)
  os.environ['TFIDF_MODEL_PATH']  = os.path.join(output_dir, 'tfidfaug_w2tfidf.txt')
  # The listing itself is discarded; the call only fails if the directory is missing.
  os.listdir(output_dir)

class MetricsCallback(Callback):
    """PyTorch Lightning metric callback.

    Collects one entry per validation run in ``self.metrics``.
    """

    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        """Append a snapshot of ``trainer.callback_metrics``."""
        # Copy the mapping: storing the trainer's own dict would make every
        # history entry alias the same object if the trainer updates it in
        # place between validation runs.
        self.metrics.append(dict(trainer.callback_metrics))

# credits: https://github.com/galatolofederico/pytorch-balanced-batch/blob/master/sampler.py        
class BalancedBatchSampler(torch.utils.data.sampler.Sampler):
    """Sampler that yields indices class by class in round-robin order.

    Classes with fewer samples than the largest class are oversampled (random
    duplicates) at construction time, so every class contributes the same
    number of indices per pass.
    """

    def __init__(self, dataset, labels=None):
        """
        :param dataset: only its length is used
        :param labels: per-sample labels; each ``labels[idx]`` must support ``.item()``
        :raises Exception: if ``labels`` is None
        """
        self.labels = labels
        # label -> list of dataset indices carrying that label
        self.dataset = dict()
        # Save all the indices for all the classes
        for idx in range(0, len(dataset)):
            label = self._get_label(dataset, idx)
            self.dataset.setdefault(label, list()).append(idx)
        self.balanced_max = max(
            (len(indices) for indices in self.dataset.values()), default=0)

        # Oversample the classes with fewer elements than the max
        for label in self.dataset:
            while len(self.dataset[label]) < self.balanced_max:
                self.dataset[label].append(random.choice(self.dataset[label]))
        self.keys = list(self.dataset.keys())
        # Kept for backward compatibility; iteration no longer relies on them.
        self.currentkey = 0
        self.indices = [-1]*len(self.keys)

    def __iter__(self):
        """Yield one index per class in turn, ``balanced_max`` rounds in total."""
        # Iteration state is local, so an iterator that is abandoned half-way
        # cannot corrupt the next pass over the sampler.
        for position in range(self.balanced_max):
            for key in self.keys:
                yield self.dataset[key][position]

    def _get_label(self, dataset, idx, labels = None):
        """Return the label of sample ``idx`` as a Python scalar."""
        if self.labels is not None:
            return self.labels[idx].item()
        else:
            raise Exception("You should pass the tensor of labels to the constructor as second argument")

    def __len__(self):
        """Number of indices produced by one full pass."""
        return self.balanced_max*len(self.keys)

### Tests 

#### Check if lf.cv.split splits in stratified fashion

In [15]:
# ds = lf.CsvDataset(os.path.join(os.environ['DATASETS_PATH'], 'NLU-Data-Home-Domain-preprocessed-without-ner.csv'), header=True)
# ds = ds.filter(lambda x: x['text'] is not None)

# train, test = lf.cross_validation.split_dataset_random(ds, int(len(ds) * 0.8), seed=42)
# df_train = pd.DataFrame(train)
# df_test = pd.DataFrame(test)
# # df_train['intent'].value_counts().plot(kind="bar", figsize= (21,20))
# df_test['intent'].value_counts().plot(kind="bar", figsize= (21,20))

In [16]:
# pd.set_option('display.max_rows', None)
# df_train.info()

In [17]:
# nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
# uter = '[greetings : Hello] there what is [swear_word: fuck] up ?'
# # uter =  '[greetings : Hello] there what is fuck up ?'
# # uter = "hello adfafdasfsaf sfsafdsafsa"
# comment = nlp(uter)
# processed_comment = prepare_NLUHD(comment, nlp=nlp,ner_abstract_tag=False)
# processed_comment

In [18]:
# train = lfds.MsrParaphrase("train")
# print(len(train))
# test = lfds.MsrParaphrase("test")
# train.first()
# def nonefilter(dataset):
#   filtered = []
#   for x in dataset:
#       if x["string1"] is None:
#           continue
#       if x["string2"] is None:
#           continue
#       filtered.append(x)
#   return lf.Dataset(filtered)
# # train = nonefilter(train)
# train = train.filter(lambda x: x["string1"] is not None and x["string2"] is not None)
# print(len(train))
# train.take(3)
# unique = list(['ale', 'beka'])
# le = preprocessing.LabelEncoder().fit(unique)
# torch.tensor(int(le.transform(['ale'])))

#### Check IMDB preprocessing function

In [19]:
# hparams = {'max_sentence_len': 200,
#            'embed_dim': 300,
#            'seed': 42,
#            'train_test_split': 0.8}

# train = lfds.Imdb('train')
# test = lfds.Imdb('test')
# ds = train + test
# ds = ds.map(lambda x: {'text':x[0] , 'label': x[1]})
# embeder = nlp 
# tokenizer_fun = lambda x: x#gensim_tokenizer
# unique_labels = list(pd.DataFrame(ds).label.unique())
# le = preprocessing.LabelEncoder().fit(unique_labels)

# preprocessor = partial(
#                       preprocess_IMDB,
#                       hparams,
#                       tokenizer_fun, 
#                       embeder,
#                       le,
#                       )
# s = ds.first()
# print(s)
# preprocessor(s)



In [20]:

# train = lfds.Imdb('train')
# test = lfds.Imdb('test')
# ds = train + test
# ds = ds.map(lambda x: {'text':x[0] , 'label': x[1]})
# df = pd.DataFrame(ds)
# pattern1 = re.compile(r'<.*?>')
# # pattern2 = re.compile('[\W_]+')
# # text = pattern1.sub('', sample['text'])
# # print('text after p1: ', text)
# # text = text.replace('_', '').lower()
# func = partial(pattern1.sub,
#                '')
# df['text'] = df['text'].apply(func)
# # df
# texts = list(df['text'])
# texts

In [21]:
# ft.build_vocab(texts, update=True)
# ft.train(new_sentences, total_examples=len(texts), epochs=10)

#### Test FixMatchTransform

In [22]:
# Smoke test: apply a single randomly chosen weak augmentation to one sentence.
# show=True makes the chosen function, the input and the result get printed.
tf = TransformFix(1,show=True)
s = tf("what will be the weather like tomorrow, please tell me")

Augmentation function:  substitute_word_by_antonym
Original: 
what will be the weather like tomorrow, please tell me
Augmention result: 
what disinherit be the weather unlike tomorrow , please tell me


#### Test balanced sampler

In [23]:
# not_none = lambda x: x["text"] is not None 
# # ds = lf.CsvDataset(self.hparams['dataset_path'], header=True).filter(not_none)
# ds = lfds.Imdb('train') + lfds.Imdb('test')
# ds = ds.map(lambda x: {'text': x[0], 'label': x[1]})
# df = pd.DataFrame(ds)

# create_tfidf_model(df)

# print(df.info(memory_usage=True))
# unique_labels = list(df.label.unique())
# print(f'unique_labels: {unique_labels}')
# print(f'number_of_categories : {len(unique_labels)}')
# le = preprocessing.LabelEncoder().fit(unique_labels)
# train, test = lf.cross_validation.split_dataset_random(ds,
#                                                         int(len(ds) * 0.9),
#                                                         seed=42)
# dataset_preprocessor = partial(
#                                 preprocess_IMDB,
#                                 le,
#                               )

# tokenizer_dict = {
#         "bert-base-uncased":
#           BertTokenizer.from_pretrained("bert-base-uncased",
#                                         do_lower_case=True),
#         "roberta-base":
#           RobertaTokenizer.from_pretrained("roberta-base")
#         }

# model_arch_preprocessor = partial(
#                                   transformer_preprocessing,
#                                   'bert-base-uncased',
#                                   156,
#                                   tokenizer_dict['bert-base-uncased'],
#                                   )


# preprocessor = toolz.compose(
#                             model_arch_preprocessor,
#                             dataset_preprocessor,
#                             )

# train_df, test_df = pd.DataFrame(train), pd.DataFrame(test)
# x_train, y_train = train_df['text'].values, train_df['label'].values
# x_test, y_test = test_df['text'].values, test_df['label'].values

# # split's training parameters  
# num_classes = len(unique_labels)
# label_per_class = 1000 // num_classes
# valid_size = 1000 

# labeled_idx = []
# unlabeled_idx = []
# val_idx = []

# for label in unique_labels:
#   idx = np.where(y_train == label)[0]
#   np.random.shuffle(idx)
#   labeled_idx.extend(idx[:label_per_class])
#   val_idx.extend(idx[label_per_class: label_per_class + valid_size])
#   unlabeled_idx.extend(idx[label_per_class + valid_size:])

# x_labeled, y_labeled  = x_train[labeled_idx], y_train[labeled_idx]
# x_unlabeled, y_unlabeled = x_train[unlabeled_idx], y_train[unlabeled_idx]
# x_val, y_val = x_train[val_idx], y_train[val_idx]


# train_labeled_dataset = SimpleTextDataset(x_labeled,
#                                               y_labeled,
#                                               transform=preprocessor)
# train_unlabeled_dataset = SimpleTextDataset(x_unlabeled,
#                                                   y_unlabeled,
#                                                   transform=preprocessor)

# val_dataset = SimpleTextDataset(x_val,
#                                     y_val,
#                                     transform=preprocessor)

# test_dataset = SimpleTextDataset(x_test,
#                                       y_test,
#                                       transform=preprocessor)



# train_labeled_dataloader = torch.utils.data.DataLoader(
#                       train_labeled_dataset,
#                       batch_size=64,
#                       # shuffle=True,
#                       num_workers=0,
#                       sampler=BalancedBatchSampler(train_labeled_dataset, y_labeled),
#                       )

# train_labeled_dataloader_iterator = iter(train_labeled_dataloader)
# train_unlabeled_dataloader = DataLoader(
#                 train_unlabeled_dataset,
#                 batch_size=64,
#                 num_workers=8,
#                 shuffle=True # without shuffle it want work cause it need to create map index before __get_item__ function
#                 )

# train_labeled_dataloader_iterator = iter(train_labeled_dataloader)
# b = next(train_labeled_dataloader_iterator)
# torch.sum(b['label'])

In [24]:
# Toy check of BalancedBatchSampler: 20 random samples whose labels are drawn
# from a skewed 3-class distribution, iterated for a few epochs so the label
# composition of every batch can be inspected in the printed output.
epochs, size, features = 3, 20, 5
classes_prob = torch.tensor([0.1, 0.4, 0.5])

dataset_X = torch.randn(size, features)
# One categorical draw per sample; every row carries the same probabilities.
label_distribution = torch.distributions.categorical.Categorical(
    classes_prob.repeat(size, 1)
)
dataset_Y = label_distribution.sample()
print(dataset_Y)

dataset = torch.utils.data.TensorDataset(dataset_X, dataset_Y)
balanced_sampler = BalancedBatchSampler(dataset, dataset_Y)
train_loader = torch.utils.data.DataLoader(
    dataset,
    sampler=balanced_sampler,
    batch_size=6,
)

for epoch in range(epochs):
    for batch_x, batch_y in train_loader:
        print("epoch: %d labels: %s\ninputs: %s\n" % (epoch, batch_y, batch_x))

tensor([2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0, 1, 2, 1, 2, 2, 1, 1, 2, 2])
epoch: 0 labels: tensor([2, 1, 0, 2, 1, 0])
inputs: tensor([[ 0.9628, -0.1913,  0.0045, -1.3410,  0.0190],
        [ 0.4937, -0.3173, -0.6087, -1.0980,  0.0985],
        [ 1.0681, -1.3758,  1.6920,  1.7767,  0.5419],
        [ 0.4730,  1.5880, -1.1072,  0.3187,  2.3598],
        [ 1.4137,  1.5612,  1.4698, -0.0703,  1.2697],
        [ 1.0681, -1.3758,  1.6920,  1.7767,  0.5419]])

epoch: 0 labels: tensor([2, 1, 0, 2, 1, 0])
inputs: tensor([[-0.2268,  0.8844, -0.5976,  0.4165,  0.2378],
        [-0.0781,  0.7962, -2.2511, -0.2486, -0.5923],
        [ 1.0681, -1.3758,  1.6920,  1.7767,  0.5419],
        [ 0.2992,  0.6629,  1.1344, -0.4396,  1.6329],
        [ 0.1519,  2.5613, -0.6568, -0.1965, -0.4150],
        [ 1.0681, -1.3758,  1.6920,  1.7767,  0.5419]])

epoch: 0 labels: tensor([2, 1, 0, 2, 1, 0])
inputs: tensor([[-0.1185,  0.5219,  0.9298, -0.8324,  1.2911],
        [ 0.6324,  1.6372, -0.2934, -0.6020,  0.6133],
 

#### Test embedder if it exists

In [25]:
# tokens = nlp('213213dsf ma kota')
# tokenlist = [token.vector for token in tokens]
# t = torch.Tensor(tokenlist)
# t

### Composable Framework 

In [26]:
class LitComposableFramework(pl.LightningModule):

  def __init__(self, hparams):
    """Build the loss and the network selected by ``hparams['model_arch']``.

    Args:
      hparams: mapping with the experiment configuration. Read here:
        ``num_classes``, ``loss_function`` (attribute name looked up on
        ``torch.nn``) and ``model_arch``; for ``"Convolution"`` additionally
        ``embeder_type``, ``embed_dim``, ``Ci``, ``kernel_num``,
        ``kernel_sizes`` (comma-separated string such as ``'3,4,5'``) and
        ``dropout``; for ``"Transformer"`` additionally ``model_type``.

    Raises:
      ValueError: if ``model_arch`` is neither ``"Convolution"`` nor
        ``"Transformer"``.
    """
    super().__init__()
    self.hparams = hparams
    self.num_classes = hparams['num_classes']
    self.total_iterations = 0  # replaced by prepare_data()
    # Instantiate the loss by class name, e.g. nn.CrossEntropyLoss().
    self.loss_fct = getattr(nn, hparams['loss_function'])()

    if self.hparams['model_arch'] == "Convolution":

      self.embeder_dict = {
                          'fastText': (create_ft_embeder, gensim_tokenizer),
                          'spaCy': (create_spacy_nlp_embeder, lambda x: x)
                          }
      # Only the tokenizer of the selected entry is used: the embedder itself
      # is hardcoded to the notebook-level `nlp` object.
      _, self.tokenizer_fun = self.embeder_dict[hparams['embeder_type']]
      self.embeder = nlp
      self.D = hparams['embed_dim']
      self.Ci = hparams['Ci']
      self.Co = hparams['kernel_num']
      self.Ks = list(map(int, hparams['kernel_sizes'].split(',')))  # e.g. [3, 4, 5]
      # One convolution per kernel height; each kernel spans the full embedding width D.
      self.convs1 = nn.ModuleList([nn.Conv2d(self.Ci, self.Co, (K, self.D)) for K in self.Ks])
      self.dropout = nn.Dropout(hparams['dropout'])
      self.fc1 = nn.Linear(len(self.Ks) * self.Co, self.num_classes)

    elif self.hparams['model_arch'] == "Transformer":
      self.model_class_dict = {
            "bert-base-uncased": BertModel,
            "roberta-base": RobertaModel
            }

      self.tokenizer_dict = {
              "bert-base-uncased":
                BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=True),
              "roberta-base":
                RobertaTokenizer.from_pretrained("roberta-base")
              }

      model_type = self.hparams['model_type']
      self.model = self.model_class_dict[model_type].from_pretrained(model_type,
                                                                     output_attentions=True)
      self.encoder_features = self.model.config.hidden_size
      print(self.encoder_features)
      # MLP head applied in forward() to the first-token representation.
      self.classification_head = nn.Sequential(
            nn.Linear(self.encoder_features, self.encoder_features * 2),
            nn.Tanh(),
            nn.Linear(self.encoder_features * 2, self.encoder_features),
            nn.Tanh(),
            nn.Linear(self.encoder_features, self.num_classes),
        )

    else:
      raise ValueError('Wrong model architecture type: {} \n Possible architectures: Transformer, Convolution'.format(self.hparams['model_arch']))

  def forward(self, x, embeddings_only=False):
    """Compute class logits, or only the representation fed to the classifier.

    Args:
      x: for ``"Convolution"`` a tensor of embedded sentences
        (assumed ``(N, W, D)`` - TODO confirm against generate_embeddings);
        for ``"Transformer"`` a mapping with ``input_ids``,
        ``attention_mask`` and, unless ``model_type`` is ``"roberta-base"``,
        ``token_type_ids``.
      embeddings_only: when true, return the representation instead of
        logits: ``x`` itself for ``"Convolution"``, the first-token hidden
        state for ``"Transformer"``.

    Returns:
      Logits tensor, or the representation described above.

    Raises:
      ValueError: on an unknown ``hparams['model_arch']``.
    """
    arch = self.hparams['model_arch']

    if arch == "Convolution":
      if embeddings_only:
        # The input already is the embedding for this architecture.
        return x
      # Insert the channel axis at position 1 -> (N, Ci, W, D). The original
      # used `self.Ci` as the axis index, which is only right when Ci == 1.
      x = x.unsqueeze(1)
      x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
      x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
      x = torch.cat(x, 1)
      x = self.dropout(x)  # (N, len(Ks)*Co)
      return self.fc1(x)  # (N, C)

    if arch == "Transformer":
      token_type_ids = x['token_type_ids'] if self.hparams['model_type'] != "roberta-base" else None
      outputs = self.model(x['input_ids'],
                           attention_mask=x['attention_mask'],
                           token_type_ids=token_type_ids)
      # Index the first output (last hidden states) instead of unpacking a
      # fixed 3-tuple, so the number of extra outputs does not matter.
      h = outputs[0]
      h_cls = h[:, 0]  # hidden state of the first token of every sequence

      if embeddings_only:
        return h_cls
      return self.classification_head(h_cls)

    raise ValueError('Wrong model architecture type: {} \n Possible architectures: Transformer, Convolution'.format(arch))


  def prepare_data(self):
    """Load the CSV dataset, split it and build the train/val/test datasets.

    Steps:
      1. Read ``hparams['dataset_path']`` with lineflow, dropping rows whose
         ``text`` is None, and fit ``self.le`` on the dataset's class names.
      2. Randomly split into train/test (``train_test_split``, ``seed``).
      3. Per class, shuffle the train indices and cut them into labeled
         (``n_labeled // num_classes``), validation
         (``valid_size_per_class``) and unlabeled (the rest) parts.
      4. Wrap every part in a dataset whose transform runs the
         dataset-specific preprocessor first and the architecture-specific
         preprocessor second.

    Sets ``self.le``, ``self.x_*`` / ``self.y_*``,
    ``self._train_labeled_dataset``, ``self._train_unlabeled_dataset``,
    ``self._val_dataset``, ``self._test_dataset`` and
    ``self.total_iterations``.

    Raises:
      ValueError: on an unknown ``hparams['dataset']`` or
        ``hparams['model_arch']``.
    """
    dataset_name = self.hparams['dataset']
    if dataset_name not in ('NLUHD', 'MR', 'IMDB'):
      raise ValueError('Wrong dataset name : {} \n Possible datasets: IMDB, MR, NLUHD'.format(dataset_name))

    # Loading is identical for every dataset.
    not_none = lambda x: x["text"] is not None
    ds = lf.CsvDataset(self.hparams['dataset_path'], header=True).filter(not_none)
    df = pd.DataFrame(ds)

    # NOTE(review): NLUHD/MR take class names from the `intent` column while
    # the targets further down are read from the `label` column - confirm
    # that both columns exist in those CSV files.
    if dataset_name == 'NLUHD':
      unique_labels = list(df.intent.unique())
      self.le = preprocessing.LabelEncoder().fit(unique_labels)
      print(f"Unique labels: {unique_labels}")
      print(f"Number of unique labels: {len(unique_labels)}")

      nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])

      # Fixed: the encoder was passed as the undefined name `le`.
      dataset_preprocessor = partial(preprocess_NLUHD,
                                     lowercase=True,
                                     remove_stopwords=True,
                                     with_ner_tags=False,
                                     nlp=nlp,
                                     label_encoder=self.le,
                                     )

    elif dataset_name == 'MR':
      if self.hparams['training_method'] == "FixMatch":
        create_tfidf_model(df)

      unique_labels = list(df.intent.unique())
      self.le = preprocessing.LabelEncoder().fit(unique_labels)
      print(f"Unique labels: {unique_labels}")
      print(f"Number of unique labels: {len(unique_labels)}")

      # Fixed: the encoder was passed as the undefined name `le`.
      dataset_preprocessor = partial(preprocess_MR, self.le)

    else:  # 'IMDB'
      if self.hparams['training_method'] == "FixMatch":
        create_tfidf_model(df)

      print(df.info(memory_usage=True))
      unique_labels = list(df.label.unique())
      print(f'unique_labels: {unique_labels}')
      print(f'number_of_categories : {len(unique_labels)}')
      self.le = preprocessing.LabelEncoder().fit(unique_labels)

      dataset_preprocessor = partial(preprocess_IMDB, self.le)

    train, test = lf.cross_validation.split_dataset_random(ds,
                                                           int(len(ds) * self.hparams['train_test_split']),
                                                           seed=self.hparams['seed'])

    if self.hparams['model_arch'] == 'Transformer':

      model_arch_preprocessor = partial(
                                        transformer_preprocessing,
                                        self.hparams['model_type'],
                                        self.hparams['max_sentence_len'],
                                        self.tokenizer_dict[self.hparams['model_type']],
                                       )

    elif self.hparams['model_arch'] == 'Convolution':

      model_arch_preprocessor = partial(
                                        generate_embeddings,
                                        self.hparams,
                                        self.tokenizer_fun,
                                        self.embeder,
                                       )

    else:
      raise ValueError('Wrong model architecture type: {} \n Possible architectures: Convolution, Transformer'.format(self.hparams['model_arch']))

    # toolz.compose applies right-to-left: dataset preprocessing runs first.
    preprocessor = toolz.compose(
                                 model_arch_preprocessor,
                                 dataset_preprocessor,
                                 )

    train_df, test_df = pd.DataFrame(train), pd.DataFrame(test)
    x_train, y_train = train_df['text'].values, train_df['label'].values
    self.x_test, self.y_test = test_df['text'].values, test_df['label'].values

    # Parameters of the labeled / validation / unlabeled split.
    num_classes = len(unique_labels)
    label_per_class = self.hparams['n_labeled'] // num_classes
    valid_size = self.hparams['valid_size_per_class']

    labeled_idx = []
    unlabeled_idx = []
    val_idx = []

    for label in unique_labels:
        # Shuffle the positions of this class, then slice them into the three parts.
        idx = np.where(y_train == label)[0]
        np.random.shuffle(idx)
        labeled_idx.extend(idx[:label_per_class])
        val_idx.extend(idx[label_per_class: label_per_class + valid_size])
        unlabeled_idx.extend(idx[label_per_class + valid_size:])

    self.x_labeled, self.y_labeled = x_train[labeled_idx], y_train[labeled_idx]
    self.x_unlabeled, self.y_unlabeled = x_train[unlabeled_idx], y_train[unlabeled_idx]
    self.x_val, self.y_val = x_train[val_idx], y_train[val_idx]

    self._train_labeled_dataset = SimpleTextDataset(self.x_labeled,
                                                    self.y_labeled,
                                                    transform=preprocessor)
    if self.hparams['training_method'] == 'FixMatch':
      # FixMatch additionally needs the `paraphrases` column of the unlabeled rows.
      x_unlabeled_paraphrases = train_df['paraphrases'].values
      self._train_unlabeled_dataset = \
         FixMatchAugmentedTextDataset(self.x_unlabeled,
                                      x_unlabeled_paraphrases[unlabeled_idx],
                                      self.y_unlabeled,
                                      model_preprocessing=preprocessor,
                                      show=self.hparams['show_augmentation'],
                                      fix_match_augmentation=TransformFix(
                                        n_weak=self.hparams['n_weak'],
                                        show=self.hparams['show_augmentation']),
                                     )
    else:
      self._train_unlabeled_dataset = SimpleTextDataset(self.x_unlabeled,
                                                        self.y_unlabeled,
                                                        transform=preprocessor)

    self._val_dataset = SimpleTextDataset(self.x_val,
                                          self.y_val,
                                          transform=preprocessor)

    self._test_dataset = SimpleTextDataset(self.x_test,
                                           self.y_test,
                                           transform=preprocessor)

    # Number of unlabeled batches in ONE pass over the unlabeled dataset.
    # NOTE(review): configure_optimizers() passes this to the warmup
    # scheduler as the training-step count - confirm it should not be
    # multiplied by the number of epochs.
    self.total_iterations = len(self._train_unlabeled_dataset) // self.hparams['unl_batch_size']


  def train_dataloader(self):
    """Return the unlabeled training loader and prepare the labeled one.

    The labeled loader (class-balanced through ``BalancedBatchSampler``) is
    stored on ``self._train_labeled_dataloader`` together with a fresh
    iterator in ``self.train_labeled_dataloader_iterator``; only the
    unlabeled loader is returned.
    """
    # The sampler receives the integer-encoded labels of the labeled split.
    sampler_labels = torch.tensor(self.le.transform(self.y_labeled))
    balanced_sampler = BalancedBatchSampler(self._train_labeled_dataset,
                                            sampler_labels)
    labeled_loader = torch.utils.data.DataLoader(
        self._train_labeled_dataset,
        batch_size=self.hparams['l_batch_size'],
        num_workers=0,
        sampler=balanced_sampler,
    )
    self._train_labeled_dataloader = labeled_loader
    self.train_labeled_dataloader_iterator = iter(labeled_loader)

    # Author's note: without shuffle=True this loader does not work, because
    # an index map has to be created before __getitem__ is called.
    unlabeled_loader = DataLoader(
        self._train_unlabeled_dataset,
        batch_size=self.hparams['unl_batch_size'],
        num_workers=8,
        shuffle=True,
    )
    return unlabeled_loader
    
  
  def val_dataloader(self):
    """Return the loader over the validation dataset built in prepare_data()."""
    loader_options = dict(batch_size=self.hparams['val_batch_size'],
                          num_workers=8)
    return DataLoader(self._val_dataset, **loader_options)
    
  
  def test_dataloader(self):
    """Return the loader over the test dataset built in prepare_data()."""
    loader_options = dict(batch_size=self.hparams['test_batch_size'],
                          num_workers=8)
    return DataLoader(self._test_dataset, **loader_options)
    
  
  def configure_optimizers(self):
    """Create the optimizer and, when configured, the LR scheduler.

    ``"Transformer"``: AdamW over all parameters of this module (encoder and
    classification head), with weight decay disabled for biases and
    LayerNorm weights, plus a linear warmup/decay schedule.
    ``"Convolution"``: the optimizer named by ``hparams['optimizer_type']``
    and the scheduler named by ``hparams['sheduler_type']`` (``'None'`` for
    no scheduler).

    Returns:
      ``[optimizer]`` when no scheduler is configured, otherwise
      ``([optimizer], [scheduler])``.

    Raises:
      ValueError: on an unknown ``hparams['model_arch']``.
    """
    arch = self.hparams['model_arch']

    if arch == 'Transformer':
      # Fixed: use self.named_parameters(), not self.model.named_parameters(),
      # so the classification head is optimised as well.
      named_params = list(self.named_parameters())
      no_decay = ("bias", "LayerNorm.weight")
      decay_params = [p for n, p in named_params if not any(nd in n for nd in no_decay)]
      no_decay_params = [p for n, p in named_params if any(nd in n for nd in no_decay)]
      # Fixed: the per-group key read by the optimizer is "weight_decay";
      # the former "weight_decay_rate" key was silently ignored.
      optimizer_grouped_parameters = [
          {"params": decay_params,
           "weight_decay": self.hparams.get('weight_decay', 0.01)},
          {"params": no_decay_params,
           "weight_decay": 0.0},
      ]
      print('total_iterations: ', self.total_iterations)
      optimizer = AdamW(
                        optimizer_grouped_parameters,
                        lr=self.hparams['lr'],
                      )
      # NOTE(review): total_iterations is the number of unlabeled batches in
      # one epoch (see prepare_data) - confirm it matches how often Lightning
      # steps this scheduler.
      scheduler = get_linear_schedule_with_warmup(optimizer,
                                                  self.hparams['warmup_steps'],
                                                  self.total_iterations,
                                                  -1)

    elif arch == 'Convolution':
      # Factories instead of instances: only the selected optimizer and
      # scheduler are constructed, so unused schedulers never touch the optimizer.
      optimizer_factories = {
          'Adam': lambda: torch.optim.Adam(self.parameters(),
                                           lr=self.hparams['lr']),
      }
      optimizer = optimizer_factories[self.hparams['optimizer_type']]()

      scheduler_factories = {
          'ExponentialLR': lambda: torch.optim.lr_scheduler.ExponentialLR(
              optimizer=optimizer,
              gamma=0.9),
          'StepLR': lambda: torch.optim.lr_scheduler.StepLR(
              optimizer,
              self.hparams['decay_step_size'],
              self.hparams['decay_gamma']),
          'None': lambda: None,
      }
      scheduler = scheduler_factories[self.hparams['sheduler_type']]()
      if scheduler is None:
        return [optimizer]

    else:
      raise ValueError('Wrong model architecture type: {} \n Possible architectures: Convolution, Transformer'.format(arch))

    return [optimizer], [scheduler]

  def supervised(self, texts, labels, logs):
    """Plain supervised step: forward the labeled batch and apply the loss.

    Returns:
      ``(logits, loss, logs)``; ``logs`` is passed through unchanged.
    """
    predictions = self.forward(texts)
    return predictions, self.loss_fct(predictions, labels), logs

  def vat(self, l_texts, labels, unl_texts, logs):
    """VAT step: supervised loss plus ``alpha`` times the VAT loss term.

    Args:
      l_texts: labeled inputs for ``forward``.
      labels: targets of the labeled inputs.
      unl_texts: unlabeled inputs; only their ``embeddings_only``
        representation is handed to ``VATLoss``.
      logs: dict that receives ``supervised_loss`` and ``lds_loss``.

    Returns:
      ``(logits of the labeled batch, combined loss, logs)``.
    """
    adversarial_criterion = VATLoss(xi=self.hparams['xi'],
                                    eps=self.hparams['eps'],
                                    ip=self.hparams['ip'])

    # Supervised part on the labeled batch.
    logits = self.forward(l_texts)
    supervised_loss = self.loss_fct(logits, labels)
    logs['supervised_loss'] = supervised_loss

    # Unsupervised part, computed from the unlabeled representations.
    lds = adversarial_criterion(self, self.forward(unl_texts, embeddings_only=True))
    logs['lds_loss'] = lds

    return logits, supervised_loss + self.hparams['alpha'] * lds, logs

  def fixmatch(self, l_embeddings, labels, unl_embeddings, logs):
    """FixMatch step on one labeled batch and one unlabeled batch.

    The labeled inputs and both augmented views of the unlabeled inputs go
    through the model in one forward pass. Predictions on the first
    unlabeled view give pseudo-labels; the second view is trained towards
    them wherever the pseudo-label confidence reaches
    ``hparams['threshold']``.

    Args:
      l_embeddings: labeled model inputs (a tensor, or for ``"Transformer"``
        a mapping of tensors).
      labels: targets of the labeled inputs.
      unl_embeddings: pair of dicts, each holding its model input under
        ``'embedding'`` (presumably weak view first, strong view second -
        verify against FixMatchAugmentedTextDataset).
      logs: dict that receives confidence statistics, ``Lu`` and ``Lx``.

    Returns:
      ``(logits of the labeled batch, Lx + lambda_u * Lu, logs)``.
    """
    unl_w_dict, unl_s_dict = unl_embeddings
    unl_w, unl_s = unl_w_dict['embedding'], unl_s_dict['embedding']

    # Fixed: the architecture is spelled "Transformer" everywhere else; the
    # lowercase comparison never matched and sent dicts into torch.cat.
    if self.hparams['model_arch'] == 'Transformer':
      # Fixed: forward() reads "input_ids" (the original built "inputs_ids").
      # Keys missing from the inputs are skipped, since forward() does not
      # read token_type_ids for roberta-base.
      x = {
          key: torch.cat((l_embeddings[key], unl_w[key], unl_s[key]))
          for key in ("input_ids", "attention_mask", "token_type_ids")
          if key in l_embeddings
      }
    else:
      x = torch.cat((l_embeddings,
                     unl_w,
                     unl_s))

    logits = self.forward(x)
    # Split by the real labeled batch size, not the configured one.
    n_labeled = labels.size(0)
    logits_x = logits[:n_labeled]
    logits_u_w, logits_u_s = logits[n_labeled:].chunk(2)
    del logits

    Lx = F.cross_entropy(logits_x, labels, reduction='mean')

    # detach() instead of in-place detach_(): logits_u_w is a view of the
    # stacked logits, and PyTorch does not allow detaching views in place.
    pseudo_label = torch.softmax(logits_u_w.detach(), dim=-1)
    max_probs, targets_u = torch.max(pseudo_label, dim=-1)
    print('unl_max_probs', max_probs)
    print('unl_max_max_probs', torch.max(max_probs))

    logs.update({'unl_max_confident': torch.max(max_probs),
                 'unl_min_confident': torch.min(max_probs),
                 'unl_mean_confident': torch.mean(max_probs),
                 'unl_std_confident': torch.std(max_probs),
                 })
    # 1.0 where the pseudo-label is confident enough, 0.0 elsewhere.
    mask = max_probs.ge(self.hparams['threshold']).float()
    Lu = (F.cross_entropy(logits_u_s, targets_u,
                          reduction='none') * mask).mean()

    loss = Lx + self.hparams['lambda_u'] * Lu

    logs.update({"Lu": Lu})
    logs.update({"Lx": Lx})

    return logits_x, loss, logs


  def training_step(self, batch, batch_idx):
    """Run one optimisation step of the configured training method.

    ``batch`` comes from the unlabeled loader returned by
    ``train_dataloader``. The labeled batch is pulled here from
    ``self.train_labeled_dataloader_iterator``, which is restarted whenever
    it is exhausted.

    Returns:
      dict with ``loss`` and ``log`` (losses, confidence statistics and
      error rates computed on the labeled batch).

    Raises:
      ValueError: on an unknown ``hparams['training_method']``.
    """
    try:
      labeled = next(self.train_labeled_dataloader_iterator)
    except StopIteration:
      # The labeled loader ran out: start a new pass over it.
      self.train_labeled_dataloader_iterator = iter(self._train_labeled_dataloader)
      labeled = next(self.train_labeled_dataloader_iterator)

    l_texts, labels = labeled['embedding'], labeled['label']

    # The labeled batch does not come from the loader Lightning manages, so
    # it is moved here to the device that holds the model parameters.
    device = next(self.parameters()).device
    # Fixed: compare against "Transformer"; the lowercase spelling never
    # matched, so dict inputs reached the tensor-only branch.
    if self.hparams['model_arch'] == 'Transformer':
      gpu_l_texts = toolz.dicttoolz.valmap(lambda tensor: tensor.to(device), l_texts)
    else:
      gpu_l_texts = l_texts.to(device)

    gpu_labels = labels.to(device)
    logs = dict()

    method = self.hparams['training_method']
    if method == 'VAT':
      unl_embeddings = batch['embedding']  # labels of the unlabeled batch are not needed
      logits, loss, logs = self.vat(gpu_l_texts, gpu_labels, unl_embeddings, logs)
    elif method == "FixMatch":
      logits, loss, logs = self.fixmatch(gpu_l_texts, gpu_labels, batch, logs)
    elif method == "Supervised":
      logits, loss, logs = self.supervised(gpu_l_texts, gpu_labels, logs)
    else:
      # Fixed: report the offending training method, not the architecture.
      raise ValueError('Wrong training method type: {} \n Possible methods : VAT, FixMatch, Supervised'.format(method))

    # detach() instead of in-place detach_(): the FixMatch logits are a slice
    # view, and PyTorch does not allow detaching views in place.
    probabilities = torch.softmax(logits.detach(), dim=-1)
    max_probs, labels_hat = torch.max(probabilities, dim=-1)

    print('l_max_probs: ', max_probs)
    print('l_max_max_probs: ', torch.max(max_probs))
    print('labels_hat: ', labels_hat)

    logs.update({'l_max_confident': torch.max(max_probs),
                 'l_min_confident': torch.min(max_probs),
                 'l_mean_confident': torch.mean(max_probs),
                 'l_std_confident': torch.std(max_probs),
                 })

    # scikit-learn metrics work on CPU data.
    labels = gpu_labels.cpu()
    labels_hat = labels_hat.cpu()

    accuracy_error = 1 - accuracy_score(labels, labels_hat)
    f1_error = 1 - f1_score(labels, labels_hat, average='micro')
    recall_error = 1 - recall_score(labels, labels_hat, average='micro')
    precision_error = 1 - precision_score(labels, labels_hat, average='micro')

    logs.update({'train_loss': loss,
            'train_accuracy_error': accuracy_error,
            'train_f1_error': f1_error,
            'train_recall_error': recall_error,
            'train_precision_error': precision_error,
           })

    return {'loss': loss,
            'log': logs}


  def validation_step(self, batch, batch_idx):
    """Evaluate one validation batch.

    Returns:
      dict with ``val_loss`` and the micro-averaged error rates
      (1 - score) ``accuracy_error``, ``f1_error``, ``recall_error`` and
      ``precision_error``.
    """
    inputs, targets = batch['embedding'], batch['label']

    logits = self.forward(inputs)
    output = {"val_loss": self.loss_fct(logits, targets)}

    # scikit-learn metrics work on CPU data.
    y_pred = torch.argmax(logits, dim=1).cpu()
    y_true = targets.cpu()

    output['accuracy_error'] = 1 - accuracy_score(y_true, y_pred)
    averaged_scorers = (
        ('f1_error', f1_score),
        ('recall_error', recall_score),
        ('precision_error', precision_score),
    )
    for key, scorer in averaged_scorers:
      output[key] = 1 - scorer(y_true, y_pred, average='micro')

    return output


  def validation_epoch_end(self, outputs):
    """Aggregate the per-batch validation outputs with compute_global_metric.

    Returns:
      dict with the aggregated metrics under ``progress_bar`` and ``log``
      (the same dict object), plus ``val_loss``, ``val_acc_error`` and
      ``val_f1_error`` at the top level.
    """
    # (reported name, key in the validation_step output)
    metric_sources = (
        ("val_acc_error", 'accuracy_error'),
        ("val_f1_error", 'f1_error'),
        ("val_recall_error", 'recall_error'),
        ("val_precision_error", 'precision_error'),
        ("val_loss", "val_loss"),
    )
    aggregated = {
        reported: compute_global_metric(outputs, step_key)
        for reported, step_key in metric_sources
    }

    display_order = ("val_loss", "val_acc_error", "val_f1_error",
                     "val_recall_error", "val_precision_error")
    tqdm_dict = {name: aggregated[name] for name in display_order}

    return {
            "progress_bar": tqdm_dict,
            "log": tqdm_dict,
            "val_loss": aggregated["val_loss"],
            'val_acc_error': aggregated["val_acc_error"],
            'val_f1_error': aggregated["val_f1_error"],
           }


  def test_step(self, batch, batch_idx):
    """Evaluate one test batch.

    Returns:
      dict with ``test_loss`` and the micro-averaged error rates
      (1 - score) ``accuracy_error``, ``f1_error``, ``recall_error`` and
      ``precision_error``.
    """
    inputs, targets = batch["embedding"], batch["label"]

    logits = self.forward(inputs)
    result = {"test_loss": self.loss_fct(logits, targets)}

    # scikit-learn metrics work on CPU data.
    y_pred = torch.argmax(logits, dim=1).cpu()
    y_true = targets.cpu()

    result['accuracy_error'] = 1 - accuracy_score(y_true, y_pred)
    result['f1_error'] = 1 - f1_score(y_true, y_pred, average='micro')
    result['recall_error'] = 1 - recall_score(y_true, y_pred, average='micro')
    result['precision_error'] = 1 - precision_score(y_true, y_pred, average='micro')
    return result


  def test_epoch_end(self, outputs):
    """Aggregate the per-batch test outputs with compute_global_metric.

    Returns:
      dict with the aggregated metrics under ``progress_bar`` and ``log``
      (the same dict object), plus ``test_loss``, ``test_acc_error`` and
      ``test_f1_error`` at the top level.
    """
    # (reported name, key in the test_step output)
    metric_sources = (
        ("test_acc_error", 'accuracy_error'),
        ("test_f1_error", 'f1_error'),
        ("test_recall_error", 'recall_error'),
        ("test_precision_error", 'precision_error'),
        ("test_loss", "test_loss"),
    )
    aggregated = {
        reported: compute_global_metric(outputs, step_key)
        for reported, step_key in metric_sources
    }

    display_order = ("test_loss", "test_acc_error", "test_f1_error",
                     "test_recall_error", "test_precision_error")
    tqdm_dict = {name: aggregated[name] for name in display_order}

    return {
            "progress_bar": tqdm_dict,
            "log": tqdm_dict,
            "test_loss": aggregated["test_loss"],
            'test_acc_error': aggregated["test_acc_error"],
            'test_f1_error': aggregated["test_f1_error"],
           }

### Configure experiment


##### Choose dataset

In [27]:
# Name of the dataset to run on; must be a key of `_dataset_settings`.
dataset = 'IMDB'

# dataset name -> (CSV file inside the shared datasets folder, number of classes)
_dataset_settings = {
    'IMDB': ('imdb_with_bt.csv', 2),
    'MR': ('mr_with_bt.csv', 2),
    'NLUHD': ('NLU-Data-Home-Domain-preprocessed-without-ner.csv', 68),  # class count unverified ("???")
}

if dataset not in _dataset_settings:
  # Fixed: the message used hparams['dataset'], but `hparams` does not exist
  # yet on this path, so a NameError hid the intended ValueError.
  raise ValueError('Wrong dataset name : {} \
   \n Possible datasets: IMDB, MR, NLUHD'.format(dataset))

_dataset_file, _num_classes = _dataset_settings[dataset]

# Path relative to the working directory, where Google Drive is mounted.
dataset_path = str(Path()/
                      'gdrive'/
                      'My Drive'/
                      'praca_magisterska'/
                      'pytorch_lightning'/
                      'datasets'/
                      _dataset_file)

# Base hyper-parameter dict; the following cells extend it in place.
hparams = {"dataset": dataset,
           "num_classes": _num_classes,
           "dataset_path": dataset_path,
           }


##### Choose training method

In [28]:
# Training method to run; must be a key of `_method_hparams`.
training_method = 'VAT'

# Method-specific hyper-parameters merged into `hparams`.
_method_hparams = {
    # Read by LitComposableFramework.vat(): VATLoss(xi, eps, ip) and the
    # weight `alpha` of the VAT term.
    'VAT': {'xi': 1e-05,
            'eps': 4.5,
            'ip': 1,
            'alpha': 1},
    # `threshold` / `lambda_u` are read by fixmatch(), `n_weak` /
    # `show_augmentation` by prepare_data(). NOTE(review): `mu` is not read
    # by the class code in this section.
    'FixMatch': {'mu': 4,
                 'threshold': 0.67,
                 'lambda_u': 1,
                 'n_weak': 0,
                 'show_augmentation': False},
    'Supervised': {},
}

if training_method not in _method_hparams:
  # Fixed: the message referenced `self.hparams['model_arch']`; `self` does
  # not exist in a notebook cell and the key was the wrong one.
  raise ValueError('Wrong training method type: {} \n Possible methods : VAT, FixMatch, Supervised'.format(training_method))

hparams.update({"training_method": training_method,
                **_method_hparams[training_method]})

#### Choose model architecture

In [29]:
# Model architecture to run; must be a key of `_arch_hparams`.
model_arch = 'Convolution'

# Architecture-specific hyper-parameters merged into `hparams`.
_arch_hparams = {
    'Transformer': {
        'model_type': 'bert-base-uncased',
        'max_sentence_len': 156,

        # NOTE: `xi` is also set by the VAT training-method cell; this value
        # replaces it when the Transformer architecture is selected.
        'xi': 0.00001,
        'lr': 5e-05,
        'weight_decay': 0.01,
        'adam_eps': 1e-06,
        'warmup_steps': 150,
    },
    'Convolution': {
        'max_sentence_len': 400,
        'embeder_type': "fastText",
        'embed_dim': 300,
        'Ci': 1,
        'kernel_num': 100,
        'kernel_sizes': '3,4,5',
        'dropout': 0.5,

        'optimizer_type': 'Adam',
        'sheduler_type': 'None',  # key spelling is what configure_optimizers() reads
        'lr': 1e-05,
        'decay_step_size': 10000,
        'decay_gamma': 0.5,
    },
}

if model_arch not in _arch_hparams:
  # Fixed: the message referenced `self.hparams`, which does not exist in a
  # notebook cell, and listed the options as "datasets".
  raise ValueError('Wrong model architecture type: {} \n Possible architectures: Transformer, Convolution'.format(model_arch))

hparams.update({'model_arch': model_arch,
                **_arch_hparams[model_arch]})


#### Choose training params

In [30]:
 hparams.update({
           # Data split and batch sizes (read by prepare_data() and the dataloaders).
           'train_test_split': 0.9,
           'seed': 42,
           'l_batch_size': 16,
           'unl_batch_size': 32,
           'val_batch_size': 16,
           'test_batch_size': 16,
           'n_labeled': 160, # total number of labeled samples (prepare_data uses n_labeled // num_classes per class); must be a multiple of l_batch_size
           'valid_size_per_class': 1000, # validation samples taken per class, i.e. num_classes * valid_size_per_class in total

           # Name of the loss class looked up on torch.nn in __init__.
           'loss_function':'CrossEntropyLoss',

           # Trainer settings used in the "Run experiment" cell.
           'test_run': False,
           'max_epochs': 5,
           'min_epochs': 1,
           'val_check_interval': 0.2, 
           'patience': 15, # early stopping callback parameter
     
 }) 

 
# Experiment tags: dataset, architecture, training method and labeled-set size.
tags = [
    hparams[key]
    for key in ('dataset', 'model_arch', 'training_method', 'n_labeled')
]


### Run experiment

In [None]:
# Seed first so that model initialisation and data splitting are repeatable.
set_seed(hparams['seed'])
experiment_name = training_method + "-" + model_arch + "-" + dataset
neptune_logger = NeptuneLogger(
                               project_name="m1f1/lightning-exps-text",
                               close_after_fit=False,
                               experiment_name=experiment_name,  # Optional,
                               params=hparams, # Optional,
                               tags=tags # Optional,
                              )
# callbacks
# NOTE: created but currently not handed to the Trainer (see the
# commented-out `early_stop_callback` argument below).
early_stop_callback = EarlyStopping(
                        monitor="val_loss",
                        min_delta=0.0,
                        patience=hparams['patience'],
                        verbose=True,
                        mode='min'
                      )
# Path("./checkpoints").mkdir(parents=True, exist_ok=True)
# model_checkpoint = pl.callbacks.ModelCheckpoint(filepath='./checkpoints') # check if it overwrite last checkpoint

# training and evaluating model
trainer = pl.Trainer(
                gpus=1,
                logger=neptune_logger,
                # checkpoint_callback=model_checkpoint,
                # early_stop_callback=early_stop_callback,
                val_check_interval=hparams['val_check_interval'],
                # distributed_backend=hparams['distributed_backend'],
                # default_root_dir="./test_run_logs",
                fast_dev_run=hparams['test_run'],
              #  train_percent_check=0.001,
              #  val_percent_check=0.001,
                min_epochs=hparams['min_epochs'],
                max_epochs=hparams['max_epochs'],
          )

model = LitComposableFramework(hparams)
# Plain loop instead of a list comprehension used only for its side effect.
for _hparam_name, _hparam_value in hparams.items():
  print(f'{_hparam_name}: {_hparam_value}')

try:
  trainer.fit(model)
  trainer.test(model)

  # list_of_files = glob.glob('./checkpoints/*') # * means all if need specific format then *.csv
  # latest_file = max(list_of_files, key=os.path.getctime)
  # print(latest_file)
  # model = LitBERT.load_from_checkpoint(latest_file))

  # neptune_logger.experiment.log_artifact('./checkpoints')
  neptune_logger.experiment.log_artifact(os.environ['REQUIREMENTS_PATH'])
finally:
  # close_after_fit=False leaves the experiment open, so stop it explicitly,
  # also when training or testing raised.
  neptune_logger.experiment.stop()

https://ui.neptune.ai/m1f1/lightning-exps-text/e/LIG-407


NeptuneLogger will work in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


dataset: IMDB
num_classes: 2
dataset_path: gdrive/My Drive/praca_magisterska/pytorch_lightning/datasets/imdb_with_bt.csv
training_method: VAT
xi: 1e-05
eps: 4.5
ip: 1
alpha: 1
model_arch: Convolution
max_sentence_len: 400
embeder_type: fastText
embed_dim: 300
Ci: 1
kernel_num: 100
kernel_sizes: 3,4,5
dropout: 0.5
optimizer_type: Adam
sheduler_type: None
lr: 1e-05
decay_step_size: 10000
decay_gamma: 0.5
train_test_split: 0.9
seed: 42
l_batch_size: 16
unl_batch_size: 32
val_batch_size: 16
test_batch_size: 16
n_labeled: 160
valid_size_per_class: 1000
loss_function: CrossEntropyLoss
test_run: False
max_epochs: 5
min_epochs: 1
val_check_interval: 0.2
patience: 15
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0                50000 non-null  object
 1   Unnamed: 0   50000 non-null  object
 2   text         50000 non-null  object
 3   label        50


  | Name     | Type             | Params
----------------------------------------------
0 | loss_fct | CrossEntropyLoss | 0     
1 | convs1   | ModuleList       | 360 K 
2 | dropout  | Dropout          | 0     
3 | fc1      | Linear           | 602   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

l_max_probs:  tensor([0.5799, 0.5593, 0.5870, 0.5965, 0.5116, 0.5077, 0.5646, 0.5345, 0.5434,
        0.5755, 0.5262, 0.5643, 0.5910, 0.5719, 0.5117, 0.5044],
       device='cuda:0')
l_max_max_probs:  tensor(0.5965, device='cuda:0')
labels_hat:  tensor([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1], device='cuda:0')
l_max_probs:  tensor([0.5398, 0.5551, 0.5577, 0.5856, 0.5248, 0.5238, 0.5306, 0.5193, 0.5577,
        0.5291, 0.5726, 0.5181, 0.5062, 0.5400, 0.5947, 0.5018],
       device='cuda:0')
l_max_max_probs:  tensor(0.5947, device='cuda:0')
labels_hat:  tensor([1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], device='cuda:0')
l_max_probs:  tensor([0.5197, 0.5563, 0.5045, 0.6113, 0.5701, 0.6222, 0.5291, 0.6146, 0.5075,
        0.5106, 0.5072, 0.5124, 0.5326, 0.5210, 0.5183, 0.5891],
       device='cuda:0')
l_max_max_probs:  tensor(0.6222, device='cuda:0')
labels_hat:  tensor([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1], device='cuda:0')
l_max_probs:  tensor([0.5471, 0.5660, 0.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

l_max_probs:  tensor([0.5444, 0.5284, 0.5761, 0.5510, 0.5071, 0.5240, 0.5418, 0.5800, 0.5945,
        0.5476, 0.5445, 0.5199, 0.5578, 0.5374, 0.5739, 0.5819],
       device='cuda:0')
l_max_max_probs:  tensor(0.5945, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5880, 0.6009, 0.5263, 0.5178, 0.5075, 0.6494, 0.5191, 0.5087, 0.5474,
        0.5305, 0.5319, 0.5637, 0.5218, 0.5161, 0.6156, 0.5713],
       device='cuda:0')
l_max_max_probs:  tensor(0.6494, device='cuda:0')
labels_hat:  tensor([1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5126, 0.6170, 0.5540, 0.5062, 0.5452, 0.5626, 0.5096, 0.5534, 0.5848,
        0.5160, 0.5376, 0.5846, 0.5284, 0.5837, 0.6461, 0.5418],
       device='cuda:0')
l_max_max_probs:  tensor(0.6461, device='cuda:0')
labels_hat:  tensor([0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.6429, 0.5163, 0.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

l_max_probs:  tensor([0.5541, 0.5061, 0.6527, 0.5618, 0.5110, 0.5223, 0.5066, 0.6078, 0.5446,
        0.5930, 0.5042, 0.5087, 0.5111, 0.5025, 0.5604, 0.5020],
       device='cuda:0')
l_max_max_probs:  tensor(0.6527, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1], device='cuda:0')
l_max_probs:  tensor([0.5733, 0.5110, 0.5791, 0.5415, 0.5512, 0.5960, 0.5417, 0.5459, 0.5914,
        0.5300, 0.5023, 0.5404, 0.5184, 0.5093, 0.5049, 0.5309],
       device='cuda:0')
l_max_max_probs:  tensor(0.5960, device='cuda:0')
labels_hat:  tensor([1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5262, 0.5589, 0.5016, 0.5774, 0.5324, 0.5621, 0.5280, 0.5016, 0.6004,
        0.6203, 0.5884, 0.5229, 0.5805, 0.5222, 0.5348, 0.5111],
       device='cuda:0')
l_max_max_probs:  tensor(0.6203, device='cuda:0')
labels_hat:  tensor([0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0], device='cuda:0')
l_max_probs:  tensor([0.5567, 0.5974, 0.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

l_max_probs:  tensor([0.6018, 0.5480, 0.6039, 0.6440, 0.5283, 0.5271, 0.6332, 0.5578, 0.5637,
        0.5977, 0.5700, 0.6054, 0.5033, 0.5599, 0.5370, 0.5964],
       device='cuda:0')
l_max_max_probs:  tensor(0.6440, device='cuda:0')
labels_hat:  tensor([1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5133, 0.5464, 0.5973, 0.5468, 0.5065, 0.5678, 0.5374, 0.6206, 0.5225,
        0.5098, 0.5844, 0.5926, 0.5226, 0.5283, 0.6389, 0.5042],
       device='cuda:0')
l_max_max_probs:  tensor(0.6389, device='cuda:0')
labels_hat:  tensor([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1], device='cuda:0')
l_max_probs:  tensor([0.6602, 0.6147, 0.5688, 0.5415, 0.6449, 0.6003, 0.6362, 0.5772, 0.5687,
        0.5588, 0.6457, 0.5357, 0.6318, 0.6092, 0.5373, 0.5699],
       device='cuda:0')
l_max_max_probs:  tensor(0.6602, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0], device='cuda:0')
l_max_probs:  tensor([0.6169, 0.5407, 0.

Exception in thread Thread-9:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 121, in run
    self._process_batch()
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 129, in _process_batch
    self._send_values(self._values_batch)
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 154, in _send_values
    self._experiment._send_channels_values(channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/experiments.py", line 1138, in _send_channels_values
    self._backend.send_channels_values(self, channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/utils.py", line 210, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages

l_max_probs:  tensor([0.6055, 0.5550, 0.6218, 0.5136, 0.5769, 0.5380, 0.5483, 0.6406, 0.5315,
        0.5821, 0.5221, 0.6152, 0.5492, 0.5829, 0.5514, 0.5037],
       device='cuda:0')
l_max_max_probs:  tensor(0.6406, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5472, 0.6666, 0.5757, 0.5249, 0.6219, 0.5839, 0.6340, 0.5153, 0.5577,
        0.5656, 0.5414, 0.6019, 0.5450, 0.6099, 0.5845, 0.5179],
       device='cuda:0')
l_max_max_probs:  tensor(0.6666, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5456, 0.5056, 0.5010, 0.5929, 0.5630, 0.5926, 0.6633, 0.5121, 0.5649,
        0.5329, 0.5110, 0.5847, 0.5758, 0.5104, 0.5952, 0.6033],
       device='cuda:0')
l_max_max_probs:  tensor(0.6633, device='cuda:0')
labels_hat:  tensor([1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5669, 0.5881, 0.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

l_max_probs:  tensor([0.5687, 0.6249, 0.5221, 0.6430, 0.5707, 0.5262, 0.5576, 0.5719, 0.6022,
        0.6719, 0.6348, 0.6289, 0.6600, 0.5505, 0.5639, 0.6406],
       device='cuda:0')
l_max_max_probs:  tensor(0.6719, device='cuda:0')
labels_hat:  tensor([1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5702, 0.6318, 0.5823, 0.6211, 0.5584, 0.6190, 0.5462, 0.6037, 0.5863,
        0.5972, 0.5127, 0.5488, 0.6473, 0.6120, 0.5926, 0.6751],
       device='cuda:0')
l_max_max_probs:  tensor(0.6751, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.6368, 0.5717, 0.5249, 0.6285, 0.6852, 0.5879, 0.5522, 0.5451, 0.5956,
        0.6822, 0.5508, 0.6257, 0.5923, 0.5756, 0.5495, 0.5583],
       device='cuda:0')
l_max_max_probs:  tensor(0.6852, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5948, 0.6334, 0.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

l_max_probs:  tensor([0.5530, 0.6389, 0.7400, 0.6445, 0.6445, 0.6626, 0.7502, 0.6765, 0.5637,
        0.6024, 0.5610, 0.6642, 0.6030, 0.6110, 0.5350, 0.6696],
       device='cuda:0')
l_max_max_probs:  tensor(0.7502, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0], device='cuda:0')
l_max_probs:  tensor([0.6316, 0.6350, 0.6460, 0.5396, 0.5507, 0.6736, 0.5875, 0.5865, 0.5836,
        0.6140, 0.6031, 0.6016, 0.5569, 0.5881, 0.5870, 0.6685],
       device='cuda:0')
l_max_max_probs:  tensor(0.6736, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.6662, 0.5227, 0.5976, 0.6704, 0.5343, 0.5936, 0.6326, 0.6639, 0.6732,
        0.6089, 0.5238, 0.6820, 0.5753, 0.5650, 0.5232, 0.5635],
       device='cuda:0')
l_max_max_probs:  tensor(0.6820, device='cuda:0')
labels_hat:  tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], device='cuda:0')
l_max_probs:  tensor([0.5772, 0.5405, 0.

In [None]:
# df = pd.read_csv(str(Path(os.environ['DATASETS_PATH'])/'imdb_with_bt.csv'))
# df
# df = df.rename(columns={'intent': 'label', 'bt': 'paraphrases'})
# df.to_csv(str(Path(os.environ['DATASETS_PATH'])/'imdb_with_bt.csv'))

### Hyperparameter search


#### System spec


In [None]:
class LitYKConv_HPO(pl.LightningModule):
  """Convolutional text classifier whose hyperparameters are drawn from a trial object.

  One Conv2d per kernel size runs over the input embeddings; each result is max-pooled
  over its remaining sequence axis, the pooled features are concatenated, passed through
  dropout and a final linear layer. When ``hparams['with_VAT']`` is truthy, a VAT term
  computed on the unlabeled half of every training batch is added to the supervised loss.

  NOTE: every metric reported by this module is computed as ``1 - score``. The keys named
  'val_acc', 'val_f1', 'test_acc', ... are aggregates of those error values, so despite
  their names they are presumably lower-is-better.
  """

  @staticmethod
  def _parse_interval(spec, cast):
    """Split a comma-separated range string such as '0.2,0.7' and cast every part."""
    return [cast(part) for part in spec.split(',')]

  def __init__(self, hparams, trial):
    """Sample the tunable hyperparameters from ``trial`` and build the layers.

    Args:
      hparams: dict of settings. The entries 'lr', 'kernel_num' and 'dropout' (plus 'xi',
        'eps', 'ip', 'alpha' when 'with_VAT' is truthy) are comma-separated range strings
        that are unpacked as the bounds of the matching ``trial.suggest_*`` call.
      trial: object providing ``suggest_uniform``, ``suggest_loguniform`` and ``suggest_int``.
    """
    super().__init__()
    self.hparams = hparams

    if self.hparams['with_VAT']:
      xi_interval = self._parse_interval(hparams['xi'], float)
      eps_interval = self._parse_interval(hparams['eps'], float)
      ip_interval = self._parse_interval(hparams['ip'], int)
      alpha_interval = self._parse_interval(hparams['alpha'], float)

      self.xi = trial.suggest_uniform('xi', *xi_interval)
      print('xi: ', self.xi)
      self.eps = trial.suggest_uniform('eps', *eps_interval)
      print('eps: ', self.eps)
      self.ip = trial.suggest_int('ip', *ip_interval)
      print('ip: ', self.ip)
      self.alpha = trial.suggest_uniform('alpha', *alpha_interval)
      print('alpha: ', self.alpha)

    lr_interval = self._parse_interval(hparams['lr'], float)
    print(lr_interval)
    kernel_num_interval = self._parse_interval(hparams['kernel_num'], int)
    print(kernel_num_interval)
    dropout_interval = self._parse_interval(hparams['dropout'], float)
    print(dropout_interval)

    self.lr = trial.suggest_loguniform('learning_rate', *lr_interval)
    print('lr: ', self.lr)
    self.Co = trial.suggest_int('kernel_num', *kernel_num_interval)  # number of filters per kernel size
    print('kernel_num: ', self.Co)
    dropout = trial.suggest_uniform('dropout', *dropout_interval)
    print('dropout: ', dropout)
    self.dropout = nn.Dropout(dropout)

    self.embeder_dict = {
                         'fastText': (create_ft_embeder, gensim_tokenizer),
                         'spaCy':(create_spacy_nlp_embeder, lambda x: x)
                        }
    # Only the tokenizer of the selected pair is used; the embedder factory is not called.
    _embeder_factory, self.tokenizer_fun = self.embeder_dict[hparams['embeder_type']]
    # NOTE(review): relies on a notebook-level `nlp` object that must exist before this runs.
    self.embeder = nlp
    self.D = hparams['embed_dim']
    self.Ci = hparams['Ci']

    self.loss_fct = getattr(nn, hparams['loss_function'])()
    self.num_classes = hparams['num_classes']

    self.Ks = self._parse_interval(hparams['kernel_sizes'], int)  # e.g. [3, 4, 5]
    self.convs1 = nn.ModuleList([nn.Conv2d(self.Ci, self.Co, (K, self.D)) for K in self.Ks])
    self.fc1 = nn.Linear(len(self.Ks) * self.Co, self.num_classes)

    self.total_iterations = 0

  def forward(self, x):
    """Return class logits for a batch of embedded texts.

    Assumes ``x`` is (N, W, D) with D == hparams['embed_dim'] -- TODO confirm against
    the preprocessors, which are defined outside this class.
    """
    # Insert the channel axis at position 1. The original used self.Ci (the channel
    # *count*) as the axis index, which only coincides with 1 when Ci == 1.
    x = x.unsqueeze(1)
    x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # one (N, Co, W') tensor per kernel size
    x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]    # pool over the whole remaining axis -> (N, Co)
    x = torch.cat(x, 1)                                       # (N, len(Ks) * Co)
    x = self.dropout(x)
    logit = self.fc1(x)                                       # (N, num_classes)
    return logit

  def prepare_data(self):
    """Load the dataset named in hparams and build labeled/unlabeled/val/test datasets.

    Raises:
      NotImplementedError: for dataset 'MR', which has no loading code here.
      ValueError: for any other unknown dataset name.
    """
    if self.hparams['dataset'] == 'NLUHD':

      not_none = lambda x: x["text"] is not None
      ds = lf.CsvDataset(self.hparams['dataset_path'], header=True).filter(not_none)
      unique_labels = list(pd.DataFrame(ds).intent.unique())
      le = preprocessing.LabelEncoder().fit(unique_labels)
      print(f"Unique labels: {unique_labels}")
      print(f"Number of unique labels: {len(unique_labels)}")
      train, test = lf.cross_validation.split_dataset_random(ds,
                                                            int(len(ds) * self.hparams['train_test_split']),
                                                            seed=self.hparams['seed'])
      # NOTE(review): `self.tokenizer_dict` is never assigned in __init__, and labels are
      # read from an `intent` column here while the split below indexes a 'label' column.
      # This branch looks stale -- confirm before using it.
      preprocessor = partial(
                            preprocess_NLUHD,
                            self.hparams['model_type'],
                            self.hparams['max_sentence_len'],
                            self.tokenizer_dict[self.hparams['model_type']],
                            le,
                            )

    elif self.hparams['dataset'] == 'MR':
      # This branch previously referenced `le`, `train`, `test` and `unique_labels`
      # without ever assigning them, so it could only fail with UnboundLocalError.
      raise NotImplementedError(
          "Dataset 'MR' is not wired up in LitYKConv_HPO.prepare_data: "
          "no data loading, label encoder or train/test split is defined for it."
      )

    elif self.hparams['dataset'] == 'IMDB':

      ds = lfds.Imdb('train') + lfds.Imdb('test')
      ds = ds.map(lambda x: {'text': x[0], 'label': x[1]})
      df = pd.DataFrame(ds)
      # self.embeder.build_vocab(new_sentences, update=True)
      # self.embeder.train(new_sentences, total_examples=len(new_sentences), epochs=)
      df.info(memory_usage=True)  # prints by itself; wrapping it in print() only added "None"
      unique_labels = list(df.label.unique())
      print(f'unique_labels: {unique_labels}')
      print(f'number_of_categories : {len(unique_labels)}')
      le = preprocessing.LabelEncoder().fit(unique_labels)
      train, test = lf.cross_validation.split_dataset_random(ds,
                                                             int(len(ds) * self.hparams['train_test_split']),
                                                             seed=self.hparams['seed'])
      preprocessor = partial(
                             preprocess_IMDB,
                             self.hparams,
                             self.tokenizer_fun,
                             self.embeder,
                             le,
                            )
    else:
      raise ValueError('Wrong dataset name : {}'.format(self.hparams['dataset']))

    train_df, test_df = pd.DataFrame(train), pd.DataFrame(test)
    x_train, y_train = train_df['text'].values, train_df['label'].values
    x_test, y_test = test_df['text'].values, test_df['label'].values

    # split's parameters
    num_classes = len(unique_labels)
    label_per_class = self.hparams['n_labeled'] // num_classes
    valid_size = self.hparams['valid_size_per_class']

    labeled_idx = []
    unlabeled_idx = []
    val_idx = []

    # Per class: shuffle its indices, then take the first `label_per_class` as labeled,
    # the next `valid_size` as validation, and everything left as unlabeled.
    for label in unique_labels:
        idx = np.where(y_train == label)[0]
        np.random.shuffle(idx)
        labeled_idx.extend(idx[:label_per_class])
        val_idx.extend(idx[label_per_class: label_per_class + valid_size])
        unlabeled_idx.extend(idx[label_per_class + valid_size:])

    x_labeled, y_labeled = x_train[labeled_idx], y_train[labeled_idx]
    x_unlabeled, y_unlabeled = x_train[unlabeled_idx], y_train[unlabeled_idx]
    x_val, y_val = x_train[val_idx], y_train[val_idx]

    train_labeled_dataset = SimpleTextDataset(x_labeled,
                                              y_labeled,
                                              transform=preprocessor)

    train_unlabeled_dataset = SimpleTextDataset(x_unlabeled,
                                                y_unlabeled,
                                                transform=preprocessor)

    self._train_dataset = TwoInOneDataset([train_labeled_dataset,
                                            train_unlabeled_dataset])

    self._val_dataset = SimpleTextDataset(x_val,
                                          y_val,
                                          transform=preprocessor)

    self._test_dataset = SimpleTextDataset(x_test,
                                           y_test,
                                           transform=preprocessor)

    self.total_iterations = len(train_unlabeled_dataset) // self.hparams['batch_size']

  def train_dataloader(self):
    """DataLoader over the combined labeled/unlabeled training dataset."""
    return DataLoader(
                      self._train_dataset,
                      batch_size=self.hparams['batch_size'],
                      num_workers=8,
                      # Original author's note: without shuffle it won't work, because the
                      # dataset needs its index map created before __getitem__ is called.
                      shuffle=True
                     )

  def val_dataloader(self):
    """DataLoader over the validation dataset."""
    return DataLoader(
                      self._val_dataset,
                      batch_size=self.hparams['batch_size'],
                      num_workers=8
                     )

  def test_dataloader(self):
    """DataLoader over the test dataset."""
    return DataLoader(
                      self._test_dataset,
                      batch_size=self.hparams['batch_size'],
                      num_workers=8
                     )

  def configure_optimizers(self):
    """Adam at the sampled learning rate, with ReduceLROnPlateau keyed on 'val_loss'."""
    optimizers = [
                  torch.optim.Adam(self.parameters(), lr=self.lr),
                 ]
    schedulers = [
                  {
                    'scheduler': ReduceLROnPlateau(optimizers[0],'min', verbose=True),
                    'monitor': 'val_loss', # Default: val_loss
                    'interval': 'epoch',
                    'frequency': 1
                  },
                 ]

    return optimizers, schedulers

  @staticmethod
  def _error_metrics(labels, labels_hat, prefix=''):
    """Return the four ``1 - score`` tensors for one batch, keyed '<prefix><metric>_error'."""
    labels = labels.detach().cpu()
    labels_hat = labels_hat.detach().cpu()
    return {
        prefix + 'accuracy_error': torch.tensor(1 - accuracy_score(labels, labels_hat)),
        prefix + 'f1_error': torch.tensor(1 - f1_score(labels, labels_hat, average='micro')),
        prefix + 'recall_error': torch.tensor(1 - recall_score(labels, labels_hat, average='micro')),
        prefix + 'precision_error': torch.tensor(1 - precision_score(labels, labels_hat, average='micro')),
    }

  def _eval_step(self, batch, loss_key):
    """Shared body of validation_step/test_step: loss under ``loss_key`` plus error metrics."""
    texts = batch['embedding']
    labels = batch['label']

    logits = self.forward(texts)
    loss = self.loss_fct(logits, labels)
    labels_hat = torch.argmax(logits, dim=1)

    return {loss_key: loss, **self._error_metrics(labels, labels_hat)}

  @staticmethod
  def _epoch_summary(outputs, stage):
    """Aggregate step outputs with compute_global_metric into the '<stage>_*' result dict."""
    acc = compute_global_metric(outputs, 'accuracy_error')
    f1 = compute_global_metric(outputs, 'f1_error')
    recall = compute_global_metric(outputs, 'recall_error')
    precision = compute_global_metric(outputs, 'precision_error')
    loss = compute_global_metric(outputs, f'{stage}_loss')

    tqdm_dict = {
                 f'{stage}_loss': loss,
                 f'{stage}_acc': acc,
                 f'{stage}_f1': f1,
                 f'{stage}_recall': recall,
                 f'{stage}_precision': precision,
                }
    return {
            'progress_bar': tqdm_dict,
            'log': tqdm_dict,
            f'{stage}_loss': loss,
            f'{stage}_acc': acc,
            f'{stage}_f1': f1
           }

  def training_step(self, batch, batch_idx):
    """Supervised loss on the labeled part of ``batch`` plus the optional VAT term.

    ``batch[0]`` supplies 'embedding' and 'label'; ``batch[1]`` supplies the unlabeled
    'embedding' that is only used for the VAT loss.
    """
    l_batch = batch[0]
    l_texts = l_batch['embedding']
    labels = l_batch['label']
    unl_texts = batch[1]['embedding']

    use_vat = self.hparams['with_VAT']

    # The VAT term is evaluated before the supervised forward pass, as in the original.
    if use_vat:
      vat_loss = VATLoss(xi=self.xi,
                         eps=self.eps,
                         ip=self.ip)
      lds = vat_loss(self, unl_texts)

    logits = self.forward(l_texts)
    loss = self.loss_fct(logits, labels)

    if use_vat:
      loss += self.alpha * lds

    labels_hat = logits.max(dim=1)[1]

    logs = {'train_loss': loss,
            **self._error_metrics(labels, labels_hat, prefix='train_')}

    if use_vat:
      logs.update({'lds_loss': lds.item()})

    return {'loss': loss,
            'log': logs}

  def validation_step(self, batch, batch_idx):
    """Loss ('val_loss') and error metrics for one validation batch."""
    return self._eval_step(batch, 'val_loss')

  def validation_epoch_end(self, outputs):
    """Aggregate validation outputs; 'val_acc'/'val_f1' come from the ``*_error`` values."""
    return self._epoch_summary(outputs, 'val')

  def test_step(self, batch, batch_idx):
    """Loss ('test_loss') and error metrics for one test batch."""
    return self._eval_step(batch, 'test_loss')

  def test_epoch_end(self, outputs):
    """Aggregate test outputs; 'test_acc'/'test_f1' come from the ``*_error`` values."""
    return self._epoch_summary(outputs, 'test')

#### Define objective func


In [None]:
def objective(trial):
  """Optuna objective: train LitYKConv_HPO for one trial and return its last 'val_acc'.

  'val_acc' as reported by LitYKConv_HPO is derived from ``1 - accuracy`` values, so the
  returned number is lower-is-better and the study must be created with
  ``direction="minimize"``.

  Args:
    trial: the Optuna trial used to sample hyperparameters and to report pruning.

  Returns:
    The 'val_acc' entry of the last metrics dict recorded by MetricsCallback.
  """
  # nluhd_dataset_path = str(Path()/
  #                         'gdrive'/
  #                         'My Drive'/
  #                         'praca_magisterska'/
  #                         'pytorch_lightning'/
  #                         'datasets'/
  #                         'NLU-Data-Home-Domain-preprocessed-without-ner.csv')
  hparams = {
            # model architecture
            'model_type': 'YoonKimConvNN',
            'dropout':'0.2,0.7',      # search interval "low,high"
            'kernel_sizes': '3,4,5',  # (3,4,5)
            'kernel_num': '60,120',   # search interval "low,high"
            'Ci': 1,
            'loss_function':'CrossEntropyLoss',
            # pl trainer params
            'seed': 42,
            'monitor_value': 'val_acc',
            'percent_valid_examples': 0.5, 
            'test_run': False,
            'with_VAT': True,
            'max_epochs': 2,
            'min_epochs': 1,
            'val_check_interval': 0.5, 
            'patience': 3, # early stopping callback parameter
            'distributed_backend': 'dp',
            # embeddings params
            'embeder_type': "fastText",
            'embed_dim': 300,
            'max_sentence_len': 400,
            # dataset params
            'train_test_split': 0.8,
            'batch_size': 32,
            'n_labeled': 1000, # number of labeled samples 
            'valid_size_per_class': 1000, # samples per class held out for validation
            # optimizer params
            'lr': '0.00001, 10',      # search interval "low,high" (log-uniform)
            # VAT params (search intervals "low,high")
            'xi':'6,12',
            'eps':'1,3',
            'ip':'1,3',
            'alpha':'1,3',
            }
  
  set_seed(hparams['seed'])
  
  hparams.update({'dataset':'IMDB',
                  'num_classes': 2})
  
  # training and evaluating model
  metrics_callback = MetricsCallback()
  
  checkpoint_callback = pl.callbacks.ModelCheckpoint(
          os.path.join(os.environ['RESULT_PATH'],
                       "trial_{}".format(trial.number),
                       "{epoch}"),
          monitor=hparams['monitor_value'],
          # The monitored value is an error (1 - accuracy). The default mode='auto'
          # maximises monitors whose name contains 'acc', which would keep the
          # worst epoch, so force minimisation.
          mode='min',
      )
  
  trainer = pl.Trainer(
                  gpus=1 if torch.cuda.is_available() else None,
                  logger=False,
                  # val_percent_check=hparams['percent_valid_examples'],
                  checkpoint_callback=checkpoint_callback,
                  max_epochs=hparams['max_epochs'],
                  fast_dev_run=hparams['test_run'],
                  callbacks=[metrics_callback],
                  early_stop_callback=PyTorchLightningPruningCallback(trial, monitor=hparams['monitor_value'])
            )
  
  model = LitYKConv_HPO(hparams, trial=trial)
  
  trainer.fit(model)
  return metrics_callback.metrics[-1]["val_acc"]


  

#### Run trials

In [None]:
# Toggle between median pruning and no pruning at all.
pruning = True

if pruning:
  pruner = optuna.pruners.MedianPruner()
else:
  pruner = optuna.pruners.NopPruner()

# `objective` returns an error-style value, hence the study minimizes it.
study = optuna.create_study(direction="minimize", pruner=pruner)
study.optimize(objective, n_trials=20, timeout=None)

print(f"Number of finished trials: {len(study.trials)}")

# Summarize the best trial: its objective value and the sampled parameters.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
# shutil.rmtree(os.environ['RESULT_PATH'])

In [None]:
# Plot the intermediate values reported by the trials of `study`.
optuna.visualization.plot_intermediate_values(study)

In [None]:
# Plot Optuna's hyperparameter-importance estimate for `study`.
optuna.visualization.plot_param_importances(study)