In [1]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd

import torch
import torchtext

import matplotlib.pyplot as plt
from tqdm import tqdm

import cfg

from text_classification import trainutils
from text_classification.layers import *
from text_classification.logger import logger
from text_classification.datautils import *
from text_classification.trainutils import get_metrics

from train import train

%load_ext autoreload
%autoreload 2

07-18 15:32 summarizer.preprocessing.cleaner INFO     'pattern' package not found; tag filters are not available for English


In [2]:
# --- Dataset configuration -------------------------------------------------
# Alternative dataset left by the author: '../data/airline_tweets_binary/'
# with label column 'airline_sentiment'.
basepath = '../data/mokoron/'
text_filed = 'text_spellchecked'  # (sic) name is referenced by later cells
text_original_field = 'text_original'
label_field = 'sentiment'

# Character vocabulary: the base alphabet from cfg extended with the Russian
# characters, with both parenthesis characters filtered out.
alphabet = list(filter(
    lambda c: c not in ('(', ')'),
    cfg.alphabet + cfg.russian_chars,
))

# YoonKim

In [3]:
# Build the three HierarchicalMokoron splits with identical settings; only the
# CSV file name differs. Construction order is train -> validation -> test.
train_data, valid_data, test_data = (
    HierarchicalMokoron(
        basepath + csv_name,
        text_filed,
        label_field,
        alphabet=alphabet,
        max_text_len=128,
    )
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [4]:
# Shrink the train and validation splits to a small subset so that the
# experiment cells below run quickly. The test split is left untouched.
#
# Changes compared with the plain `.sample(1024)` calls:
#   * a fixed `random_state` makes the subset reproducible across kernel
#     restarts and cell re-runs;
#   * the sample size is clamped to the number of available rows, so a split
#     with fewer than 1024 rows no longer raises.
# NOTE(review): assumes `.data` is a pandas DataFrame (it exposes `.sample`) —
# confirm against HierarchicalMokoron.
n_debug_samples = 1024
debug_sample_seed = 0

train_data.data = train_data.data.sample(
    min(n_debug_samples, len(train_data.data)), random_state=debug_sample_seed)
valid_data.data = valid_data.data.sample(
    min(n_debug_samples, len(valid_data.data)), random_state=debug_sample_seed)

In [5]:
# Wrap the three dataset splits into dataloaders; the batch size comes from cfg.
(train_dataloader, val_dataloader, test_dataloader) = trainutils.get_dataloaders(
    train_data,
    test_data,
    validset=valid_data,
    batch_size=cfg.train.batch_size,
)


In [6]:
# Best validation F1 seen so far; the experiment cells compare each run's
# metrics['f1'] against it and overwrite it when a run does better.
best_f1 = 0

In [7]:
# Manual hyperparameter trial for YoonKimModel: build the model, train it for
# 50 epochs without annealing, score it on the validation dataloader and update
# the best-so-far trackers (best_f1 / best_model / best_params).
lr = 1e-4
n_filters = 32
cnn_kernel_size = 5
dropout = 0  # the original line read `0#.5`, i.e. 0.5 is presumably the alternative value
hidden_dim_out = 64
# The only randomized value in this cell: an integer drawn from [32, 128).
embedding_dim = int(np.random.randint(32, 128))

# Keyword arguments for the YoonKimModel constructor.
params = {
    'n_filters': n_filters,
    'cnn_kernel_size': cnn_kernel_size,
    'hidden_dim_out': hidden_dim_out,
    'embedding_dim': embedding_dim,
    'dropout': dropout,
    'alphabet_len': len(alphabet)
}

model = YoonKimModel(**params)
# lr is not passed to the model; it is added afterwards so that it shows up in
# the logged parameter set.
params['lr'] = lr
logger.info('Parameters: %s' % params)

trained_model = \
    train(model,
          train_dataloader,
          val_dataloader,
          epochs=50,
          noise_level=0,
          lr=lr,
          log_every=2,
          comment='hyperparameters_search_manual',
          save_model_path=None,
          use_annealing=False)
metrics = get_metrics(trained_model, val_dataloader)

if metrics['f1'] > best_f1:
    # The metrics dict displayed in this notebook has the keys 'accuracy' and
    # 'f1', so the former metrics['acc'] lookup raised KeyError on the first
    # improving run. 'acc' is kept only as a fallback in case get_metrics
    # uses that key in another version.
    logger.info('YES!, f1: %s, acc: %s, parameters: %s' % (
        metrics['f1'], metrics.get('accuracy', metrics.get('acc')), str(params)
    ))
    best_f1 = metrics['f1']
    # NOTE(review): metrics were computed on `trained_model`; this stores `model`.
    # Presumably train() returns the same object — confirm.
    best_model = model
    best_params = params
    logger.info(best_params)

07-18 15:32 root         INFO     Parameters: {'n_filters': 32, 'cnn_kernel_size': 5, 'hidden_dim_out': 64, 'embedding_dim': 35, 'dropout': 0, 'alphabet_len': 104, 'lr': 0.0001}
07-18 15:32 root         INFO     Writer: runs/Jul18_15-32-53_lyalin_YoonKimModel_lr4_dropout0_noise_level0.0000hyperparameters_search_manual


RuntimeError: dimension out of range (expected to be in range of [-1, 0], but got 1)

In [8]:
# Score the most recently trained model on the test dataloader
# (this overwrites the validation metrics held in `metrics`).
metrics = get_metrics(trained_model, test_dataloader)


07-18 11:46 root         INFO     Set the model into eval mode
  'precision', 'predicted', average, warn_for)


In [9]:
# Display the current `metrics` value as the cell output.
metrics

{'accuracy': 0.8123543123543123, 'f1': 0.0}

In [None]:
# Manual hyperparameter trial for YoonKimModel with fixed values: build the
# model, train it for 10 epochs, score it on the validation dataloader and
# update the best-so-far trackers (best_f1 / best_model / best_params).
lr = 1e-3
n_filters = 32
cnn_kernel_size = 5
dropout = 0.5
hidden_dim_out = 64
embedding_dim = 64

# Keyword arguments for the YoonKimModel constructor.
params = {
    'n_filters': n_filters,
    'cnn_kernel_size': cnn_kernel_size,
    'hidden_dim_out': hidden_dim_out,
    'embedding_dim': embedding_dim,
    'dropout': dropout,
    'alphabet_len': len(alphabet)
}

model = YoonKimModel(**params)
# lr is not passed to the model; it is added afterwards so that it shows up in
# the logged parameter set.
params['lr'] = lr
logger.info('Parameters: %s' % params)

trained_model = \
    train(model,
          train_dataloader,
          val_dataloader,
          epochs=10,
          noise_level=0,
          lr=lr,
          log_every=2,
          comment='hyperparameters_search_manual',
          save_model_path=None)
metrics = get_metrics(trained_model, val_dataloader)

if metrics['f1'] > best_f1:
    # The metrics dict displayed in this notebook has the keys 'accuracy' and
    # 'f1', so the former metrics['acc'] lookup raised KeyError on the first
    # improving run. 'acc' is kept only as a fallback in case get_metrics
    # uses that key in another version.
    logger.info('YES!, f1: %s, acc: %s, parameters: %s' % (
        metrics['f1'], metrics.get('accuracy', metrics.get('acc')), str(params)
    ))
    best_f1 = metrics['f1']
    # NOTE(review): metrics were computed on `trained_model`; this stores `model`.
    # Presumably train() returns the same object — confirm.
    best_model = model
    best_params = params
    logger.info(best_params)

In [5]:
# Reset the best-F1 tracker to zero; the search loop compares each run's
# metrics['f1'] against this value.
best_f1 = 0

In [8]:
# Random hyperparameter search for YoonKimModel: 100 independent trials, each
# trained for 10 epochs and scored on the validation dataloader. The trial
# with the highest validation F1 is kept in best_f1 / best_model / best_params.

# Never start from a negative baseline.
best_f1 = max(best_f1, 0)

for trial in range(100):
    # The draws below are made in a fixed order so that the NumPy global random
    # stream is consumed the same way on every trial.
    lr = 10 ** np.random.uniform(-4, -3)  # log-uniform in [1e-4, 1e-3)
    n_filters = int(np.random.choice([32, 64, 128, 256]))
    cnn_kernel_size = int(np.random.choice([3, 5, 7]))
    # NOTE(review): this yields dropout in [0.1, 1.0); values close to 1 drop
    # almost every activation — confirm the upper bound is intended.
    dropout = 0.1 + 0.9 * np.random.rand()
    hidden_dim_out = int(np.random.choice([64, 128, 256]))
    embedding_dim = int(np.random.choice([32, 64, 128]))

    # Constructor arguments; key order matters because the dict is logged.
    params = dict(
        n_filters=n_filters,
        cnn_kernel_size=cnn_kernel_size,
        hidden_dim_out=hidden_dim_out,
        embedding_dim=embedding_dim,
        dropout=dropout,
        alphabet_len=len(alphabet),
    )

    model = YoonKimModel(**params)
    # lr is recorded after model construction: it is a training setting, not a
    # model argument.
    params['lr'] = lr
    logger.info('Parameters: %s' % params)

    trained_model = train(
        model,
        train_dataloader,
        val_dataloader,
        epochs=10,
        noise_level=0,
        lr=lr,
        log_every=2,
        comment='hyperparameters_search_random',
        save_model_path=None,
    )
    metrics = get_metrics(trained_model, val_dataloader)

    # Nothing to record unless this trial beats the best F1 so far.
    if not metrics['f1'] > best_f1:
        continue

    logger.info('YES!, f1: %s, parameters: %s' % (metrics['f1'], str(params)))
    best_f1 = metrics['f1']
    best_model = model
    best_params = params
    logger.info(best_params)

07-17 17:19 root         INFO     Parameters: {'n_filters': 128, 'cnn_kernel_size': 5, 'hidden_dim_out': 64, 'embedding_dim': 32, 'dropout': 0.6548361219888224, 'alphabet_len': 104, 'lr': 0.00023854348792259845}
07-17 17:19 root         INFO     Writer: runs/Jul17_17-19-16_lyalin_YoonKimModel_lr3_dropout0.6548361219888224_noise_level0.0000hyperparameters_search_random
07-17 17:25 root         INFO     Epoch 0. Global step 4757. T=6.04min
07-17 17:25 root         INFO     In-batch loss      : 0.6973
07-17 17:25 root         INFO     Training accuracy  : 0.5084, f1: 0.6741
07-17 17:25 root         INFO     Validation accuracy: 0.5070, f1: 0.6728
07-17 17:37 root         INFO     Epoch 2. Global step 14271. T=17.77min
07-17 17:37 root         INFO     In-batch loss      : 0.6951
07-17 17:37 root         INFO     Training accuracy  : 0.5100, f1: 0.6755
07-17 17:37 root         INFO     Validation accuracy: 0.5080, f1: 0.6737
Exception ignored in: <bound method _DataLoaderIter.__del__ of <t

KeyboardInterrupt: 

Traceback (most recent call last):
  File "/home/not_a_robot/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/not_a_robot/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/not_a_robot/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
    r = index_queue.get()
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 335, in get
    res = self._reader.recv_bytes()
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/h