In [44]:
import json
import os
import re
import os.path as osp
import pandas as pd
import glob
import tqdm
##Classe com vários métodos de pre-processamento de texto em português criado pelo grupo F03
import utils.preprocessing_portuguese as preprossPT
from sklearn.model_selection import train_test_split

SEED = 42

# Leitura de arquivo contendo labels

In [2]:
df_labels = pd.read_csv('./resultado_m03_meta_classes_extraction/relacao_documentos_label_v2.csv')
# Mapeia labels em inteiros
df_labels['label_int'] = pd.factorize(df_labels['label'])[0]
df_labels.head()

Unnamed: 0,doc_id,final_meta-class,label,label_int
0,499fa518c7724a6e449c509b7d7bc819,ATA,ata dispensa licitacao,0
1,56a09c5d1d04cc95ada4d68ad22dcbd9,ATA,ata dispensa licitacao,0
2,6e9f35208290c1130fcccad47b50aa53,ATA,ata dispensa licitacao,0
3,669c029c5812ec9a31b9c211044157b4,ATA,ata dispensa licitacao,0
4,a22421dc45d623c9200c7e8fb1e6ca34,ATA,ata dispensa licitacao,0


# Leitura e pré-processamento dos arquivos contendo o texto
- Remover endereços de email

In [3]:
def limpeza_texto(page_text, city_name):
    txt_process = preprossPT.TextPreProcessing()
    city_name = city_name.replace("_", " ")
    
    page_text = txt_process.remove_person_names(page_text)
    
    page_text = page_text.lower()
    
    page_text = txt_process.remove_emails(page_text)

    page_text = txt_process.remove_urls(page_text)
    
    page_text = txt_process.remove_pronouns(page_text)

    page_text = txt_process.remove_adverbs(page_text)

    page_text = txt_process.remove_special_characters(page_text)

    page_text = txt_process.remove_accents(page_text)

    page_text = txt_process.remove_stopwords(page_text)

    page_text = txt_process.remove_hour(page_text)
    # split numbers from letters
    page_text = ' '.join(re.split('(\d+)',page_text))

    page_text = txt_process.remove_symbols_from_numbers(page_text)

    page_text = txt_process.remove_numbers(page_text)

    page_text = txt_process.remove_reduced_or_contracted_words(page_text)
    
    #Removendo letras sozinhas no texto
    #page_text = re.sub(r'(?:^| )\w(?:$| )', ' ', page_text).strip()
    page_text = re.sub(r"\b[a-zA-Z]\b", "", page_text)

    page_text = page_text.replace("_","")
    
    # remove nome do municipio e estado
    page_text = page_text.replace(city_name,"")
    
    page_text = page_text.replace('minas gerais',"")    
    page_text = page_text.replace('prefeitura municipal',"")
    page_text = page_text.replace('prefeitura',"")
    
    page_text = txt_process.remove_excessive_spaces(page_text)
    
    return page_text

In [4]:
def get_name(directory):
    return re.search("licitacoes-(.*)/", directory)[1].replace("-", "_")

def list_json_files_dir(city_dir, city_name):
    if city_name != 'itamarati':
        return glob.glob(os.path.join(city_dir, 'data', 'files_json', '*'))
    else:
        return glob.glob(os.path.join(city_dir, '*', 'data', 'files_json', '*'))

def read_files(file_dir):
    with open(file_dir) as f:
        lines = f.read() # lê o conteúdo (pode ser lido em um stream, se achar necessário)
        return json.loads(lines)
    
def preprocess_text(document, num_pages, city_name):
    return [limpeza_texto(page_content, city_name) for page_content in document['text_content'][:4]]

def merge_pages(document, num_pages):
    # retorna lista onde a cada posicao uma nova pagina e concatenada ao texto
    num_pages+=1
    return [" ".join(document['text_preprocessed'][0:num_pages]) for num_pages in range(1,num_pages)]

In [5]:
base_path = "../data/*licitacoes*/"
cities_dir = glob.glob(base_path)
num_pages = 4

cities_docs = {}
df_document_content = pd.DataFrame(columns=['doc_id', 'city', 'file_dir', 'one_page', 'two_pages', 'three_pages', 'four_pages'])

for city_dir in cities_dir:
    city_name = get_name(city_dir)
    if city_name == "bh":
        continue
    print("-"*100)
    print(city_name)
    # Lista os arquivos a serem lidos
    files_dir = list_json_files_dir(city_dir, city_name)
    # Faz a leitura dos arquivos
    for file_dir in tqdm.tqdm_notebook(files_dir):
        document = read_files(file_dir)
        # Verifica se foi possível extrair texto do documento
        if document['status'] == 'SUCCESS':
            # preprocessamento
            document['text_preprocessed'] = preprocess_text(document, num_pages, city_name)
            # gera variacoes do texto concatenando 1 a num_pages páginas
            page_content = merge_pages(document, num_pages)
            # gera linha a ser inserida no dataframe
            new_row = [document['file_id'], city_name, file_dir]
            new_row.extend(page_content)
            # insere nova linha no dataframe
            df_document_content.loc[len(df_document_content)] = new_row

----------------------------------------------------------------------------------------------------
sao_bento_abade


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=232.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
olaria


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
coqueiral


HBox(children=(FloatProgress(value=0.0, max=1528.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
cristais


HBox(children=(FloatProgress(value=0.0, max=1736.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
passa_vinte


HBox(children=(FloatProgress(value=0.0, max=395.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
arantina


HBox(children=(FloatProgress(value=0.0, max=937.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
ijaci


HBox(children=(FloatProgress(value=0.0, max=451.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
itamarati


HBox(children=(FloatProgress(value=0.0, max=1110.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
ribeirao_vermelho


HBox(children=(FloatProgress(value=0.0, max=684.0), HTML(value='')))


----------------------------------------------------------------------------------------------------
pirapetinga


HBox(children=(FloatProgress(value=0.0, max=1007.0), HTML(value='')))




# Merge entre texto e labels

In [6]:
df_data = pd.merge(left=df_labels, right=df_document_content, on='doc_id')
df_data.head()

Unnamed: 0,doc_id,final_meta-class,label,label_int,city,file_dir,one_page,two_pages,three_pages,four_pages
0,499fa518c7724a6e449c509b7d7bc819,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...
1,56a09c5d1d04cc95ada4d68ad22dcbd9,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...
2,6e9f35208290c1130fcccad47b50aa53,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...
3,669c029c5812ec9a31b9c211044157b4,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...
4,a22421dc45d623c9200c7e8fb1e6ca34,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...


# Separa dados em Treino, Validação e Teste

In [7]:
folds_size = {
    'train' : [0.7],
    'val' : [0.2],
    'test' : [0.1],
}

def calc_real_fold_size(folds_size, labels_df):
    """
    Calculates the real size of each fold.
    """
    num_docs = labels_df.shape[0]
    folds_size['train'].append(round(folds_size['train'][0] * num_docs))
    folds_size['test'].append(round(folds_size['test'][0] * num_docs))
    folds_size['val'].append(round(folds_size['val'][0] * num_docs))
    
    assert num_docs == (folds_size['train'][1] + folds_size['test'][1] + folds_size['val'][1]), "Folds size sum ({}) are different from num_docs ({})".format((folds_size['train'][1] + folds_size['test'][1] + folds_size['val'][1]),
    num_docs)
    
    return folds_size

def split_data(folds_size, labels_df):
    """
    Split data roundo train/val/test folds in a stratified way.
    Input:
        - args: parsed arguments
        - labels_df: dataframe containing images code and their labels
    Output:
        - labels_df: same as input with a new flag porounding images fold
    """
    test_size = folds_size['test'][1]
    val_size = folds_size['val'] [1]
    # First we split test fold
    train, test, _, _ = train_test_split(
        labels_df,
        labels_df['label_int'],
        test_size=test_size,
        random_state=SEED,
        stratify=labels_df['label_int'],
    )
    # Now we split train and val folds
    train, val, _, _ = train_test_split(
        train,
        train['label_int'],
        test_size=val_size,
        random_state=SEED,
        stratify=train['label_int'],
    )

    # Set folds
    labels_df["fold"] = "train"
    labels_df.loc[test.index, "fold"] = "test"
    labels_df.loc[val.index, "fold"] = "val"

    return labels_df

In [8]:
# Faz a separação dos dados
folds_size = calc_real_fold_size(folds_size, df_data)
df_data = split_data(folds_size, df_data)
df_data.head()

Unnamed: 0,doc_id,final_meta-class,label,label_int,city,file_dir,one_page,two_pages,three_pages,four_pages,fold
0,499fa518c7724a6e449c509b7d7bc819,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,val
1,56a09c5d1d04cc95ada4d68ad22dcbd9,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,test
2,6e9f35208290c1130fcccad47b50aa53,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,val
3,669c029c5812ec9a31b9c211044157b4,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,val
4,a22421dc45d623c9200c7e8fb1e6ca34,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,train


# Filtra termos por IDF
- Paralelizar

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import time
import multiprocessing as mp

In [10]:
def filter_data(document_text):
    document_text = document_text.split(" ")
    document_text = [term for term in document_text if term in idf_terms_to_keep]        
    return " ".join(document_text)


vectorizer = TfidfVectorizer()
# Calcula tfidf
vectorizer.fit(df_data.loc[df_data['fold'] == 'train', 'four_pages'].values)
# extrai idf
idf_values = vectorizer.idf_
idf_terms = vectorizer.get_feature_names()
# define termos a serem filtrados
index_idf_to_keep = np.nonzero(idf_values < 8)[0].tolist()
idf_values_to_keep = [idf_values[i] for i in index_idf_to_keep]
idf_terms_to_keep = [idf_terms[i] for i in index_idf_to_keep]

# Filtra termos
p = mp.Pool(mp.cpu_count()) # Data parallelism Object
t3 = time.time()
df_data['four_pages_processed'] = p.map(filter_data, df_data['four_pages'])
t4 = time.time()
print("time consuming after Parallel Processing to process the Dataset {0:.2f}s".format(round(t4-t3, 2)))
# Armazena base processada
df_data.to_csv("lstm_data/preprocessed_data_v2.csv", index=False)

time consuming after Parallel Processing to process the Dataset 44.72s


# LSTM Model

In [1]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
import tqdm
import random
from sklearn.metrics import f1_score, accuracy_score
import copy
SEED = 42

In [2]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    np.random.seed(seed)
    random.seed(seed)
    
def tokenize (text):
    return [token.text for token in tok.tokenizer(text)]

def encode_sentence(text, vocab2index, N=1000):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

set_seed(SEED)

In [3]:
# Read processed data
df_data = pd.read_csv("lstm_data/preprocessed_data_v2.csv")
#df_data = pd.read_csv("lstm_data/preprocessed_data.csv")

In [4]:
tok = spacy.load('pt_core_news_sm')

#count number of occurences of each word
counts = Counter()
for index, row in df_data.iterrows():
    counts.update(tokenize(row['four_pages_processed']))

#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

# encoding
df_data.loc[df_data['fold'] == 'train','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'train','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.loc[df_data['fold'] == 'val','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'val','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.loc[df_data['fold'] == 'test','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'test','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.head()

  app.launch_new_instance()


Unnamed: 0,doc_id,final_meta-class,label,label_int,city,file_dir,one_page,two_pages,three_pages,four_pages,fold,four_pages_processed,four_pages_encoded
0,499fa518c7724a6e449c509b7d7bc819,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,val,comissao permanente licitacoes ata dispensa ju...,"[[2, 3, 4, 5, 6, 7, 8, 2, 3, 9, 10, 11, 2, 9, ..."
1,56a09c5d1d04cc95ada4d68ad22dcbd9,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,test,comissao permanente licitacoes dia fevereiro r...,"[[2, 3, 4, 101, 109, 8, 2, 3, 9, 10, 11, 2, 9,..."
2,6e9f35208290c1130fcccad47b50aa53,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,val,praca cel centro estado cep pabx cnpj comissao...,"[[182, 183, 120, 184, 185, 186, 45, 2, 3, 4, 8..."
3,669c029c5812ec9a31b9c211044157b4,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,val,comissao permanente licitacoes dia fevereiro r...,"[[2, 3, 4, 101, 109, 8, 2, 3, 9, 10, 11, 2, 9,..."
4,a22421dc45d623c9200c7e8fb1e6ca34,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,train,comissao permanente licitacoes maio reuniu com...,"[[2, 3, 4, 235, 8, 2, 3, 9, 10, 236, 94, 12, 1..."


In [6]:
vocab2index

{'': 0,
 'UNK': 1,
 'comissao': 2,
 'permanente': 3,
 'licitacoes': 4,
 'ata': 5,
 'dispensa': 6,
 'julho': 7,
 'reuniu': 8,
 'licitacao': 9,
 'presidente': 10,
 'membros': 11,
 'rogana': 12,
 'edinamara': 13,
 'nomeados': 14,
 'portaria': 15,
 'janeiro': 16,
 'apos': 17,
 'procedida': 18,
 'avaliacao': 19,
 'contratacao': 20,
 'servicos': 21,
 'laboratorio': 22,
 'realizacao': 23,
 'exame': 24,
 'pet': 25,
 'scan': 26,
 'paciente': 27,
 'tratamento': 28,
 'oncologico': 29,
 'pedido': 30,
 'medico': 31,
 'anexo': 32,
 'empresa': 33,
 'pouso': 34,
 'alegre': 35,
 'diagnostico': 36,
 'ltda': 37,
 'aberta': 38,
 'etapa': 39,
 'julgamento': 40,
 'chegou': 41,
 'entao': 42,
 'seguinte': 43,
 'resultado': 44,
 'cnpj': 45,
 'endereco': 46,
 'amaral': 47,
 'paula': 48,
 'jose': 49,
 'alfredo': 50,
 'mg': 51,
 'seq': 52,
 'item': 53,
 'descricao': 54,
 'marca': 55,
 'un': 56,
 'qtd': 57,
 'valor': 58,
 'unitario': 59,
 'total': 60,
 'sub': 61,
 'geral': 62,
 'ato': 63,
 'continuo': 64,
 'proced

In [5]:
X_train = df_data.loc[df_data['fold'] == 'train', 'four_pages_encoded'].values
y_train = df_data.loc[df_data['fold'] == 'train', 'label_int'].values

X_val = df_data.loc[df_data['fold'] == 'val', 'four_pages_encoded'].values
y_val = df_data.loc[df_data['fold'] == 'val', 'label_int'].values

X_test = df_data.loc[df_data['fold'] == 'test', 'four_pages_encoded'].values
y_test = df_data.loc[df_data['fold'] == 'test', 'label_int'].values

In [6]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [7]:
train_ds = ReviewsDataset(X_train, y_train)
val_ds = ReviewsDataset(X_val, y_val)
test_ds = ReviewsDataset(X_test, y_test)

In [8]:
def calculate_metrics(y_true, y_pred):
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)
    
    return acc, f1_macro, f1_weighted

def train_model(model, epochs=10, lr=0.001):
    best_model = {
        "model": copy.deepcopy(model),
        "epoch": 0,
        "acc": 0,
        "f1_macro": 0,
        "f1_weighted": 0,
        "loss": 0,
    }
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    
    # if running on GPU and we want to use cuda move model there
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        y_true = []
        y_pred = []
        for x, y, l in tqdm.tqdm_notebook(train_dl):
            if use_cuda:
                x = x.long().cuda()
                y = y.long().cuda()
            #x = x.long()
            #y = y.long()
            y_hat = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_hat, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            y_pred.extend(torch.max(y_hat, 1)[1].cpu().tolist())
            y_true.extend(y.cpu().tolist())
            
        if i % 5 == 4:
            lr = lr*0.9
            parameters = filter(lambda p: p.requires_grad, model.parameters())
            optimizer = torch.optim.Adam(parameters, lr=lr)
        metrics = calculate_metrics(y_true, y_pred)
        print("Train:\n loss %.3f, accuracy %.3f, F1-Macro %.3f, F1-Weighted %.3f, " % (sum_loss/total, metrics[0], metrics[1], metrics[2]))
        val_metrics = validation_metrics(model, val_dl, use_cuda)
        if val_metrics[2] > best_model["f1_macro"]:
            best_model = {
                "model": copy.deepcopy(model),
                "epoch": i,
                "loss": val_metrics[0],
                "acc": val_metrics[1],
                "f1_macro": val_metrics[2],
                "f1_weighted": val_metrics[3],
            }
    return best_model           
        

def validation_metrics (model, valid_dl, use_cuda, is_val = True):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for x, y, l in valid_dl:
            if use_cuda:
                x = x.long().cuda()
                y = y.long().cuda()
            #x = x.long()
            #y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
            y_pred.extend(pred.cpu().tolist())
            y_true.extend(y.cpu().tolist())
        
    metrics = calculate_metrics(y_true, y_pred)
    if is_val:
        print("Val:\n loss %.3f, accuracy %.3f, F1-Macro %.3f, F1-Weighted %.3f, " % (sum_loss/total, metrics[0], metrics[1], metrics[2]))
    else:
        print("Test:\n loss %.3f, accuracy %.3f, F1-Macro %.3f, F1-Weighted %.3f, " % (sum_loss/total, metrics[0], metrics[1], metrics[2]))
        
    return (sum_loss/total, metrics[0], metrics[1], metrics[2])

In [9]:
batch_size = 24
vocab_size = len(words)
num_classes = df_data['label'].nunique()
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, )
val_dl = DataLoader(val_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [11]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [12]:
model_fixed =  LSTM_fixed_len(vocab_size, 200, 200, num_classes)
best_model = train_model(model_fixed, epochs=100, lr=0.0025)
#best_model = train_model(model_fixed, epochs=150, lr=0.01)
best_model

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 2.128, accuracy 0.390, F1-Macro 0.045, F1-Weighted 0.223, 
Val:
 loss 2.085, accuracy 0.393, F1-Macro 0.045, F1-Weighted 0.222, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.956, accuracy 0.408, F1-Macro 0.064, F1-Weighted 0.258, 
Val:
 loss 1.668, accuracy 0.469, F1-Macro 0.090, F1-Weighted 0.386, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.639, accuracy 0.471, F1-Macro 0.117, F1-Weighted 0.403, 
Val:
 loss 1.529, accuracy 0.463, F1-Macro 0.112, F1-Weighted 0.419, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.415, accuracy 0.538, F1-Macro 0.149, F1-Weighted 0.474, 
Val:
 loss 1.268, accuracy 0.579, F1-Macro 0.149, F1-Weighted 0.485, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.221, accuracy 0.580, F1-Macro 0.176, F1-Weighted 0.504, 
Val:
 loss 1.165, accuracy 0.584, F1-Macro 0.152, F1-Weighted 0.487, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.132, accuracy 0.599, F1-Macro 0.205, F1-Weighted 0.531, 
Val:
 loss 1.063, accuracy 0.632, F1-Macro 0.220, F1-Weighted 0.544, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.095, accuracy 0.618, F1-Macro 0.227, F1-Weighted 0.549, 
Val:
 loss 1.078, accuracy 0.629, F1-Macro 0.208, F1-Weighted 0.534, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.001, accuracy 0.634, F1-Macro 0.224, F1-Weighted 0.550, 
Val:
 loss 1.055, accuracy 0.613, F1-Macro 0.206, F1-Weighted 0.527, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.014, accuracy 0.632, F1-Macro 0.238, F1-Weighted 0.562, 
Val:
 loss 0.987, accuracy 0.639, F1-Macro 0.258, F1-Weighted 0.573, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.012, accuracy 0.647, F1-Macro 0.256, F1-Weighted 0.582, 
Val:
 loss 0.987, accuracy 0.672, F1-Macro 0.266, F1-Weighted 0.594, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.878, accuracy 0.692, F1-Macro 0.312, F1-Weighted 0.620, 
Val:
 loss 0.874, accuracy 0.706, F1-Macro 0.345, F1-Weighted 0.633, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.806, accuracy 0.720, F1-Macro 0.367, F1-Weighted 0.655, 
Val:
 loss 1.200, accuracy 0.647, F1-Macro 0.297, F1-Weighted 0.585, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.807, accuracy 0.737, F1-Macro 0.428, F1-Weighted 0.699, 
Val:
 loss 0.689, accuracy 0.790, F1-Macro 0.488, F1-Weighted 0.744, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.557, accuracy 0.816, F1-Macro 0.561, F1-Weighted 0.786, 
Val:
 loss 0.559, accuracy 0.819, F1-Macro 0.548, F1-Weighted 0.793, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.448, accuracy 0.849, F1-Macro 0.620, F1-Weighted 0.826, 
Val:
 loss 0.556, accuracy 0.830, F1-Macro 0.589, F1-Weighted 0.802, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.400, accuracy 0.869, F1-Macro 0.659, F1-Weighted 0.847, 
Val:
 loss 0.417, accuracy 0.889, F1-Macro 0.744, F1-Weighted 0.878, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.413, accuracy 0.873, F1-Macro 0.673, F1-Weighted 0.857, 
Val:
 loss 0.402, accuracy 0.898, F1-Macro 0.722, F1-Weighted 0.883, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.295, accuracy 0.914, F1-Macro 0.746, F1-Weighted 0.898, 
Val:
 loss 0.344, accuracy 0.892, F1-Macro 0.719, F1-Weighted 0.877, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.248, accuracy 0.923, F1-Macro 0.792, F1-Weighted 0.914, 
Val:
 loss 0.304, accuracy 0.912, F1-Macro 0.757, F1-Weighted 0.898, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.240, accuracy 0.922, F1-Macro 0.787, F1-Weighted 0.914, 
Val:
 loss 0.318, accuracy 0.903, F1-Macro 0.748, F1-Weighted 0.895, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.216, accuracy 0.931, F1-Macro 0.822, F1-Weighted 0.927, 
Val:
 loss 0.371, accuracy 0.886, F1-Macro 0.751, F1-Weighted 0.876, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.227, accuracy 0.925, F1-Macro 0.813, F1-Weighted 0.922, 
Val:
 loss 0.294, accuracy 0.924, F1-Macro 0.834, F1-Weighted 0.921, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.179, accuracy 0.945, F1-Macro 0.857, F1-Weighted 0.943, 
Val:
 loss 0.273, accuracy 0.935, F1-Macro 0.860, F1-Weighted 0.933, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.189, accuracy 0.938, F1-Macro 0.834, F1-Weighted 0.935, 
Val:
 loss 0.261, accuracy 0.933, F1-Macro 0.823, F1-Weighted 0.926, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.160, accuracy 0.947, F1-Macro 0.861, F1-Weighted 0.944, 
Val:
 loss 0.288, accuracy 0.930, F1-Macro 0.838, F1-Weighted 0.928, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.149, accuracy 0.948, F1-Macro 0.872, F1-Weighted 0.946, 
Val:
 loss 0.255, accuracy 0.948, F1-Macro 0.898, F1-Weighted 0.948, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.122, accuracy 0.963, F1-Macro 0.909, F1-Weighted 0.962, 
Val:
 loss 0.270, accuracy 0.935, F1-Macro 0.860, F1-Weighted 0.936, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.135, accuracy 0.962, F1-Macro 0.915, F1-Weighted 0.962, 
Val:
 loss 0.258, accuracy 0.943, F1-Macro 0.887, F1-Weighted 0.944, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.106, accuracy 0.968, F1-Macro 0.930, F1-Weighted 0.968, 
Val:
 loss 0.239, accuracy 0.945, F1-Macro 0.889, F1-Weighted 0.947, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.094, accuracy 0.974, F1-Macro 0.945, F1-Weighted 0.974, 
Val:
 loss 0.230, accuracy 0.945, F1-Macro 0.892, F1-Weighted 0.946, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.097, accuracy 0.975, F1-Macro 0.947, F1-Weighted 0.975, 
Val:
 loss 0.280, accuracy 0.940, F1-Macro 0.887, F1-Weighted 0.940, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.086, accuracy 0.977, F1-Macro 0.948, F1-Weighted 0.977, 
Val:
 loss 0.260, accuracy 0.944, F1-Macro 0.894, F1-Weighted 0.945, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.079, accuracy 0.978, F1-Macro 0.950, F1-Weighted 0.977, 
Val:
 loss 0.287, accuracy 0.934, F1-Macro 0.878, F1-Weighted 0.934, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.068, accuracy 0.980, F1-Macro 0.960, F1-Weighted 0.980, 
Val:
 loss 0.261, accuracy 0.948, F1-Macro 0.901, F1-Weighted 0.949, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.067, accuracy 0.980, F1-Macro 0.956, F1-Weighted 0.980, 
Val:
 loss 0.276, accuracy 0.950, F1-Macro 0.903, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.065, accuracy 0.980, F1-Macro 0.957, F1-Weighted 0.980, 
Val:
 loss 0.271, accuracy 0.941, F1-Macro 0.885, F1-Weighted 0.941, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.073, accuracy 0.978, F1-Macro 0.952, F1-Weighted 0.978, 
Val:
 loss 0.258, accuracy 0.950, F1-Macro 0.905, F1-Weighted 0.951, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.059, accuracy 0.984, F1-Macro 0.966, F1-Weighted 0.984, 
Val:
 loss 0.250, accuracy 0.953, F1-Macro 0.908, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.071, accuracy 0.982, F1-Macro 0.957, F1-Weighted 0.982, 
Val:
 loss 0.267, accuracy 0.942, F1-Macro 0.892, F1-Weighted 0.943, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.052, accuracy 0.985, F1-Macro 0.968, F1-Weighted 0.985, 
Val:
 loss 0.251, accuracy 0.948, F1-Macro 0.907, F1-Weighted 0.949, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.054, accuracy 0.986, F1-Macro 0.971, F1-Weighted 0.986, 
Val:
 loss 0.261, accuracy 0.955, F1-Macro 0.920, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.051, accuracy 0.986, F1-Macro 0.973, F1-Weighted 0.986, 
Val:
 loss 0.245, accuracy 0.950, F1-Macro 0.909, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.050, accuracy 0.986, F1-Macro 0.971, F1-Weighted 0.986, 
Val:
 loss 0.242, accuracy 0.953, F1-Macro 0.913, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.057, accuracy 0.982, F1-Macro 0.964, F1-Weighted 0.982, 
Val:
 loss 0.251, accuracy 0.947, F1-Macro 0.906, F1-Weighted 0.947, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.043, accuracy 0.987, F1-Macro 0.976, F1-Weighted 0.987, 
Val:
 loss 0.247, accuracy 0.954, F1-Macro 0.919, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.043, accuracy 0.988, F1-Macro 0.976, F1-Weighted 0.988, 
Val:
 loss 0.252, accuracy 0.950, F1-Macro 0.906, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.036, accuracy 0.991, F1-Macro 0.981, F1-Weighted 0.991, 
Val:
 loss 0.244, accuracy 0.955, F1-Macro 0.921, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.034, accuracy 0.990, F1-Macro 0.980, F1-Weighted 0.990, 
Val:
 loss 0.240, accuracy 0.955, F1-Macro 0.922, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.031, accuracy 0.991, F1-Macro 0.983, F1-Weighted 0.991, 
Val:
 loss 0.239, accuracy 0.955, F1-Macro 0.921, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.045, accuracy 0.988, F1-Macro 0.977, F1-Weighted 0.988, 
Val:
 loss 0.249, accuracy 0.955, F1-Macro 0.922, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.045, accuracy 0.988, F1-Macro 0.978, F1-Weighted 0.988, 
Val:
 loss 0.282, accuracy 0.949, F1-Macro 0.904, F1-Weighted 0.949, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.043, accuracy 0.989, F1-Macro 0.979, F1-Weighted 0.989, 
Val:
 loss 0.293, accuracy 0.950, F1-Macro 0.909, F1-Weighted 0.951, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.041, accuracy 0.990, F1-Macro 0.979, F1-Weighted 0.990, 
Val:
 loss 0.273, accuracy 0.955, F1-Macro 0.922, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.050, accuracy 0.986, F1-Macro 0.970, F1-Weighted 0.986, 
Val:
 loss 0.481, accuracy 0.915, F1-Macro 0.852, F1-Weighted 0.913, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.042, accuracy 0.988, F1-Macro 0.976, F1-Weighted 0.988, 
Val:
 loss 0.244, accuracy 0.953, F1-Macro 0.919, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.038, accuracy 0.991, F1-Macro 0.980, F1-Weighted 0.991, 
Val:
 loss 0.294, accuracy 0.942, F1-Macro 0.897, F1-Weighted 0.942, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.029, accuracy 0.992, F1-Macro 0.984, F1-Weighted 0.992, 
Val:
 loss 0.278, accuracy 0.952, F1-Macro 0.917, F1-Weighted 0.952, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.027, accuracy 0.993, F1-Macro 0.987, F1-Weighted 0.993, 
Val:
 loss 0.276, accuracy 0.952, F1-Macro 0.915, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.028, accuracy 0.993, F1-Macro 0.986, F1-Weighted 0.993, 
Val:
 loss 0.268, accuracy 0.955, F1-Macro 0.914, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.038, accuracy 0.989, F1-Macro 0.975, F1-Weighted 0.989, 
Val:
 loss 0.264, accuracy 0.953, F1-Macro 0.912, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.035, accuracy 0.989, F1-Macro 0.977, F1-Weighted 0.989, 
Val:
 loss 0.273, accuracy 0.953, F1-Macro 0.920, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.024, accuracy 0.994, F1-Macro 0.988, F1-Weighted 0.994, 
Val:
 loss 0.265, accuracy 0.963, F1-Macro 0.936, F1-Weighted 0.963, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.027, accuracy 0.993, F1-Macro 0.981, F1-Weighted 0.993, 
Val:
 loss 0.273, accuracy 0.955, F1-Macro 0.923, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.019, accuracy 0.994, F1-Macro 0.990, F1-Weighted 0.994, 
Val:
 loss 0.271, accuracy 0.955, F1-Macro 0.926, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.027, accuracy 0.994, F1-Macro 0.987, F1-Weighted 0.994, 
Val:
 loss 0.297, accuracy 0.951, F1-Macro 0.915, F1-Weighted 0.951, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.018, accuracy 0.995, F1-Macro 0.990, F1-Weighted 0.995, 
Val:
 loss 0.322, accuracy 0.953, F1-Macro 0.913, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.025, accuracy 0.993, F1-Macro 0.986, F1-Weighted 0.993, 
Val:
 loss 0.286, accuracy 0.952, F1-Macro 0.915, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.021, accuracy 0.994, F1-Macro 0.989, F1-Weighted 0.994, 
Val:
 loss 0.285, accuracy 0.954, F1-Macro 0.921, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.027, accuracy 0.991, F1-Macro 0.982, F1-Weighted 0.991, 
Val:
 loss 0.267, accuracy 0.956, F1-Macro 0.927, F1-Weighted 0.956, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.017, accuracy 0.996, F1-Macro 0.992, F1-Weighted 0.996, 
Val:
 loss 0.291, accuracy 0.950, F1-Macro 0.906, F1-Weighted 0.951, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.022, accuracy 0.995, F1-Macro 0.990, F1-Weighted 0.995, 
Val:
 loss 0.285, accuracy 0.951, F1-Macro 0.907, F1-Weighted 0.952, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.014, accuracy 0.995, F1-Macro 0.991, F1-Weighted 0.995, 
Val:
 loss 0.278, accuracy 0.956, F1-Macro 0.923, F1-Weighted 0.956, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.016, accuracy 0.995, F1-Macro 0.990, F1-Weighted 0.995, 
Val:
 loss 0.296, accuracy 0.949, F1-Macro 0.905, F1-Weighted 0.949, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.014, accuracy 0.996, F1-Macro 0.990, F1-Weighted 0.996, 
Val:
 loss 0.290, accuracy 0.950, F1-Macro 0.907, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.022, accuracy 0.994, F1-Macro 0.989, F1-Weighted 0.994, 
Val:
 loss 0.300, accuracy 0.950, F1-Macro 0.908, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.013, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.317, accuracy 0.952, F1-Macro 0.911, F1-Weighted 0.952, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.012, accuracy 0.997, F1-Macro 0.994, F1-Weighted 0.997, 
Val:
 loss 0.313, accuracy 0.950, F1-Macro 0.912, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.013, accuracy 0.996, F1-Macro 0.994, F1-Weighted 0.996, 
Val:
 loss 0.301, accuracy 0.954, F1-Macro 0.917, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.016, accuracy 0.996, F1-Macro 0.994, F1-Weighted 0.996, 
Val:
 loss 0.317, accuracy 0.950, F1-Macro 0.912, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.013, accuracy 0.996, F1-Macro 0.992, F1-Weighted 0.996, 
Val:
 loss 0.319, accuracy 0.949, F1-Macro 0.904, F1-Weighted 0.949, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.011, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.327, accuracy 0.950, F1-Macro 0.914, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.011, accuracy 0.997, F1-Macro 0.994, F1-Weighted 0.997, 
Val:
 loss 0.314, accuracy 0.951, F1-Macro 0.913, F1-Weighted 0.951, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.014, accuracy 0.997, F1-Macro 0.994, F1-Weighted 0.997, 
Val:
 loss 0.314, accuracy 0.954, F1-Macro 0.918, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.015, accuracy 0.996, F1-Macro 0.992, F1-Weighted 0.996, 
Val:
 loss 0.294, accuracy 0.958, F1-Macro 0.922, F1-Weighted 0.957, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.011, accuracy 0.996, F1-Macro 0.994, F1-Weighted 0.996, 
Val:
 loss 0.295, accuracy 0.954, F1-Macro 0.921, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.010, accuracy 0.997, F1-Macro 0.994, F1-Weighted 0.997, 
Val:
 loss 0.341, accuracy 0.954, F1-Macro 0.918, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.008, accuracy 0.998, F1-Macro 0.997, F1-Weighted 0.998, 
Val:
 loss 0.337, accuracy 0.957, F1-Macro 0.917, F1-Weighted 0.956, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.006, accuracy 0.998, F1-Macro 0.997, F1-Weighted 0.998, 
Val:
 loss 0.362, accuracy 0.951, F1-Macro 0.910, F1-Weighted 0.951, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.008, accuracy 0.998, F1-Macro 0.996, F1-Weighted 0.998, 
Val:
 loss 0.315, accuracy 0.954, F1-Macro 0.923, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.010, accuracy 0.997, F1-Macro 0.996, F1-Weighted 0.997, 
Val:
 loss 0.344, accuracy 0.954, F1-Macro 0.913, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.007, accuracy 0.997, F1-Macro 0.997, F1-Weighted 0.997, 
Val:
 loss 0.347, accuracy 0.953, F1-Macro 0.913, F1-Weighted 0.953, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.006, accuracy 0.998, F1-Macro 0.997, F1-Weighted 0.998, 
Val:
 loss 0.360, accuracy 0.951, F1-Macro 0.909, F1-Weighted 0.950, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.005, accuracy 0.998, F1-Macro 0.998, F1-Weighted 0.998, 
Val:
 loss 0.319, accuracy 0.958, F1-Macro 0.923, F1-Weighted 0.958, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.007, accuracy 0.998, F1-Macro 0.995, F1-Weighted 0.998, 
Val:
 loss 0.317, accuracy 0.958, F1-Macro 0.921, F1-Weighted 0.958, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.005, accuracy 0.998, F1-Macro 0.997, F1-Weighted 0.998, 
Val:
 loss 0.315, accuracy 0.955, F1-Macro 0.915, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.007, accuracy 0.998, F1-Macro 0.996, F1-Weighted 0.998, 
Val:
 loss 0.327, accuracy 0.955, F1-Macro 0.918, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.009, accuracy 0.998, F1-Macro 0.997, F1-Weighted 0.998, 
Val:
 loss 0.340, accuracy 0.955, F1-Macro 0.916, F1-Weighted 0.955, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.007, accuracy 0.998, F1-Macro 0.997, F1-Weighted 0.998, 
Val:
 loss 0.332, accuracy 0.956, F1-Macro 0.916, F1-Weighted 0.956, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.009, accuracy 0.997, F1-Macro 0.994, F1-Weighted 0.997, 
Val:
 loss 0.343, accuracy 0.954, F1-Macro 0.918, F1-Weighted 0.954, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.013, accuracy 0.997, F1-Macro 0.994, F1-Weighted 0.997, 
Val:
 loss 0.304, accuracy 0.955, F1-Macro 0.920, F1-Weighted 0.955, 


{'model': LSTM_fixed_len(
   (embeddings): Embedding(12822, 200, padding_idx=0)
   (lstm): LSTM(200, 200, batch_first=True)
   (linear): Linear(in_features=200, out_features=13, bias=True)
   (dropout): Dropout(p=0.3, inplace=False)
 ),
 'epoch': 61,
 'loss': 0.26538012502552877,
 'acc': 0.9628647214854111,
 'f1_macro': 0.9364803750981582,
 'f1_weighted': 0.9630990616617998}

In [13]:
validation_metrics(best_model['model'], val_dl, use_cuda = True, is_val = True)
validation_metrics(best_model['model'], test_dl, use_cuda = True, is_val = False)

  self.dropout, self.training, self.bidirectional, self.batch_first)


Val:
 loss 0.265, accuracy 0.963, F1-Macro 0.936, F1-Weighted 0.963, 
Test:
 loss 0.263, accuracy 0.952, F1-Macro 0.918, F1-Weighted 0.952, 


(0.2631526180191699, 0.952212389380531, 0.9183689828509765, 0.952362619875942)

In [12]:
class LSTM_variable_input(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 13)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return out

In [13]:
model = LSTM_variable_input(vocab_size, 150, 150)
train_model(model, epochs=50, lr=0.005)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 1.036, accuracy 0.680, F1-Macro 0.542, F1-Weighted 0.662, 
Val:
 loss 0.600, accuracy 0.815, F1-Macro 0.673, F1-Weighted 0.795, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.432, accuracy 0.870, F1-Macro 0.787, F1-Weighted 0.867, 
Val:
 loss 0.470, accuracy 0.875, F1-Macro 0.809, F1-Weighted 0.875, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.258, accuracy 0.924, F1-Macro 0.874, F1-Weighted 0.923, 
Val:
 loss 0.421, accuracy 0.894, F1-Macro 0.836, F1-Weighted 0.892, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.200, accuracy 0.939, F1-Macro 0.896, F1-Weighted 0.939, 
Val:
 loss 0.418, accuracy 0.910, F1-Macro 0.855, F1-Weighted 0.909, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.181, accuracy 0.950, F1-Macro 0.917, F1-Weighted 0.950, 
Val:
 loss 0.466, accuracy 0.894, F1-Macro 0.828, F1-Weighted 0.892, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.147, accuracy 0.958, F1-Macro 0.923, F1-Weighted 0.957, 
Val:
 loss 0.442, accuracy 0.905, F1-Macro 0.856, F1-Weighted 0.905, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.103, accuracy 0.968, F1-Macro 0.945, F1-Weighted 0.968, 
Val:
 loss 0.499, accuracy 0.899, F1-Macro 0.845, F1-Weighted 0.898, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.070, accuracy 0.979, F1-Macro 0.965, F1-Weighted 0.979, 
Val:
 loss 0.484, accuracy 0.906, F1-Macro 0.859, F1-Weighted 0.907, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.058, accuracy 0.981, F1-Macro 0.966, F1-Weighted 0.981, 
Val:
 loss 0.493, accuracy 0.905, F1-Macro 0.858, F1-Weighted 0.904, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.060, accuracy 0.982, F1-Macro 0.969, F1-Weighted 0.982, 
Val:
 loss 0.514, accuracy 0.892, F1-Macro 0.835, F1-Weighted 0.892, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.061, accuracy 0.982, F1-Macro 0.969, F1-Weighted 0.981, 
Val:
 loss 0.524, accuracy 0.913, F1-Macro 0.877, F1-Weighted 0.913, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.051, accuracy 0.985, F1-Macro 0.973, F1-Weighted 0.985, 
Val:
 loss 0.492, accuracy 0.909, F1-Macro 0.856, F1-Weighted 0.909, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.040, accuracy 0.988, F1-Macro 0.979, F1-Weighted 0.988, 
Val:
 loss 0.527, accuracy 0.908, F1-Macro 0.859, F1-Weighted 0.908, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.040, accuracy 0.988, F1-Macro 0.979, F1-Weighted 0.988, 
Val:
 loss 0.525, accuracy 0.904, F1-Macro 0.856, F1-Weighted 0.905, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.040, accuracy 0.989, F1-Macro 0.982, F1-Weighted 0.989, 
Val:
 loss 0.618, accuracy 0.902, F1-Macro 0.855, F1-Weighted 0.902, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.031, accuracy 0.989, F1-Macro 0.979, F1-Weighted 0.989, 
Val:
 loss 0.617, accuracy 0.907, F1-Macro 0.855, F1-Weighted 0.907, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.034, accuracy 0.989, F1-Macro 0.983, F1-Weighted 0.989, 
Val:
 loss 0.621, accuracy 0.902, F1-Macro 0.848, F1-Weighted 0.901, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.034, accuracy 0.990, F1-Macro 0.983, F1-Weighted 0.990, 
Val:
 loss 0.625, accuracy 0.909, F1-Macro 0.867, F1-Weighted 0.909, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.037, accuracy 0.989, F1-Macro 0.981, F1-Weighted 0.989, 
Val:
 loss 0.601, accuracy 0.906, F1-Macro 0.863, F1-Weighted 0.906, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.031, accuracy 0.991, F1-Macro 0.985, F1-Weighted 0.991, 
Val:
 loss 0.594, accuracy 0.906, F1-Macro 0.859, F1-Weighted 0.906, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.028, accuracy 0.993, F1-Macro 0.987, F1-Weighted 0.993, 
Val:
 loss 0.607, accuracy 0.900, F1-Macro 0.842, F1-Weighted 0.901, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.020, accuracy 0.995, F1-Macro 0.991, F1-Weighted 0.995, 
Val:
 loss 0.573, accuracy 0.908, F1-Macro 0.855, F1-Weighted 0.907, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.016, accuracy 0.995, F1-Macro 0.991, F1-Weighted 0.995, 
Val:
 loss 0.635, accuracy 0.905, F1-Macro 0.852, F1-Weighted 0.904, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.029, accuracy 0.992, F1-Macro 0.988, F1-Weighted 0.992, 
Val:
 loss 0.619, accuracy 0.905, F1-Macro 0.851, F1-Weighted 0.906, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.023, accuracy 0.992, F1-Macro 0.987, F1-Weighted 0.992, 
Val:
 loss 0.642, accuracy 0.908, F1-Macro 0.853, F1-Weighted 0.908, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.019, accuracy 0.994, F1-Macro 0.990, F1-Weighted 0.994, 
Val:
 loss 0.650, accuracy 0.906, F1-Macro 0.859, F1-Weighted 0.906, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.019, accuracy 0.993, F1-Macro 0.989, F1-Weighted 0.993, 
Val:
 loss 0.636, accuracy 0.908, F1-Macro 0.858, F1-Weighted 0.907, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.020, accuracy 0.994, F1-Macro 0.989, F1-Weighted 0.994, 
Val:
 loss 0.632, accuracy 0.900, F1-Macro 0.843, F1-Weighted 0.899, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.013, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.641, accuracy 0.905, F1-Macro 0.851, F1-Weighted 0.905, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.013, accuracy 0.996, F1-Macro 0.993, F1-Weighted 0.996, 
Val:
 loss 0.688, accuracy 0.905, F1-Macro 0.854, F1-Weighted 0.906, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.013, accuracy 0.996, F1-Macro 0.993, F1-Weighted 0.996, 
Val:
 loss 0.695, accuracy 0.904, F1-Macro 0.848, F1-Weighted 0.903, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.014, accuracy 0.995, F1-Macro 0.991, F1-Weighted 0.995, 
Val:
 loss 0.627, accuracy 0.912, F1-Macro 0.863, F1-Weighted 0.912, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.014, accuracy 0.996, F1-Macro 0.994, F1-Weighted 0.996, 
Val:
 loss 0.636, accuracy 0.915, F1-Macro 0.871, F1-Weighted 0.915, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.012, accuracy 0.995, F1-Macro 0.992, F1-Weighted 0.995, 
Val:
 loss 0.644, accuracy 0.912, F1-Macro 0.863, F1-Weighted 0.911, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.017, accuracy 0.995, F1-Macro 0.992, F1-Weighted 0.995, 
Val:
 loss 0.628, accuracy 0.920, F1-Macro 0.881, F1-Weighted 0.920, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.012, accuracy 0.996, F1-Macro 0.995, F1-Weighted 0.996, 
Val:
 loss 0.618, accuracy 0.913, F1-Macro 0.865, F1-Weighted 0.913, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.012, accuracy 0.996, F1-Macro 0.994, F1-Weighted 0.996, 
Val:
 loss 0.684, accuracy 0.911, F1-Macro 0.861, F1-Weighted 0.911, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.012, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.650, accuracy 0.912, F1-Macro 0.865, F1-Weighted 0.912, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.009, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.647, accuracy 0.917, F1-Macro 0.872, F1-Weighted 0.917, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.009, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.659, accuracy 0.915, F1-Macro 0.868, F1-Weighted 0.915, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.010, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.687, accuracy 0.918, F1-Macro 0.874, F1-Weighted 0.918, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.009, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.711, accuracy 0.914, F1-Macro 0.867, F1-Weighted 0.915, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.009, accuracy 0.996, F1-Macro 0.994, F1-Weighted 0.996, 
Val:
 loss 0.738, accuracy 0.911, F1-Macro 0.859, F1-Weighted 0.911, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.010, accuracy 0.996, F1-Macro 0.994, F1-Weighted 0.996, 
Val:
 loss 0.728, accuracy 0.912, F1-Macro 0.860, F1-Weighted 0.911, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.006, accuracy 0.998, F1-Macro 0.998, F1-Weighted 0.998, 
Val:
 loss 0.741, accuracy 0.916, F1-Macro 0.870, F1-Weighted 0.916, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.008, accuracy 0.998, F1-Macro 0.997, F1-Weighted 0.998, 
Val:
 loss 0.760, accuracy 0.917, F1-Macro 0.865, F1-Weighted 0.917, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.007, accuracy 0.998, F1-Macro 0.996, F1-Weighted 0.998, 
Val:
 loss 0.804, accuracy 0.917, F1-Macro 0.869, F1-Weighted 0.917, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.009, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.769, accuracy 0.913, F1-Macro 0.863, F1-Weighted 0.913, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.008, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.763, accuracy 0.912, F1-Macro 0.866, F1-Weighted 0.912, 


HBox(children=(FloatProgress(value=0.0, max=165.0), HTML(value='')))


Train:
 loss 0.010, accuracy 0.997, F1-Macro 0.995, F1-Weighted 0.997, 
Val:
 loss 0.724, accuracy 0.913, F1-Macro 0.859, F1-Weighted 0.914, 


# Sandbox

In [40]:
df_train = df_data.loc[df_data['fold']=="train"].copy()

In [41]:
tok = spacy.load('pt_core_news_sm')

#count number of occurences of each word

counts = Counter()
for index, row in df_train.iterrows():
    counts.update(tokenize(row['four_pages_processed']))
 #creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)
print(len(words))   


13081


In [32]:

# encoding
df_data.loc[df_data['fold'] == 'train','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'train','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.loc[df_data['fold'] == 'val','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'val','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.loc[df_data['fold'] == 'test','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'test','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.head()

13081


  app.launch_new_instance()


Unnamed: 0,doc_id,final_meta-class,label,label_int,city,file_dir,one_page,two_pages,three_pages,four_pages,fold,four_pages_processed,four_pages_encoded
0,499fa518c7724a6e449c509b7d7bc819,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,train,comissao permanente licitacoes ata dispensa ju...,"[[2, 3, 4, 5, 6, 7, 8, 2, 3, 9, 10, 11, 2, 9, ..."
1,56a09c5d1d04cc95ada4d68ad22dcbd9,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,test,comissao permanente licitacoes dia fevereiro r...,"[[2, 3, 4, 101, 148, 8, 2, 3, 9, 10, 11, 2, 9,..."
2,6e9f35208290c1130fcccad47b50aa53,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,train,praca cel centro estado cep pabx cnpj comissao...,"[[109, 110, 111, 112, 113, 114, 45, 2, 3, 4, 8..."
3,669c029c5812ec9a31b9c211044157b4,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,train,comissao permanente licitacoes dia fevereiro r...,"[[2, 3, 4, 101, 148, 8, 2, 3, 9, 10, 11, 2, 9,..."
4,a22421dc45d623c9200c7e8fb1e6ca34,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,train,comissao permanente licitacoes maio reuniu com...,"[[2, 3, 4, 224, 8, 2, 3, 9, 10, 225, 94, 12, 1..."


In [34]:
tok = spacy.load('pt_core_news_sm')

#count number of occurences of each word
counts = Counter()
for index, row in df_data.iterrows():
    counts.update(tokenize(row['four_pages_processed']))

#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)
print(len(words))
# encoding
df_data.loc[df_data['fold'] == 'train','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'train','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.loc[df_data['fold'] == 'val','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'val','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.loc[df_data['fold'] == 'test','four_pages_encoded'] = df_data.loc[df_data['fold'] == 'test','four_pages_processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_data.head()

13081


  app.launch_new_instance()


Unnamed: 0,doc_id,final_meta-class,label,label_int,city,file_dir,one_page,two_pages,three_pages,four_pages,fold,four_pages_processed,four_pages_encoded
0,499fa518c7724a6e449c509b7d7bc819,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,comissao permanente licitacoes ata dispensa ju...,train,comissao permanente licitacoes ata dispensa ju...,"[[2, 3, 4, 5, 6, 7, 8, 2, 3, 9, 10, 11, 2, 9, ..."
1,56a09c5d1d04cc95ada4d68ad22dcbd9,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,test,comissao permanente licitacoes dia fevereiro r...,"[[2, 3, 4, 101, 109, 8, 2, 3, 9, 10, 11, 2, 9,..."
2,6e9f35208290c1130fcccad47b50aa53,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,praca cel centro estado cep pabx cnpj comissa...,train,praca cel centro estado cep pabx cnpj comissao...,"[[182, 183, 120, 184, 185, 186, 45, 2, 3, 4, 8..."
3,669c029c5812ec9a31b9c211044157b4,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,comissao permanente licitacoes dia fevereiro r...,train,comissao permanente licitacoes dia fevereiro r...,"[[2, 3, 4, 101, 109, 8, 2, 3, 9, 10, 11, 2, 9,..."
4,a22421dc45d623c9200c7e8fb1e6ca34,ATA,ata dispensa licitacao,0,cristais,../data/290-licitacoes-cristais/data/files_jso...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,comissao permanente licitacoes maio reuniu com...,train,comissao permanente licitacoes maio reuniu com...,"[[2, 3, 4, 235, 8, 2, 3, 9, 10, 236, 94, 12, 1..."


In [30]:
def encode_sentence(text, vocab2index, N=1000):
    print("-"*20,' TEXT ', "-"*20, "\n")
    print(text)
    tokenized = tokenize(text)
    print("-"*20,' TOKENIZE ', "-"*20, "\n")
    print(tokenized)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    print("-"*20,' ENCODED ', "-"*20, "\n")
    print(enc1)
    length = min(N, len(enc1))
    print("-"*20,' LENGTH ', "-"*20, "\n")
    print(length)
    encoded[:length] = enc1[:length]
    print("-"*20,' FINAL ', "-"*20, "\n")
    print(encoded)
    return encoded, length

In [33]:
df_data.loc[100,['four_pages_processed']].apply(lambda x: np.array(encode_sentence(x,vocab2index )))

--------------------  TEXT  -------------------- 

comissao permanente licitacoes dias abril reuniu comissao permanente licitacao presidente membros comissao licitacao rogana edinamara nomeados portaria janeiro apos procedida avaliacao dispensa licitacao prestacao servicos sonorizacao mecanica trio eletrico evento festivo sabado aleluia praca jose ferreira filho dia abril cotados servicos seguintes empresas fernando silva cnpj endereco av maringa bairro maringa boa esperanca mg valor global quatro mil quinhentos reais elielson eliazar cnpj endereco rua beline moreira maia centro mg valor global tres mil novecentos reais cnpj endereco rua centro candeias mg valor global quatro mil oitocentos reais aberta etapa julgamento chegou entao seguinte resultado final elielson eliazar cnpj endereco beline moreira maia centro mg seq item descricao marca un qtd valor unitario valor total servicos sonorizacao mecanica sv locacao trio eletrico servicos djs locucao segurancas incluso alimentacao trans

  """Entry point for launching an IPython kernel.


four_pages_processed    [[2, 3, 4, 315, 444, 8, 2, 3, 9, 10, 11, 2, 9,...
Name: 100, dtype: object

In [43]:
df_data.loc[df_data['four_pages'].str.contains("zoom")]

Unnamed: 0,doc_id,final_meta-class,label,label_int,city,file_dir,one_page,two_pages,three_pages,four_pages,fold,four_pages_processed,four_pages_encoded
365,6aa821e1e916cac34caf106284ff38b8,ATA,ata registro precos,3,coqueiral,../data/289-licitacoes-coqueiral/data/files_js...,ata registro precos processo administrativo l...,ata registro precos processo administrativo l...,ata registro precos processo administrativo l...,ata registro precos processo administrativo l...,val,ata registro precos processo administrativo li...,"[[5, 724, 140, 449, 520, 465, 1501, 505, 1586,..."
3605,e58afaaca85d24ec10e23325b331fa80,OUTROS,outros,12,sao_bento_abade,../data/381-licitacoes-sao-bento-abade/data/fi...,cnpj fone email pregao presencial anexo termo...,cnpj fone email pregao presencial anexo termo...,cnpj fone email pregao presencial anexo termo...,cnpj fone email pregao presencial anexo termo...,test,cnpj fone email pregao presencial anexo termo ...,"[[45, 1581, 1584, 505, 1586, 32, 165, 1350, 46..."
3975,4ee123087d9273f8965d5c6948eb21c7,OUTROS,contrato,14,coqueiral,../data/289-licitacoes-coqueiral/data/files_js...,contrato administrativo processo administrati...,contrato administrativo processo administrati...,contrato administrativo processo administrati...,contrato administrativo processo administrati...,val,contrato administrativo processo administrativ...,"[[132, 520, 449, 520, 465, 1501, 505, 1586, 38..."
