### Preparação do ambiente

In [None]:
# Instalação de dependências
%pip install -r requirements.txt

In [82]:
import tensorflow
import pandas as pd
from typing import List, Tuple
import numpy as np

In [83]:
# Initialize the random number generator
import random
random.seed(0)

# Ignore the warnings
import warnings
warnings.filterwarnings("ignore")

### Loading e pre processamento dos dados

In [84]:
# Definição de caminhos dos arquivos de corpus
train_file = "Penn Treebank/Secs0-18 - training"
dev_file   = "Penn Treebank/Secs19-21 - development"
test_file  = "Penn Treebank/Secs22-24 - testing"

In [85]:
# ------------------------------
# Funções de pré-processamento do texto
def carregar_corpus(caminho_arquivo: str) -> str:
    """
    Lê o arquivo completo em utf-8 e retorna como string.
    """
    with open(caminho_arquivo, "r", encoding="utf-8") as f:
        return f.read()

def dividir_em_sentencas(texto: str) -> List[str]:
    """
    Divide o texto em sentenças, assumindo uma sentença por linha.
    """
    return texto.strip().split("\n")

def processar_sentenca(sentenca: str) -> List[Tuple[str, str]]:
    """
    Separa tokens de formato palavra_TAG em pares (palavra, tag).
    Converte para lowercase, exceto nomes próprios (NNP, NNPS).
    """
    tokens = sentenca.strip().split()
    pares = []
    for token in tokens:
        if "_" in token:
            palavra, tag = token.rsplit("_", 1)
            if not(tag == 'NNP' or tag == 'NNPS'):
                palavra = palavra.lower()
            pares.append((palavra, tag))
    return pares

def construir_dataframe(sentencas: List[str]) -> pd.DataFrame:
    """
    Cria um DataFrame 'longo' com colunas:
    sentenca (ID), palavra, tag e posicao_na_sentenca.
    """
    dados = []
    for sent_id, sentenca in enumerate(sentencas):
        palavras_tags = processar_sentenca(sentenca)
        for posicao, (palavra, tag) in enumerate(palavras_tags):
            dados.append({
                "sentenca": sent_id + 1,
                "palavra": palavra,
                "tag": tag,
                "posicao_na_sentenca": posicao
            })

    return pd.DataFrame(dados)

In [86]:
# Carregando e processando os datasets
texto_raw_train = carregar_corpus(train_file)
texto_raw_dev = carregar_corpus(dev_file)
texto_raw_teste = carregar_corpus(test_file)
sentencas_train = dividir_em_sentencas(texto_raw_train + texto_raw_dev + texto_raw_teste)
df_treino = construir_dataframe(sentencas_train)
df_treino.fillna(method="ffill", inplace=True)

# Primeiras linhas
df_treino.head()

Unnamed: 0,sentenca,palavra,tag,posicao_na_sentenca
0,1,Pierre,NNP,0
1,1,Vinken,NNP,1
2,1,",",",",2
3,1,61,CD,3
4,1,years,NNS,4


In [87]:
df_treino.shape

(1173766, 4)

In [88]:
df_treino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1173766 entries, 0 to 1173765
Data columns (total 4 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   sentenca             1173766 non-null  int64 
 1   palavra              1173766 non-null  object
 2   tag                  1173766 non-null  object
 3   posicao_na_sentenca  1173766 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 35.8+ MB


#### Vocabulário e lista de tags

In [89]:
tags = list(set(df_treino["tag"].values))

In [90]:
tags

['PDT',
 '#',
 'NN',
 'WDT',
 'JJR',
 'JJS',
 '.',
 'CD',
 'MD',
 '``',
 'LS',
 'CC',
 'TO',
 '$',
 'JJ',
 'VBD',
 'PRP$',
 "''",
 'PRP',
 'RBS',
 'SYM',
 'VBZ',
 'WP$',
 'UH',
 'POS',
 'RB',
 ':',
 '-LRB-',
 'VB',
 'EX',
 'DT',
 '-RRB-',
 'VBP',
 'WP',
 'NNS',
 'NNPS',
 'RP',
 'FW',
 'NNP',
 'RBR',
 'VBN',
 'WRB',
 'VBG',
 ',',
 'IN']

In [91]:
palavras = list(set(df_treino["palavra"].values))
palavras.append("<PAD>") # Padding

In [92]:
palavras

['pursue',
 'crab',
 '830.5',
 'Pasquale',
 '99.93',
 'farm-machine',
 'factored',
 'Bookman',
 'diGenova',
 'Dedham',
 '64.1',
 'attorneys',
 'Robinson-Humphrey',
 'land-rich',
 'inflation-adjusted',
 'cuisine',
 'pilloried',
 'hydroelectric',
 'couplets',
 'Chateauvallon',
 'KPMG',
 'inter-office',
 'Medicis',
 'Hinzack',
 'Hilton',
 'geometric',
 'horse-breeding',
 'signed',
 'visitors',
 'cart',
 'Levinson',
 '20th-century',
 'sooner',
 'fertilization',
 'restroom',
 'toned',
 'furloughs',
 'Lerach',
 'intensified',
 'Jersey',
 'Bekaa',
 'capital-markets',
 'Shaffer',
 'marble-encased',
 'spared',
 'sino-u.s.',
 'Dallas-Barcelona',
 'oil-rig',
 'cathode-ray',
 'Tashkent',
 'Elco',
 'near-panic',
 '1.35',
 'transforming',
 'Kuse',
 'devoured',
 'Batangas',
 'top-performing',
 'jamaican',
 '230,000',
 'noncommercial',
 'detroit-based',
 'purple',
 'hens',
 'meat-processing',
 'wistful',
 '820.4',
 'wrap',
 'handling',
 'Pincus',
 'offhandedly',
 '17.50',
 '130.13',
 'yield',
 'genera

#### Separar em sentenças

In [93]:
class LerSentencas(object): 
    
    def __init__(self, dados):
        self.dados = dados
        self.vazio = False
        agg_func = lambda s: [(w, p) for w, p in zip(s["palavra"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.agrupado = self.dados.groupby("sentenca").apply(agg_func)
        self.sentencas = [s for s in self.agrupado]

In [94]:
sentencas_treino = LerSentencas(df_treino).sentencas

In [95]:
sentencas_treino[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

### Geração de embeddings

In [96]:
# Convertendo palavras e tags em números
word2id = {w: i for i, w in enumerate(palavras)}
tag2id = {t: i for i, t in enumerate(tags)}

In [97]:
word2id

{'pursue': 0,
 'crab': 1,
 '830.5': 2,
 'Pasquale': 3,
 '99.93': 4,
 'farm-machine': 5,
 'factored': 6,
 'Bookman': 7,
 'diGenova': 8,
 'Dedham': 9,
 '64.1': 10,
 'attorneys': 11,
 'Robinson-Humphrey': 12,
 'land-rich': 13,
 'inflation-adjusted': 14,
 'cuisine': 15,
 'pilloried': 16,
 'hydroelectric': 17,
 'couplets': 18,
 'Chateauvallon': 19,
 'KPMG': 20,
 'inter-office': 21,
 'Medicis': 22,
 'Hinzack': 23,
 'Hilton': 24,
 'geometric': 25,
 'horse-breeding': 26,
 'signed': 27,
 'visitors': 28,
 'cart': 29,
 'Levinson': 30,
 '20th-century': 31,
 'sooner': 32,
 'fertilization': 33,
 'restroom': 34,
 'toned': 35,
 'furloughs': 36,
 'Lerach': 37,
 'intensified': 38,
 'Jersey': 39,
 'Bekaa': 40,
 'capital-markets': 41,
 'Shaffer': 42,
 'marble-encased': 43,
 'spared': 44,
 'sino-u.s.': 45,
 'Dallas-Barcelona': 46,
 'oil-rig': 47,
 'cathode-ray': 48,
 'Tashkent': 49,
 'Elco': 50,
 'near-panic': 51,
 '1.35': 52,
 'transforming': 53,
 'Kuse': 54,
 'devoured': 55,
 'Batangas': 56,
 'top-perfor

In [98]:
tag2id

{'PDT': 0,
 '#': 1,
 'NN': 2,
 'WDT': 3,
 'JJR': 4,
 'JJS': 5,
 '.': 6,
 'CD': 7,
 'MD': 8,
 '``': 9,
 'LS': 10,
 'CC': 11,
 'TO': 12,
 '$': 13,
 'JJ': 14,
 'VBD': 15,
 'PRP$': 16,
 "''": 17,
 'PRP': 18,
 'RBS': 19,
 'SYM': 20,
 'VBZ': 21,
 'WP$': 22,
 'UH': 23,
 'POS': 24,
 'RB': 25,
 ':': 26,
 '-LRB-': 27,
 'VB': 28,
 'EX': 29,
 'DT': 30,
 '-RRB-': 31,
 'VBP': 32,
 'WP': 33,
 'NNS': 34,
 'NNPS': 35,
 'RP': 36,
 'FW': 37,
 'NNP': 38,
 'RBR': 39,
 'VBN': 40,
 'WRB': 41,
 'VBG': 42,
 ',': 43,
 'IN': 44}

In [99]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 50
X = [[word2id[w[0]] for w in s] for s in sentencas_treino]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=len(palavras)-1)
y = [[tag2id[w[1]] for w in s] for s in sentencas_treino]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2id["."])

In [100]:
from tensorflow.keras.utils import to_categorical
y = [to_categorical(i, num_classes=len(tags)) for i in y]

In [101]:
y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [102]:
X[0]

array([ 9561, 41022, 40239, 20522,  8485, 23330, 40239, 37455, 44335,
       17736, 28894, 30439,   525, 10816,  9249, 21793,  3475, 36968,
       47343, 47343, 47343, 47343, 47343, 47343, 47343, 47343, 47343,
       47343, 47343, 47343, 47343, 47343, 47343, 47343, 47343, 47343,
       47343, 47343, 47343, 47343, 47343, 47343, 47343, 47343, 47343,
       47343, 47343, 47343, 47343, 47343], dtype=int32)

#### Separação em conjunto de treino e teste

In [103]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20)

### Definição de arquiteturas

In [105]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input

#### LSTM

In [106]:
input_uni = Input(shape=(max_len,)) # Camada de Input
modelo_uni = Embedding(input_dim=len(palavras), output_dim=100, input_length=max_len)(input_uni) # Camada de Word embedding com dimensão maior
modelo_uni = Dropout(0.2)(modelo_uni) # Camada de Dropout com taxa menor
modelo_uni = LSTM(units=64, return_sequences=True, recurrent_dropout=0.2)(modelo_uni) # Camada de LSTM unidirecional com mais unidades
modelo_uni = Dropout(0.3)(modelo_uni) # Camada de Dropout adicional
modelo_uni = LSTM(units=32, return_sequences=True, recurrent_dropout=0.1)(modelo_uni) # Segunda camada LSTM com menos unidades
modelo_uni = Dense(128, activation='relu')(modelo_uni) # Camada densa intermediária
modelo_uni = Dropout(0.2)(modelo_uni) # Dropout após camada densa
out_uni = TimeDistributed(Dense(len(tags), activation="softmax"))(modelo_uni)  # Camada de softmax output

In [107]:
modelo_uni = Model(input_uni, out_uni) # Modelo completo

In [108]:
modelo_uni.summary()

In [109]:
modelo_uni.compile(
    optimizer="rmsprop", 
    loss="categorical_crossentropy", 
    metrics=["accuracy"]
)

##### Treino

In [110]:
history = modelo_uni.fit(X_tr, 
    np.array(y_tr),
    batch_size=32, 
    epochs=3, 
    validation_split=0.2, 
    verbose=1,
)

Epoch 1/3
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 63ms/step - accuracy: 0.6655 - loss: 1.2249 - val_accuracy: 0.9187 - val_loss: 0.3035
Epoch 2/3
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 61ms/step - accuracy: 0.9199 - loss: 0.2850 - val_accuracy: 0.9637 - val_loss: 0.1324
Epoch 3/3
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 61ms/step - accuracy: 0.9605 - loss: 0.1403 - val_accuracy: 0.9699 - val_loss: 0.1045


##### Salvar Modelo

In [None]:
import pickle

# Salvar o modelo treinado
with open('lstm_uni_model_1.pkl', 'wb') as f:
    pickle.dump(modelo_uni, f)

print("Modelo salvo com sucesso em 'lstm_uni_model_1.pkl'")

##### Testes demonstrativos

In [None]:
i = 1213
p = modelo_uni.predict(np.array([X_te[i]])) # Predição
p = np.argmax(p, axis=-1) # Mapear softmax de volta para um índice POS
for w, pred in zip(X_te[i], p[0]): # Para cada palavra na sentença
    print("{:20} -- {}".format(palavras[w], tags[pred])) # Imprimir palavra e tag

In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
from nltk import word_tokenize

sentenca = word_tokenize('That was a nice jump')
# Substituir palavras desconhecidas pelo índice da palavra desconhecida
X_Samp = pad_sequences(maxlen=max_len, sequences=[[word2id.get(word, len(palavras)-1) for word in sentenca]], padding="post", value=len(palavras)-1)

In [None]:
p = modelo_uni.predict(np.array([X_Samp[0]])) # Predict on it
p = np.argmax(p, axis=-1) # Map softmax back to a POS index
for w, pred in zip(X_Samp[0], p[0]): # for every word in the sentence
    print("{:20} -- {}".format(palavras[w], tags[pred])) # Print word and tag

#### Bidirectional LSTM

In [None]:
input_bi = Input(shape=(max_len,)) # Camada de Input
modelo_bi = Embedding(input_dim=len(palavras), output_dim=50, input_length=max_len)(input_bi) # Camada de Word embedding
modelo_bi = Dropout(0.3)(modelo_bi) # Camada de Dropout
modelo_bi = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(modelo_bi) # Camada de Bidirectional LSTM
out_bi = TimeDistributed(Dense(len(tags), activation="softmax"))(modelo_bi)  # Camada de softmax output

In [None]:
modelo_bi = Model(input_bi, out_bi) # Modelo completo

In [None]:
modelo_bi.summary()

In [None]:
modelo_bi.compile(
    optimizer="rmsprop", 
    loss="categorical_crossentropy", 
    metrics=["accuracy"]
)

##### Treino

In [None]:
history = modelo_bi.fit(X_tr, 
    np.array(y_tr), 
    batch_size=16, 
    epochs=3, 
    validation_split=0.2, 
    verbose=1
)

Epoch 1/3
[1m1969/1969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 33ms/step - accuracy: 0.7756 - loss: 0.8499 - val_accuracy: 0.9650 - val_loss: 0.1230
Epoch 2/3
[1m1969/1969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 31ms/step - accuracy: 0.9680 - loss: 0.1124 - val_accuracy: 0.9759 - val_loss: 0.0822
Epoch 3/3
[1m1969/1969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 32ms/step - accuracy: 0.9780 - loss: 0.0757 - val_accuracy: 0.9789 - val_loss: 0.0698


##### Salvar Modelo

In [None]:
import pickle

# Salvar o modelo treinado
with open('lstm_bi_model_1.pkl', 'wb') as f:
    pickle.dump(modelo_bi, f)

print("Modelo salvo com sucesso em 'lstm_bi_model_1.pkl'")

Modelo salvo com sucesso em 'lstm_model_1.pkl'


##### Testes demonstrativos

In [None]:
i = 1213
p = modelo_bi.predict(np.array([X_te[i]])) # Predição
p = np.argmax(p, axis=-1) # Mapear softmax de volta para um índice POS
for w, pred in zip(X_te[i], p[0]): # Para cada palavra na sentença
    print("{:20} -- {}".format(palavras[w], tags[pred])) # Imprimir palavra e tag

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 678ms/step
the                  -- DT
two                  -- CD
men                  -- NNS
are                  -- VBP
longtime             -- JJ
friends              -- NNS
and                  -- CC
tennis               -- NN
partners             -- NNS
,                    -- ,
having               -- VBG
met                  -- VBN
about                -- IN
25                   -- CD
years                -- NNS
ago                  -- RB
.                    -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                

##### Teste com sentença aleatória

In [74]:
from nltk import word_tokenize

sentenca = word_tokenize('That was a nice jump')
# Substituir palavras desconhecidas pelo índice da palavra desconhecida
X_Samp = pad_sequences(maxlen=max_len, sequences=[[word2id.get(word, len(palavras)-1) for word in sentenca]], padding="post", value=len(palavras)-1)

In [None]:
p = modelo_bi.predict(np.array([X_Samp[0]])) # Predict on it
p = np.argmax(p, axis=-1) # Map softmax back to a POS index
for w, pred in zip(X_Samp[0], p[0]): # for every word in the sentence
    print("{:20} -- {}".format(palavras[w], tags[pred])) # Print word and tag

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
<PAD>                -- .
was                  -- VBD
a                    -- DT
nice                 -- JJ
jump                 -- NN
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>        