In [None]:
# Instalação de dependências
%pip install -r requirements.txt

In [1]:
import tensorflow
import pandas as pd
from typing import List, Tuple
import numpy as np

In [2]:
# Initialize the random number generator
import random
random.seed(0)

# Ignore the warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Definição de caminhos dos arquivos de corpus
train_file = "Penn Treebank/Secs0-18 - training"
dev_file   = "Penn Treebank/Secs19-21 - development"
test_file  = "Penn Treebank/Secs22-24 - testing"

In [4]:
# ------------------------------
# Funções de pré-processamento do texto
def carregar_corpus(caminho_arquivo: str) -> str:
    """
    Lê o arquivo completo em utf-8 e retorna como string.
    """
    with open(caminho_arquivo, "r", encoding="utf-8") as f:
        return f.read()

def dividir_em_sentencas(texto: str) -> List[str]:
    """
    Divide o texto em sentenças, assumindo uma sentença por linha.
    """
    return texto.strip().split("\n")

def processar_sentenca(sentenca: str) -> List[Tuple[str, str]]:
    """
    Separa tokens de formato palavra_TAG em pares (palavra, tag).
    Converte para lowercase, exceto nomes próprios (NNP, NNPS).
    """
    tokens = sentenca.strip().split()
    pares = []
    for token in tokens:
        if "_" in token:
            palavra, tag = token.rsplit("_", 1)
            if not(tag == 'NNP' or tag == 'NNPS'):
                palavra = palavra.lower()
            pares.append((palavra, tag))
    return pares

def construir_dataframe(sentencas: List[str]) -> pd.DataFrame:
    """
    Cria um DataFrame 'longo' com colunas:
    sentenca (ID), palavra, tag e posicao_na_sentenca.
    """
    dados = []
    for sent_id, sentenca in enumerate(sentencas):
        palavras_tags = processar_sentenca(sentenca)
        for posicao, (palavra, tag) in enumerate(palavras_tags):
            dados.append({
                "sentenca": sent_id + 1,
                "palavra": palavra,
                "tag": tag,
                "posicao_na_sentenca": posicao
            })

    return pd.DataFrame(dados)

In [5]:
# Carregando e processando os datasets
texto_raw_train = carregar_corpus(train_file)
sentencas_train = dividir_em_sentencas(texto_raw_train)
df_treino = construir_dataframe(sentencas_train)
df_treino.fillna(method="ffill", inplace=True)

# Dev + Test combinados para avaliação final
texto_raw_dev = carregar_corpus(dev_file)
texto_raw_teste = carregar_corpus(test_file)
sentencas_teste = dividir_em_sentencas(texto_raw_dev + texto_raw_teste)
df_teste = construir_dataframe(sentencas_teste)
df_teste.fillna(method="ffill", inplace=True)

# Primeiras linhas
df_treino.head()

Unnamed: 0,sentenca,palavra,tag,posicao_na_sentenca
0,1,Pierre,NNP,0
1,1,Vinken,NNP,1
2,1,",",",",2
3,1,61,CD,3
4,1,years,NNS,4


In [6]:
df_treino.shape

(912344, 4)

In [7]:
df_treino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912344 entries, 0 to 912343
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   sentenca             912344 non-null  int64 
 1   palavra              912344 non-null  object
 2   tag                  912344 non-null  object
 3   posicao_na_sentenca  912344 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 27.8+ MB


In [8]:
tags = list(set(df_treino["tag"].values))

In [9]:
tags

['PDT',
 '#',
 'NN',
 'WDT',
 'JJR',
 'JJS',
 '.',
 'CD',
 'MD',
 '``',
 'LS',
 'CC',
 'TO',
 '$',
 'JJ',
 'VBD',
 'PRP$',
 "''",
 'PRP',
 'RBS',
 'SYM',
 'VBZ',
 'WP$',
 'UH',
 'POS',
 'RB',
 ':',
 '-LRB-',
 'VB',
 'EX',
 'DT',
 '-RRB-',
 'VBP',
 'WP',
 'NNS',
 'NNPS',
 'RP',
 'FW',
 'NNP',
 'RBR',
 'VBN',
 'WRB',
 'VBG',
 ',',
 'IN']

In [10]:
palavras = list(set(df_treino["palavra"].values))
palavras.append("<PAD>") # Padding

In [11]:
palavras

['pursue',
 'crab',
 '830.5',
 'Pasquale',
 '99.93',
 'farm-machine',
 'factored',
 'Bookman',
 'diGenova',
 'Dedham',
 'attorneys',
 'land-rich',
 'inflation-adjusted',
 'cuisine',
 'pilloried',
 'hydroelectric',
 'KPMG',
 'inter-office',
 'Hilton',
 'geometric',
 'horse-breeding',
 'signed',
 'visitors',
 'cart',
 'Levinson',
 '20th-century',
 'sooner',
 'fertilization',
 'restroom',
 'toned',
 'furloughs',
 'intensified',
 'Jersey',
 'Bekaa',
 'capital-markets',
 'Shaffer',
 'marble-encased',
 'spared',
 'sino-u.s.',
 'Dallas-Barcelona',
 'oil-rig',
 'cathode-ray',
 'Elco',
 'near-panic',
 '1.35',
 'transforming',
 'Kuse',
 'devoured',
 'Batangas',
 'top-performing',
 'jamaican',
 '230,000',
 'noncommercial',
 'detroit-based',
 'purple',
 'hens',
 'meat-processing',
 'wistful',
 '820.4',
 'wrap',
 'handling',
 'Pincus',
 'offhandedly',
 '17.50',
 '130.13',
 'yield',
 'generate',
 'Telerate',
 'Lai',
 'flush',
 'tenor',
 'parody',
 'Professor',
 'point-of-sale',
 'building-society',


In [12]:
class LerSentencas(object): 
    
    def __init__(self, dados):
        self.dados = dados
        self.vazio = False
        agg_func = lambda s: [(w, p) for w, p in zip(s["palavra"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.agrupado = self.dados.groupby("sentenca").apply(agg_func)
        self.sentencas = [s for s in self.agrupado]

In [13]:
sentencas_treino = LerSentencas(df_treino).sentencas

In [14]:
sentencas_treino[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [15]:
# Convertendo palavras e tags em números
word2id = {w: i for i, w in enumerate(palavras)}
tag2id = {t: i for i, t in enumerate(tags)}

In [16]:
word2id

{'pursue': 0,
 'crab': 1,
 '830.5': 2,
 'Pasquale': 3,
 '99.93': 4,
 'farm-machine': 5,
 'factored': 6,
 'Bookman': 7,
 'diGenova': 8,
 'Dedham': 9,
 'attorneys': 10,
 'land-rich': 11,
 'inflation-adjusted': 12,
 'cuisine': 13,
 'pilloried': 14,
 'hydroelectric': 15,
 'KPMG': 16,
 'inter-office': 17,
 'Hilton': 18,
 'geometric': 19,
 'horse-breeding': 20,
 'signed': 21,
 'visitors': 22,
 'cart': 23,
 'Levinson': 24,
 '20th-century': 25,
 'sooner': 26,
 'fertilization': 27,
 'restroom': 28,
 'toned': 29,
 'furloughs': 30,
 'intensified': 31,
 'Jersey': 32,
 'Bekaa': 33,
 'capital-markets': 34,
 'Shaffer': 35,
 'marble-encased': 36,
 'spared': 37,
 'sino-u.s.': 38,
 'Dallas-Barcelona': 39,
 'oil-rig': 40,
 'cathode-ray': 41,
 'Elco': 42,
 'near-panic': 43,
 '1.35': 44,
 'transforming': 45,
 'Kuse': 46,
 'devoured': 47,
 'Batangas': 48,
 'top-performing': 49,
 'jamaican': 50,
 '230,000': 51,
 'noncommercial': 52,
 'detroit-based': 53,
 'purple': 54,
 'hens': 55,
 'meat-processing': 56,
 '

In [17]:
tag2id

{'PDT': 0,
 '#': 1,
 'NN': 2,
 'WDT': 3,
 'JJR': 4,
 'JJS': 5,
 '.': 6,
 'CD': 7,
 'MD': 8,
 '``': 9,
 'LS': 10,
 'CC': 11,
 'TO': 12,
 '$': 13,
 'JJ': 14,
 'VBD': 15,
 'PRP$': 16,
 "''": 17,
 'PRP': 18,
 'RBS': 19,
 'SYM': 20,
 'VBZ': 21,
 'WP$': 22,
 'UH': 23,
 'POS': 24,
 'RB': 25,
 ':': 26,
 '-LRB-': 27,
 'VB': 28,
 'EX': 29,
 'DT': 30,
 '-RRB-': 31,
 'VBP': 32,
 'WP': 33,
 'NNS': 34,
 'NNPS': 35,
 'RP': 36,
 'FW': 37,
 'NNP': 38,
 'RBR': 39,
 'VBN': 40,
 'WRB': 41,
 'VBG': 42,
 ',': 43,
 'IN': 44}

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 50
X = [[word2id[w[0]] for w in s] for s in sentencas_treino]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=len(palavras)-1)
y = [[tag2id[w[1]] for w in s] for s in sentencas_treino]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2id["."])

In [19]:
from tensorflow.keras.utils import to_categorical
y = [to_categorical(i, num_classes=len(tags)) for i in y]

In [20]:
y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
X[0]

array([ 8449, 36072, 35377, 18072,  7512, 20533, 35377, 32919, 38975,
       15613, 25411, 26772,   471,  9539,  8184, 19180,  3076, 32492,
       41600, 41600, 41600, 41600, 41600, 41600, 41600, 41600, 41600,
       41600, 41600, 41600, 41600, 41600, 41600, 41600, 41600, 41600,
       41600, 41600, 41600, 41600, 41600, 41600, 41600, 41600, 41600,
       41600, 41600, 41600, 41600, 41600], dtype=int32)

In [22]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20)

In [23]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input

In [24]:
input = Input(shape=(max_len,)) # Camada de Input
modelo = Embedding(input_dim=len(palavras), output_dim=50, input_length=max_len)(input) # Camada de Word embedding
modelo = Dropout(0.3)(modelo) # Camada de Dropout
modelo = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(modelo) # Camada de Bidirectional LSTM
out = TimeDistributed(Dense(len(tags), activation="softmax"))(modelo)  # Camada de softmax output

In [25]:
modelo = Model(input, out) # Modelo completo

In [26]:
modelo.summary()

In [27]:
modelo.compile(
    optimizer="rmsprop", 
    loss="categorical_crossentropy", 
    metrics=["accuracy"]
)

In [28]:
history = modelo.fit(X_tr, 
    np.array(y_tr), 
    batch_size=16, 
    epochs=3, 
    validation_split=0.2, 
    verbose=1
)

Epoch 1/3
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 39ms/step - accuracy: 0.7571 - loss: 0.9060 - val_accuracy: 0.9606 - val_loss: 0.1439
Epoch 2/3
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 36ms/step - accuracy: 0.9637 - loss: 0.1292 - val_accuracy: 0.9732 - val_loss: 0.0936
Epoch 3/3
[1m1529/1529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 35ms/step - accuracy: 0.9761 - loss: 0.0844 - val_accuracy: 0.9781 - val_loss: 0.0760


In [29]:
import pickle

# Salvar o modelo treinado
with open('lstm_model_1.pkl', 'wb') as f:
    pickle.dump(modelo, f)

print("Modelo salvo com sucesso em 'lstm_model_1.pkl'")

Modelo salvo com sucesso em 'lstm_model_1.pkl'


In [34]:
i = 1213
p = modelo.predict(np.array([X_te[i]])) # Predição
p = np.argmax(p, axis=-1) # Mapear softmax de volta para um índice POS
for w, pred in zip(X_te[i], p[0]): # Para cada palavra na sentença
    print("{:20} -- {}".format(palavras[w], tags[pred])) # Imprimir palavra e tag

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
the                  -- DT
tentative            -- JJ
agreement            -- NN
provides             -- VBZ
for                  -- IN
wage                 -- NN
increases            -- NNS
of                   -- IN
85                   -- CD
cents                -- NNS
an                   -- DT
hour                 -- NN
retroactive          -- NN
to                   -- TO
Sept.                -- NNP
25                   -- CD
,                    -- ,
1989                 -- CD
,                    -- ,
and                  -- CC
for                  -- IN
increases            -- NNS
of                   -- IN
19                   -- CD
cents                -- NNS
,                    -- ,
70                   -- CD
cents                -- NNS
and                  -- CC
35                   -- CD
cents                -- NNS
an                   -- DT
hour                 -- NN
effective            -- JJ
Jan. 

In [36]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [37]:
from nltk import word_tokenize

sentenca = word_tokenize('That was a nice jump')
# Substituir palavras desconhecidas pelo índice da palavra desconhecida
X_Samp = pad_sequences(maxlen=max_len, sequences=[[word2id.get(word, len(palavras)-1) for word in sentenca]], padding="post", value=len(palavras)-1)

In [38]:
p = modelo.predict(np.array([X_Samp[0]])) # Predict on it
p = np.argmax(p, axis=-1) # Map softmax back to a POS index
for w, pred in zip(X_Samp[0], p[0]): # for every word in the sentence
    print("{:20} -- {}".format(palavras[w], tags[pred])) # Print word and tag

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
<PAD>                -- .
was                  -- VBD
a                    -- DT
nice                 -- JJ
jump                 -- NN
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>                -- .
<PAD>        