#  Processamento de Linguagem natural (PLN) - PARTE 2

In [None]:
#############################################################################################################
##### Notebook Processamento de Linguagem natural (PLN)
##### Baseado em:
## Natural Language Processing with Python (book)
##
##############################################################################################################
## Objetivos:
##   Mostrar aplicações de metodos de linguagem natural aprendidos em aula
###################################################################################################################

## Importação dos Dados

In [14]:
# !pip install datasets
import datasets
import pandas as pd
from datasets import load_dataset 

In [None]:
pd.set_option('display.max_colwidth', None)  # Mostra o conteúdo completo das colunas
pd.set_option('display.max_columns', 100)  # Mostra o conteúdo completo das colunas
pd.set_option('display.max_rows', 100)  # Mostra o conteúdo completo das colunas

In [29]:
dataset = datasets.load_dataset("amazon_polarity",  split="test[:400]")  # Carrega apenas 1% do conjunto de teste para visualização rápida
print(dataset)

# Conversão para DataFrame do Pandas
df = dataset.to_pandas()[["title"]]
df.tail()

Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 400
})


Unnamed: 0,title
395,Curved Thomas the Train tracks
396,O-o-okay
397,Heads above the first book in the trilogy
398,Good book
399,Fantastic Sci Fi Classic


In [None]:
# Vai dar pra brincar com os textos dos títulos
media_titulo = dataset["title"].str.len().median()
print("Mediana do tamanho dos titulos:", int(media_titulo))

Mediana do tamanho dos titulos: 25


In [35]:
# Tokenização para dar início aos trabalhos
import nltk

df['tokens'] = df['title'].str.lower().apply(nltk.word_tokenize)
df.head()


Unnamed: 0,title,tokens
0,Great CD,"[great, cd]"
1,One of the best game music soundtracks - for a game I didn't really play,"[one, of, the, best, game, music, soundtracks, -, for, a, game, i, did, n't, really, play]"
2,Batteries died within a year ...,"[batteries, died, within, a, year, ...]"
3,"works fine, but Maha Energy is better","[works, fine, ,, but, maha, energy, is, better]"
4,Great for the non-audiophile,"[great, for, the, non-audiophile]"


## Modelo de Linguagem - Bigramas

In [None]:
import nltk

def createBigramFromDF(df, token_col):
    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}

    for tokens in df[token_col]:
        for i in range(len(tokens) - 1):
            # Forma o bigrama
            bigram = (tokens[i], tokens[i + 1])
            listOfBigrams.append(bigram)
            # Soma os bigramas e unigramas nos dicionários
            bigramCounts[bigram] = bigramCounts.get(bigram, 0) + 1
            unigramCounts[tokens[i]] = unigramCounts.get(tokens[i], 0) + 1
        # Conta o último unigramo
        if tokens:
            unigramCounts[tokens[-1]] = unigramCounts.get(tokens[-1], 0) + 1

    return listOfBigrams, unigramCounts, bigramCounts

def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    listOfProb = {}
    for bigram in listOfBigrams:
        word1 = bigram[0]
        listOfProb[bigram] = bigramCounts.get(bigram) / unigramCounts.get(word1)
    return listOfProb

In [39]:
listOfBigrams, unigramCounts, bigramCounts = createBigramFromDF(df, token_col="tokens")

In [63]:
print("\n Todos os bigramas possiveis sao: ")
print(listOfBigrams)

print("\n Unigramas e suas frequencias: ")
print(unigramCounts)

print("\n Bigramas e suas frequencias: ")
print(bigramCounts)

print("\n Probabilidade dos bigramas:")
bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)
print(bigramProb)


 Todos os bigramas possiveis sao: 
[('great', 'cd'), ('one', 'of'), ('of', 'the'), ('the', 'best'), ('best', 'game'), ('game', 'music'), ('music', 'soundtracks'), ('soundtracks', '-'), ('-', 'for'), ('for', 'a'), ('a', 'game'), ('game', 'i'), ('i', 'did'), ('did', "n't"), ("n't", 'really'), ('really', 'play'), ('batteries', 'died'), ('died', 'within'), ('within', 'a'), ('a', 'year'), ('year', '...'), ('works', 'fine'), ('fine', ','), (',', 'but'), ('but', 'maha'), ('maha', 'energy'), ('energy', 'is'), ('is', 'better'), ('great', 'for'), ('for', 'the'), ('the', 'non-audiophile'), ('dvd', 'player'), ('player', 'crapped'), ('crapped', 'out'), ('out', 'after'), ('after', 'one'), ('one', 'year'), ('incorrect', 'disc'), ('dvd', 'menu'), ('menu', 'select'), ('select', 'problems'), ('unique', 'weird'), ('weird', 'orientalia'), ('orientalia', 'from'), ('from', 'the'), ('the', '1930'), ('1930', "'s"), ('not', 'an'), ('an', '``'), ('``', 'ultimate'), ('ultimate', 'guide'), ('guide', "''"), ('gre

## Smoothing

Smoothing é uma técnica usada em modelos de linguagem para lidar com o problema de zero probabilidade (usa o n-gramas).


In [None]:
from collections import defaultdict
from collections import Counter
from numpy.random import choice 
from tqdm import tqdm

class Bigram():
    def __init__(self):
        self.bigram_counts = defaultdict(Counter)
        self.unigram_counts = Counter()
        self.context = defaultdict(Counter)
        self.start_count = 0
        self.token_count = 0
        self.vocab_count = 0
    
    def convert_sentence(self, sentence):
        # Lista de tokens, com <s> e </s> no início e fim
        return ["<s>"] + [w.lower() for w in sentence] + ["</s>"]
    
    def get_counts(self, sentences):
        # collect unigram counts
        for sentence in sentences:
            sentence = self.convert_sentence(sentence)
            for word in sentence[1:]:  # from 1, because we don't need the <s> token
                # Conta as ocorrências de cada palavra
                self.unigram_counts[word] += 1
            self.start_count += 1
            
        # collect bigram counts
        for sentence in sentences:
            sentence = self.convert_sentence(sentence)
            bigram_list = zip(sentence[:-1], sentence[1:])
            for bigram in bigram_list:
                self.bigram_counts[bigram[0]][bigram[1]] += 1
                # contagem reversa, útil para suavização, pois mostra o contexto de cada palavra
                self.context[bigram[1]][bigram[0]] += 1 
        self.token_count = sum(self.unigram_counts.values())
        self.vocab_count = len(self.unigram_counts.keys())
        
    def generate_sentence(self):
        # Incialização
        current_word = "<s>"
        sentence = [current_word]

        # Geração da sentença 
        while current_word != "</s>":
            prev_word = current_word
            prev_word_counts = self.bigram_counts[prev_word] # C(wi|wi-1) 
            # obtain bigram probability distribution given the previous word
            bigram_probs = []
            total_counts = float(sum(prev_word_counts.values())) # soma das contagens do wi-1
            for word in prev_word_counts:
                bigram_probs.append(prev_word_counts[word] / total_counts)
            # sample the next word
            current_word = choice(list(prev_word_counts.keys()), p=bigram_probs)
            sentence.append(current_word)
            
        sentence = " ".join(sentence[1:-1])
        return sentence

In [79]:
# Geração de senteças, a partir da classe criada
bigram = Bigram()
bigram.get_counts(df.tokens)

for i in range(1,6):
    print(f"Sentence {i}")
    print(bigram.generate_sentence())

Sentence 1
avoid it started eating tapes
Sentence 2
unbearable lightness of the haiku year
Sentence 3
i would say
Sentence 4
sony has a very good quality cell phone holder
Sentence 5
lots of god , and terrific reference for the perfect cd


In [None]:
import math
from random import shuffle

def split_train_test(data):
    # data consite de uma lista de listas de tokens
    sentences = list(data)
    shuffle(sentences)
    cutoff = int(0.8*len(sentences))
    # Normalzação para minúsculas e separação
    training_set = [[word.lower() for word in sent] for sent in sentences[:cutoff]]
    test_set = [[word.lower() for word in sent] for sent in sentences[cutoff:]]
    return training_set, test_set

# Medida de quão bem um modelo de linguagem prevê uma amostra, quanto menor o valor, melhor o modelo
def calculate_perplexity(sentences, bigram, smoothing_function):
    total_log_prob = 0
    test_token_count = 0
    for sentence in tqdm(sentences):
        test_token_count += len(sentence) + 1 # para considerar o token <fim-de-arquivo>
        total_log_prob += smoothing_function(sentence, bigram)
    return math.exp(-total_log_prob / test_token_count)


training_set, test_set = split_train_test(df.tokens)
print("Tamanho do conjunto de treinamento:", len(training_set))
print(training_set)

Tamanho do conjunto de treinamento: 320
[['disappointed'], ['shame', 'on', 'you', 'mr.', 'keel', '!', '!', '!'], ['a', 'must', 'have', '!'], ['itten', 'the', 'elements', 'of', 'color'], ['hell', 'no', '.'], ['the', 'description', 'said', 'stephen', 'gammell', 'as', 'artist', 'and', 'it', 'was', 'not', '...'], ['mary', 'ash'], ['usa1'], ['great', 'book', '--', 'unacceptable', 'condition'], ['my', '[', 'goodness', ']', '!', '!', '!', '!'], ['almost', 'perfect', 'cd'], ['good', 'matt', 'nude', 'lipstick'], ['waste', 'of', 'money', '!'], ['impressive'], ['amazingly', 'soft', 'hair', '!'], ['real', 'html', 'for', 'people', 'who', 'build', 'sites', 'for', 'a', 'living'], ['disappointed'], ['sony', 'hi8', 'camcorder', 'with', '2.5', 'lcd'], ['predictable', '&', 'has', 'a', 'lame', 'ending'], ['one', 'of', 'the', 'last', 'in', 'the', 'series', 'to', 'collect', '!'], ['hard', '-to-find', 'tension', 'rods'], ['junk', '!'], ['stuck', 'with', 'you'], ['thomas', 'connector', 'tracks'], ['unreadable

In [None]:
# Definicao da função de smoothing
def laplacian_smoothing(sentence, bigram):
    sentence = bigram.convert_sentence(sentence)
    bigram_list = zip(sentence[:-1], sentence[1:])
    prob = 0

    for prev_word, word in bigram_list:
        # Contagem reversa para suavização
        sm_bigram_counts = bigram.bigram_counts[prev_word][word] + 1
        # Inicio de sentença
        if prev_word == "<s>": 
            sm_unigram_counts = bigram.start_count
        else: 
            sm_unigram_counts = bigram.unigram_counts[prev_word] + len(bigram.unigram_counts)
        prob += math.log(sm_bigram_counts / sm_unigram_counts)
    return prob

In [94]:
# Execucao principal

bigram_laplacian_smoothing = Bigram()
bigram_laplacian_smoothing.get_counts(training_set) # cria o modelo
plex_laplacian_smoothing = calculate_perplexity(test_set, bigram_laplacian_smoothing, laplacian_smoothing)
print(plex_laplacian_smoothing)

100%|██████████| 80/80 [00:00<00:00, 13917.80it/s]

379.2862634709625





## POS tagging

Pos_tagging usando HMM (Hidden Markov Model)

Serve para etiquetar palavras em uma sentença com suas respectivas classes gramaticais (substantivo, verbo, adjetivo, etc.)

In [102]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
import spacy
from nltk import word_tokenize

tagger = spacy.load("en_core_web_sm")

In [None]:
# Aplicando o POS tagging no título dos produtos
df["tagged"] = df["title"].apply(lambda sent: [(word.text.lower(), word.pos_) for word in tagger(sent)])
df.head()

Unnamed: 0,title,tokens,tagged
0,Great CD,"[great, cd]","[(great, ADJ), (cd, NOUN)]"
1,One of the best game music soundtracks - for a game I didn't really play,"[one, of, the, best, game, music, soundtracks, -, for, a, game, i, did, n't, really, play]","[(one, NUM), (of, ADP), (the, DET), (best, ADJ), (game, NOUN), (music, NOUN), (soundtracks, NOUN), (-, PUNCT), (for, ADP), (a, DET), (game, NOUN), (i, PRON), (did, AUX), (n't, PART), (really, ADV), (play, VERB)]"
2,Batteries died within a year ...,"[batteries, died, within, a, year, ...]","[(batteries, NOUN), (died, VERB), (within, ADP), (a, DET), (year, NOUN), (..., PUNCT)]"
3,"works fine, but Maha Energy is better","[works, fine, ,, but, maha, energy, is, better]","[(works, VERB), (fine, ADJ), (,, PUNCT), (but, CCONJ), (maha, PROPN), (energy, PROPN), (is, AUX), (better, ADJ)]"
4,Great for the non-audiophile,"[great, for, the, non-audiophile]","[(great, ADJ), (for, ADP), (the, DET), (non, ADJ), (-, ADJ), (audiophile, ADJ)]"


## Algoritmo de Viterbi

O *algoritmo de Viterbi* é um algoritmo dinâmico usado para encontrar a sequência mais provável de estados ocultos (tags) em um modelo de Markov oculto (HMM), dado uma sequência observada (palavras). Muito usado para POS tagging

In [None]:
def viterbi(obs, states, start_p, trans_p, emit_p):
    # tabuleiro Viterbi, para armazenar probabilidades
    V = [{}]
    for st in states:
        emission = emit_p[st].get(obs[0], 1e-8)
        V[0][st] = {"prob": start_p.get(st, 1e-8) * emission, "prev": None}
    for t in range(1, len(obs)):
        V.append({})
        for st in states:
            # Probabilidade de transição máxima para cada estado
            max_tr_prob = V[t - 1][states[0]]["prob"] * trans_p.get(states[0], {}).get(st, 1e-8) * emit_p[st].get(obs[t], 1e-8)
            prev_st_selected = states[0]
            for prev_st in states[1:]:
                # p_trans = p_anterior * p_transição * p_emissão
                tr_prob = V[t - 1][prev_st]["prob"] * trans_p.get(prev_st, {}).get(st, 1e-8) * emit_p[st].get(obs[t], 1e-8)
                # atualiza
                if tr_prob > max_tr_prob:
                    max_tr_prob = tr_prob
                    prev_st_selected = prev_st
            max_prob = max_tr_prob
            # armazena a probabilidade máxima e o estado anterior no tabuleiro
            V[t][st] = {"prob": max_prob, "prev": prev_st_selected}

    for line in dptable(V):
        print(line)
        
    # Reconstrução da sequência de estados mais provável
    opt = []
    max_prob = 0.0
    best_st = None

    # Get most probable state and its backtrack
    for st, data in V[-1].items():
        if data["prob"] > max_prob:
            max_prob = data["prob"]
            best_st = st
    opt.append(best_st)
    previous = best_st

    # Follow the backtrack till the first observation
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]

    print("A sequencia de estados foi: " + " ".join(opt) + " tendo probabilidade de %s" % max_prob + " (maior probabilidade)")


def dptable(V):
    # Print a table of steps from dictionary
    yield " " * 5 + "     ".join(("%3d" % i) for i in range(len(V)))
    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%lf" % v[state]["prob"]) for v in V)



In [None]:
import spacy
from collections import Counter, defaultdict

# Inicialização dos contadores
start_counts = Counter()
transition_counts = defaultdict(Counter)
emission_counts = defaultdict(Counter)
tag_counts = Counter()

# Criação da tabela de contagens, para calcular as probabilidades
for tagged_sent in df["tagged"]:
    tags = [tag for _, tag in tagged_sent]
    words = [word for word, _ in tagged_sent]
    if tags:  # Evita sentenças vazias
        # Contadores
        start_counts[tags[0]] += 1
        for word, tag in zip(words, tags):
            emission_counts[tag][word] += 1
            tag_counts[tag] += 1
        for i in range(len(tags) - 1):
            transition_counts[tags[i]][tags[i+1]] += 1


# Probabilidades
total_sents = len(df)
start_probabilities = {tag: count/total_sents for tag, count in start_counts.items()}
transition_probabilities = {tag1: {tag2: count/sum(transition_counts[tag1].values())
                                  for tag2, count in transition_counts[tag1].items()}
                           for tag1 in transition_counts}
emission_probabilities = {tag: {word: count/tag_counts[tag]
                               for word, count in emission_counts[tag].items()}
                         for tag in emission_counts}

# Lista de estados
states = list(tag_counts.keys())

print("Estados:", states)
print("Probabilidades de Incio:", start_probabilities)
print("Probabilidades de Trasicao:", transition_probabilities)
print("Probabilidades de Emissao:", emission_probabilities)

Estados: ['ADJ', 'NOUN', 'NUM', 'ADP', 'DET', 'PUNCT', 'PRON', 'AUX', 'PART', 'ADV', 'VERB', 'CCONJ', 'PROPN', 'INTJ', 'SCONJ', 'SYM', 'X']
Probabilidades de Incio: {'ADJ': 0.2125, 'NUM': 0.0225, 'NOUN': 0.1075, 'VERB': 0.0825, 'PROPN': 0.1975, 'PART': 0.035, 'DET': 0.1075, 'INTJ': 0.025, 'AUX': 0.035, 'PRON': 0.0825, 'ADP': 0.0125, 'ADV': 0.06, 'SCONJ': 0.0075, 'PUNCT': 0.01, 'CCONJ': 0.0025}
Probabilidades de Trasicao: {'ADJ': {'NOUN': 0.5989010989010989, 'PUNCT': 0.13736263736263737, 'ADP': 0.06593406593406594, 'ADJ': 0.016483516483516484, 'CCONJ': 0.06593406593406594, 'SCONJ': 0.016483516483516484, 'PROPN': 0.06043956043956044, 'ADV': 0.005494505494505495, 'X': 0.005494505494505495, 'VERB': 0.005494505494505495, 'PRON': 0.01098901098901099, 'PART': 0.005494505494505495, 'NUM': 0.005494505494505495}, 'NUM': {'ADP': 0.16666666666666666, 'NOUN': 0.20833333333333334, 'PART': 0.08333333333333333, 'PROPN': 0.16666666666666666, 'ADJ': 0.08333333333333333, 'PUNCT': 0.08333333333333333, 'CC

In [123]:
# Exemplo
example_sentence = "great product fast shipping"
obs = nltk.word_tokenize(example_sentence.lower())

# Chama o Viterbi
viterbi(obs, states, start_probabilities, transition_probabilities, emission_probabilities)

       0       1       2       3
ADJ: 0.02826 0.00000 0.00000 0.00000
NOUN: 0.00000 0.00028 0.00000 0.00000
NUM: 0.00000 0.00000 0.00000 0.00000
ADP: 0.00000 0.00000 0.00000 0.00000
DET: 0.00000 0.00000 0.00000 0.00000
PUNCT: 0.00000 0.00000 0.00000 0.00000
PRON: 0.00000 0.00000 0.00000 0.00000
AUX: 0.00000 0.00000 0.00000 0.00000
PART: 0.00000 0.00000 0.00000 0.00000
ADV: 0.00000 0.00000 0.00000 0.00000
VERB: 0.00000 0.00000 0.00000 0.00000
CCONJ: 0.00000 0.00000 0.00000 0.00000
PROPN: 0.00236 0.00000 0.00000 0.00000
INTJ: 0.00000 0.00000 0.00000 0.00000
SCONJ: 0.00000 0.00000 0.00000 0.00000
SYM: 0.00000 0.00000 0.00000 0.00000
X: 0.00000 0.00000 0.00000 0.00000
A sequencia de estados foi: ADJ NOUN ADP NOUN tendo probabilidade de 3.2583537262126886e-16 (maior probabilidade)
