In [8]:
#############################################################################################################
##### Notebook Processamento de Linguagem natural (PLN)
##### Baseado em:
## Natural Language Processing with Python (book)
##
##############################################################################################################
## Objetivos:
##   Mostrar varios metodos de linguagem natural utilizando Python
###################################################################################################################

In [9]:
################################################
### 01 - Modelos de Linguagem - Bigramas
################################################

def readData():
    data = ['Este eh um cachorro','Este eh um gato','Eu amo meu gato','Este eh meu nome']
    dat=[]
    for i in range(len(data)):
        for word in data[i].split():
            dat.append(word)
    #print(dat)
    return dat

def createBigram(data):
   listOfBigrams = []
   bigramCounts = {}
   unigramCounts = {}
   for i in range(len(data)-1):
      if i < len(data) - 1 and data[i+1].islower():

         listOfBigrams.append((data[i], data[i + 1]))

         if (data[i], data[i+1]) in bigramCounts:
            bigramCounts[(data[i], data[i + 1])] += 1
         else:
            bigramCounts[(data[i], data[i + 1])] = 1

      if data[i] in unigramCounts:
         unigramCounts[data[i]] += 1
      else:
         unigramCounts[data[i]] = 1
   return listOfBigrams, unigramCounts, bigramCounts


def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    listOfProb = {}
    for bigram in listOfBigrams:
        word1 = bigram[0]
        word2 = bigram[1]
        listOfProb[bigram] = (bigramCounts.get(bigram))/(unigramCounts.get(word1))
    return listOfProb


In [10]:
data = readData()
listOfBigrams, unigramCounts, bigramCounts = createBigram(data)

print("\n Todos os bigramas possiveis sao: ")
print(listOfBigrams)

print("\n Bigramas e suas frequencias: ")
print(bigramCounts)

print("\n Unigramas e suas frequencias: ")
print(unigramCounts)

bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)

print("\n Bigramas e suas probabilidades: ")
print(bigramProb)
inputList="Este eh meu gato"
splt=inputList.split()
outputProb1 = 1
bilist=[]
bigrm=[]

for i in range(len(splt) - 1):
    if i < len(splt) - 1:
        bilist.append((splt[i], splt[i + 1]))
    
print("\n Os bigramas na sentenca de entrada sao: ")
print(bilist)
for i in range(len(bilist)):
    if bilist[i] in bigramProb:
        outputProb1 *= bigramProb[bilist[i]]
    else:
        outputProb1 *= 0
print('\n' + 'Probabilidade da sentenca '+inputList+ '= ' + str(outputProb1))


 Todos os bigramas possiveis sao: 
[('Este', 'eh'), ('eh', 'um'), ('um', 'cachorro'), ('Este', 'eh'), ('eh', 'um'), ('um', 'gato'), ('Eu', 'amo'), ('amo', 'meu'), ('meu', 'gato'), ('Este', 'eh'), ('eh', 'meu'), ('meu', 'nome')]

 Bigramas e suas frequencias: 
{('Este', 'eh'): 3, ('eh', 'um'): 2, ('um', 'cachorro'): 1, ('um', 'gato'): 1, ('Eu', 'amo'): 1, ('amo', 'meu'): 1, ('meu', 'gato'): 1, ('eh', 'meu'): 1, ('meu', 'nome'): 1}

 Unigramas e suas frequencias: 
{'Este': 3, 'eh': 3, 'um': 2, 'cachorro': 1, 'gato': 2, 'Eu': 1, 'amo': 1, 'meu': 2}

 Bigramas e suas probabilidades: 
{('Este', 'eh'): 1.0, ('eh', 'um'): 0.6666666666666666, ('um', 'cachorro'): 0.5, ('um', 'gato'): 0.5, ('Eu', 'amo'): 1.0, ('amo', 'meu'): 1.0, ('meu', 'gato'): 0.5, ('eh', 'meu'): 0.3333333333333333, ('meu', 'nome'): 0.5}

 Os bigramas na sentenca de entrada sao: 
[('Este', 'eh'), ('eh', 'meu'), ('meu', 'gato')]

Probabilidade da sentenca Este eh meu gato= 0.16666666666666666


In [11]:
################################################
### 02 - Smoothing
################################################

In [12]:
from collections import defaultdict
from collections import Counter
from numpy.random import choice 
from tqdm import tqdm

class Bigram():
    def __init__(self):
        self.bigram_counts = defaultdict(Counter)
        self.unigram_counts = Counter()
        self.context = defaultdict(Counter)
        self.start_count = 0
        self.token_count = 0
        self.vocab_count = 0
    
    def convert_sentence(self, sentence):
        return ["<s>"] + [w.lower() for w in sentence] + ["</s>"]
    
    def get_counts(self, sentences):
        # collect unigram counts
        for sentence in sentences:
            sentence = self.convert_sentence(sentence)
            for word in sentence[1:]:  # from 1, because we don't need the <s> token
                self.unigram_counts[word] += 1
            self.start_count += 1
            
        # collect bigram counts
        for sentence in sentences:
            sentence = self.convert_sentence(sentence)
            bigram_list = zip(sentence[:-1], sentence[1:])
            for bigram in bigram_list:
                self.bigram_counts[bigram[0]][bigram[1]] += 1
                self.context[bigram[1]][bigram[0]] += 1
        self.token_count = sum(self.unigram_counts.values())
        self.vocab_count = len(self.unigram_counts.keys())
        
    def generate_sentence(self):
        current_word = "<s>"
        sentence = [current_word]
        while current_word != "</s>":
            prev_word = current_word
            prev_word_counts = self.bigram_counts[prev_word]
            # obtain bigram probability distribution given the previous word
            bigram_probs = []
            total_counts = float(sum(prev_word_counts.values()))
            for word in prev_word_counts:
                bigram_probs.append(prev_word_counts[word] / total_counts)
            # sample the next word
            current_word = choice(list(prev_word_counts.keys()), p=bigram_probs)
            sentence.append(current_word)
            
        sentence = " ".join(sentence[1:-1])
        return sentence


In [None]:
import nltk
from nltk.corpus import brown
nltk.download('brown') # Corpus com um milhao de palavras de textos americanos lancados em 1961

bigram = Bigram()
bigram.get_counts(brown.sents())
for i in range(1,6):
    print("Sentence %d" % i)
    print(bigram.generate_sentence())

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Masmok\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


Sentence 1
eighteenth-century england , fair .
Sentence 2
Sentence 3
however , and arthur williams's might logically , separately , you own order , either a surface absurdity of unobtrusive use of his present pool is authorized for hanover ) , whining voice so that -- idleness and prompt time for a surface craft is flesh , bud freeman `` a day before the emergence of the long experience at hand every movement until mid-june .
Sentence 4
afterwards , clutching the hypocritical self-aggrandizement .
Sentence 5
it was fifteen dollars under that a difference in the body of legends that the long-range ballistic missile .


In [14]:
import math
from random import shuffle

def split_train_test():
    sents = list(brown.sents())
    shuffle(sents)
    cutoff = int(0.8*len(sents))
    training_set = sents[:cutoff]
    test_set = [[word.lower() for word in sent] for sent in sents[cutoff:]]
    return training_set, test_set

def calculate_perplexity(sentences, bigram, smoothing_function, parameter):
    total_log_prob = 0
    test_token_count = 0
    for sentence in tqdm(sentences):
        test_token_count += len(sentence) + 1 # para considerar o token <fim-de-arquivo>
        total_log_prob += smoothing_function(sentence, bigram, parameter)
    return math.exp(-total_log_prob / test_token_count)

training_set, test_set = split_train_test()

In [15]:
# Definicao da funcao de smoothing

def laplacian_smoothing(sentence, bigram, parameter):
    sentence = bigram.convert_sentence(sentence)
    bigram_list = zip(sentence[:-1], sentence[1:])
    prob = 0
    for prev_word, word in bigram_list:
        sm_bigram_counts = bigram.bigram_counts[prev_word][word] + 1
        if prev_word == "<s>": sm_unigram_counts = bigram.start_count
        else: sm_unigram_counts = bigram.unigram_counts[prev_word] + len(bigram.unigram_counts)
        prob += math.log(sm_bigram_counts / sm_unigram_counts)
    return prob

In [16]:
# Execucao principal

bigram_laplacian_smoothing = Bigram()
bigram_laplacian_smoothing.get_counts(training_set) # cria o modelo
plex_laplacian_smoothing = calculate_perplexity(test_set, bigram_laplacian_smoothing, laplacian_smoothing, None)
print(plex_laplacian_smoothing)

100%|██████████| 11468/11468 [00:00<00:00, 15363.37it/s]

3490.949863099908





In [19]:
################################################
### 03 - PoS tagging
################################################

# baixar o modelo: POS_tagger_brill.pkl
# POS_tagger_bigram.pkl (tentar outros)
# https://github.com/inoueMashuu/POS-tagger-portuguese-nltk/tree/master/trained_POS_taggers


import joblib
from nltk import word_tokenize

teste_tagger = joblib.load('..\Datasets\POS_tagger_brill.pkl')
phrase = 'O rato roeu a roupa do rei de Roma'
teste_tagger.tag(word_tokenize(phrase))

[('O', 'ART'),
 ('rato', 'N'),
 ('roeu', 'V'),
 ('a', 'ART'),
 ('roupa', 'N'),
 ('do', 'KS'),
 ('rei', 'N'),
 ('de', 'PREP'),
 ('Roma', 'NPROP')]

In [20]:
################################################
### 04 - Viterbi algorithm
################################################

def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]
    for st in states:
        V[0] [st] = {"prob": start_p[st] * emit_p[st] [obs[0]], "prev": None}
    # Run Viterbi when t > 0
    for t in range(1, len(obs)):
        V.append({})
        for st in states:
            max_tr_prob = V[t - 1] [states[0]] ["prob"] * trans_p[states[0]] [st] * emit_p[st] [obs[t]]
            prev_st_selected = states[0]
            for prev_st in states[1:]:
                tr_prob = V[t - 1] [prev_st] ["prob"] * trans_p[prev_st] [st] * emit_p[st] [obs[t]]
                if tr_prob > max_tr_prob:
                    max_tr_prob = tr_prob
                    prev_st_selected = prev_st

            max_prob = max_tr_prob
            V[t] [st] = {"prob": max_prob, "prev": prev_st_selected}

    for line in dptable(V):
        print(line)

    opt = []
    max_prob = 0.0
    best_st = None
    # Get most probable state and its backtrack
    for st, data in V[-1].items():
        if data["prob"] > max_prob:
            max_prob = data["prob"]
            best_st = st
    opt.append(best_st)
    previous = best_st

    # Follow the backtrack till the first observation
    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1] [previous] ["prev"])
        previous = V[t + 1] [previous] ["prev"]

    print ("A sequencia de estados foi: " + " ".join(opt) + " tendo probabilidade de %s" % max_prob + " (maior probabilidade)")

def dptable(V):
    # Print a table of steps from dictionary
    yield " " * 5 + "     ".join(("%3d" % i) for i in range(len(V)))
    for state in V[0]:
        yield "%.7s: " % state + " ".join("%.7s" % ("%lf" % v[state] ["prob"]) for v in V)



In [21]:
import numpy as np

# Define the states (POS tags) and observation symbols (words)
states = ['NOUN', 'VERB', 'ADJ']
observations = ['dog', 'runs', 'fast']

# Define the transition probabilities
transition_probabilities = {
    'NOUN': {'NOUN': 0.1, 'VERB': 0.7, 'ADJ': 0.2},
    'VERB': {'NOUN': 0.3, 'VERB': 0.4, 'ADJ': 0.3},
    'ADJ': {'NOUN': 0.5, 'VERB': 0.1, 'ADJ': 0.4}
}

# Define the emission probabilities
emission_probabilities = {
    'NOUN': {'dog': 0.6, 'runs': 0.1, 'fast': 0.3},
    'VERB': {'dog': 0.1, 'runs': 0.7, 'fast': 0.2},
    'ADJ': {'dog': 0.3, 'runs': 0.2, 'fast': 0.5}
}

# Define the initial probabilities
start_probabilities = {'NOUN': 0.4, 'VERB': 0.3, 'ADJ': 0.3}

viterbi(observations,
           states,
           start_probabilities,
           transition_probabilities,
           emission_probabilities)

       0       1       2
NOUN: 0.24000 0.00450 0.01058
VERB: 0.03000 0.11760 0.00940
ADJ: 0.09000 0.00960 0.01764
A sequencia de estados foi: NOUN VERB ADJ tendo probabilidade de 0.017639999999999996 (maior probabilidade)


In [22]:
states = ('Chovendo', 'Ensolarado')

observations = ('Andar', 'Comprar', 'LimparCasa')

start_probability = {'Chovendo': 0.6, 'Ensolarado': 0.4}

transition_probability = {
'Chovendo' : {'Chovendo': 0.7, 'Ensolarado': 0.3},
'Ensolarado' : {'Chovendo': 0.4, 'Ensolarado': 0.6},
}

emission_probability = {
'Chovendo' : {'Andar': 0.1, 'Comprar': 0.4, 'LimparCasa': 0.5},
'Ensolarado' : {'Andar': 0.6, 'Comprar': 0.3, 'LimparCasa': 0.1},
}

viterbi(observations,
           states,
           start_probability,
           transition_probability,
           emission_probability)

       0       1       2
Chovend: 0.06000 0.03840 0.01344
Ensolar: 0.24000 0.04320 0.00259
A sequencia de estados foi: Ensolarado Chovendo Chovendo tendo probabilidade de 0.01344 (maior probabilidade)


In [23]:
################################################
### 05 - Engenharia de caracteristicas
################################################

def count_chars(text):
    return len(text)

def count_words(text):
    return len(text.split())

def count_capital_letters(text):
    return sum(1 for char in text if char.isupper())

def count_capital_words(text):
    return sum(map(str.isupper,text.split()))

def count_punctuations(text):
# retorna um dicinario com 32 pontuacoes e contadores associados
    punctuations="!#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
    d=dict()
    for i in punctuations:
        d[str(i)+' count']=text.count(i)
    return d

def count_words_in_quotes(text):
    x = re.findall('"([^"]*)"', text)
    count=0
    if x is None:
        return 0
    else:
        for i in x:
            t=i[1:-1]
            count+=count_words(t)
        return count

def count_sent(text):
    return len(nltk.sent_tokenize(text))

def count_unique_words(text):
    return len(set(text.split()))

def count_htags(text):
    x = re.findall(r'(#w[A-Za-z0-9]*)', text)
    return len(x) 

def count_mentions(text):
    x = re.findall(r'(@w[A-Za-z0-9]*)', text)
    return len(x)

def count_stopwords(text):
    stop_words = set(stopwords.words('portuguese'))  
    word_tokens = word_tokenize(text)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)



In [24]:
import nltk
from nltk.corpus import stopwords

text = "Ó mar salgado, quanto do teu sal, são lágrimas de Portugal! Por te cruzarmos, quantas mães choraram, quantos filhos em vão rezaram! Quantas noivas ficaram por casar, para que fosses nosso, ó mar! Valeu a pena? Tudo vale a pena, se a alma não é pequena. Quem quer passar além do Bojador, tem que passar além da dor. Deus ao mar o perigo e o abismo deu. Mas nele é que espelhou o céu."

dict_features = {}

dict_features['char_count'] = count_chars(text)
dict_features['word_count'] = count_words(text)
dict_features['stopword_count'] = count_stopwords(text)
dict_features['unique_word_count'] = count_unique_words(text)
dict_features['sent_count'] = count_sent(text)

dict_features['avg_wordlength'] = int(dict_features['char_count'])/int(dict_features['word_count'])
dict_features['avg_sentlength'] = dict_features['word_count']/dict_features['sent_count']
dict_features['unique_vs_words'] = dict_features['unique_word_count']/dict_features['word_count']
dict_features['stopwords_vs_words'] = dict_features['stopword_count']/dict_features['word_count']

In [25]:
dict_features

{'char_count': 382,
 'word_count': 74,
 'stopword_count': 27,
 'unique_word_count': 63,
 'sent_count': 8,
 'avg_wordlength': 5.162162162162162,
 'avg_sentlength': 9.25,
 'unique_vs_words': 0.8513513513513513,
 'stopwords_vs_words': 0.36486486486486486}