In [1]:
#############################################################################################################
##### Notebook Processamento de Linguagem natural (PLN)
##### Baseado em:
## Natural Language Processing with Python (book)
##
##############################################################################################################
## Objetivos:
##   Mostrar varios metodos de linguagem natural utilizando Python
###################################################################################################################

In [2]:
################################################
### 01 - Tokenizacao
################################################

from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

text = "This is Mary's car, isn't it?"
tk_list = []
tk_list.append(WhitespaceTokenizer()) 
tk_list.append(WordPunctTokenizer())
tk_list.append(TreebankWordTokenizer())

for tk in tk_list:
    result = tk.tokenize(text) 
    print(result)


['This', 'is', "Mary's", 'car,', "isn't", 'it?']
['This', 'is', 'Mary', "'", 's', 'car', ',', 'isn', "'", 't', 'it', '?']
['This', 'is', 'Mary', "'s", 'car', ',', 'is', "n't", 'it', '?']


In [3]:
### Tokenizacao em portugues

import nltk
from nltk import tokenize

text = "guarda-chuva Se a única coisa que de o homem terá certeza é a morte; a única certeza do brasileiro é o carnaval no próximo ano." # Graciliano Ramos

result = tokenize.word_tokenize(text, language='portuguese') 

print(result)

['guarda-chuva', 'Se', 'a', 'única', 'coisa', 'que', 'de', 'o', 'homem', 'terá', 'certeza', 'é', 'a', 'morte', ';', 'a', 'única', 'certeza', 'do', 'brasileiro', 'é', 'o', 'carnaval', 'no', 'próximo', 'ano', '.']


In [4]:
################################################
### 02 - Stemming
################################################

import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

example_words = ["program","programming","programer","programs","programmed"]

# Perform stemming
print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in example_words:
   print ("{0:20}{1:20}".format(word, ps.stem(word)))

--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


In [5]:
example_words = ["programmers", "because", "people"]
print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in example_words:
   print ("{0:20}{1:20}".format(word, ps.stem(word)))


--Word--            --Stem--            
programmers         programm            
because             becaus              
people              peopl               


In [6]:
################################################
### 03 - Lemmanization
################################################

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wnl = WordNetLemmatizer()
words = ['dogs', 'churches', 'aardwolves', 'abaci', 'hardrock']

for w in words:
    print(wnl.lemmatize(w))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Masmok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


dog
church
aardwolf
abacus
hardrock


In [7]:
################################################
### 04 - Stopwords
################################################

#nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
print(stopwords[:15])
len(stopwords)


['a', 'à', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'às', 'até', 'com', 'como', 'da']


207

In [8]:
################################################
### 04 - Caracterizacao das palavras - Tfidf
################################################

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
texts = ["bad movie", "not a good movie", "did not like", "i like it", "good me"]

tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1 ,2))
features = tfidf.fit_transform(texts)
df = pd.DataFrame(
    features.todense(),
    columns=tfidf.get_feature_names_out()
)
print(df)

      good      like    movie       not
0  0.00000  0.000000  1.00000  0.000000
1  0.57735  0.000000  0.57735  0.577350
2  0.00000  0.707107  0.00000  0.707107
3  0.00000  1.000000  0.00000  0.000000
4  1.00000  0.000000  0.00000  0.000000


In [9]:
################################################
### Extra - Lembrando como fazer hashing
################################################

import hashlib

def hash_token(token, b):
    hash_object = hashlib.sha256()
    hash_object.update(token.encode()) # UTF-8 encode
    return int(hash_object.hexdigest(), 16) % (2**b)

# Example usage
b = 10  # Number of buckets for hashing
token = "dfadfasdfasdfasdfadfadfadfasdfasdfasdfa"
hashed_value = hash_token(token, b)
print(hashed_value)


972


In [11]:
################################################
### 05 - Caracterizacao das palavras - wordvector
################################################

from gensim.models import Word2Vec

# Sentencas
sentences = [["gato", "persegue", "rato"], ["cachorro", "late", "muito"], ["lobo", "uiva"]]

# Treinamento do modelo Word2Vec
model = Word2Vec(sentences, min_count=1) # ignora palavras com frequencia abaixo de...

vector = model.wv['gato']
print("Vetor representando 'gato':", vector)

# Find similar words
similar_words = model.wv.most_similar('gato')
print("Similaridade das palavras em relacao a 'gato 'gato':", similar_words)


Vetor representando 'gato': [ 8.1681199e-03 -4.4430327e-03  8.9854337e-03  8.2536647e-03
 -4.4352221e-03  3.0310510e-04  4.2744912e-03 -3.9263200e-03
 -5.5599655e-03 -6.5123225e-03 -6.7073823e-04 -2.9592158e-04
  4.4630850e-03 -2.4740540e-03 -1.7260908e-04  2.4618758e-03
  4.8675989e-03 -3.0808449e-05 -6.3394094e-03 -9.2608072e-03
  2.6657581e-05  6.6618943e-03  1.4660227e-03 -8.9665223e-03
 -7.9386048e-03  6.5519023e-03 -3.7856805e-03  6.2549924e-03
 -6.6810320e-03  8.4796622e-03 -6.5163244e-03  3.2880199e-03
 -1.0569858e-03 -6.7875278e-03 -3.2875966e-03 -1.1614120e-03
 -5.4709399e-03 -1.2113475e-03 -7.5633135e-03  2.6466595e-03
  9.0701487e-03 -2.3772502e-03 -9.7651005e-04  3.5135616e-03
  8.6650876e-03 -5.9218528e-03 -6.8875779e-03 -2.9329848e-03
  9.1476962e-03  8.6626766e-04 -8.6784009e-03 -1.4469790e-03
  9.4794659e-03 -7.5494875e-03 -5.3580985e-03  9.3165627e-03
 -8.9737261e-03  3.8259076e-03  6.6544057e-04  6.6607012e-03
  8.3127534e-03 -2.8507852e-03 -3.9923131e-03  8.8979173e

In [12]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

# Sample corpus
corpus = [
    "I like to eat apples",
    "I like bananas",
    "I enjoy eating oranges"
]

# Tokenize the corpus
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, sg=0)

# Get the word vector for a word
word_vector = model.wv["apples"]
print("Vector for 'apples':", word_vector)

# Find similar words
similar_words = model.wv.most_similar("apples")
print("Similar words to 'apples':", similar_words)

Vector for 'apples': [ 8.13227147e-03 -4.45733406e-03 -1.06835726e-03  1.00636482e-03
 -1.91113955e-04  1.14817743e-03  6.11386076e-03 -2.02715401e-05
 -3.24596534e-03 -1.51072862e-03  5.89729892e-03  1.51410222e-03
 -7.24261976e-04  9.33324732e-03 -4.92128357e-03 -8.38409644e-04
  9.17541143e-03  6.74942741e-03  1.50285603e-03 -8.88256077e-03
  1.14874600e-03 -2.28825561e-03  9.36823711e-03  1.20992784e-03
  1.49006362e-03  2.40640994e-03 -1.83600665e-03 -4.99963388e-03
  2.32429506e-04 -2.01418041e-03  6.60093315e-03  8.94012302e-03
 -6.74754381e-04  2.97701475e-03 -6.10765442e-03  1.69932481e-03
 -6.92623248e-03 -8.69402662e-03 -5.90020278e-03 -8.95647518e-03
  7.27759488e-03 -5.77203138e-03  8.27635173e-03 -7.24354526e-03
  3.42167495e-03  9.67499893e-03 -7.78544787e-03 -9.94505733e-03
 -4.32914635e-03 -2.68313056e-03 -2.71289347e-04 -8.83155130e-03
 -8.61755759e-03  2.80021061e-03 -8.20640661e-03 -9.06933658e-03
 -2.34046578e-03 -8.63180775e-03 -7.05664977e-03 -8.40115082e-03
 -3.

In [13]:
##################################################################################################################
##################################################################################################################
##################################################################################################################

In [14]:
def readData():
    data = ['This is a  dog','This is a cat','I love my cat','This is my name ']
    dat=[]
    for i in range(len(data)):
        for word in data[i].split():
            dat.append(word)
    print(dat)
    return dat

def createBigram(data):
   listOfBigrams = []
   bigramCounts = {}
   unigramCounts = {}
   for i in range(len(data)-1):
      if i < len(data) - 1 and data[i+1].islower():

         listOfBigrams.append((data[i], data[i + 1]))

         if (data[i], data[i+1]) in bigramCounts:
            bigramCounts[(data[i], data[i + 1])] += 1
         else:
            bigramCounts[(data[i], data[i + 1])] = 1

      if data[i] in unigramCounts:
         unigramCounts[data[i]] += 1
      else:
         unigramCounts[data[i]] = 1
   return listOfBigrams, unigramCounts, bigramCounts


def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    listOfProb = {}
    for bigram in listOfBigrams:
        word1 = bigram[0]
        word2 = bigram[1]
        listOfProb[bigram] = (bigramCounts.get(bigram))/(unigramCounts.get(word1))
    return listOfProb


if __name__ == '__main__':
    data = readData()
    listOfBigrams, unigramCounts, bigramCounts = createBigram(data)

    print("\n All the possible Bigrams are ")
    print(listOfBigrams)

    print("\n Bigrams along with their frequency ")
    print(bigramCounts)

    print("\n Unigrams along with their frequency ")
    print(unigramCounts)

    bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)

    print("\n Bigrams along with their probability ")
    print(bigramProb)
    inputList="This is my cat"
    splt=inputList.split()
    outputProb1 = 1
    bilist=[]
    bigrm=[]

    for i in range(len(splt) - 1):
        if i < len(splt) - 1:

            bilist.append((splt[i], splt[i + 1]))

    print("\n The bigrams in given sentence are ")
    print(bilist)
    for i in range(len(bilist)):
        if bilist[i] in bigramProb:

            outputProb1 *= bigramProb[bilist[i]]
        else:

            outputProb1 *= 0
    print('\n' + 'Probablility of sentence \"This is my cat\" = ' + str(outputProb1))

['This', 'is', 'a', 'dog', 'This', 'is', 'a', 'cat', 'I', 'love', 'my', 'cat', 'This', 'is', 'my', 'name']

 All the possible Bigrams are 
[('This', 'is'), ('is', 'a'), ('a', 'dog'), ('This', 'is'), ('is', 'a'), ('a', 'cat'), ('I', 'love'), ('love', 'my'), ('my', 'cat'), ('This', 'is'), ('is', 'my'), ('my', 'name')]

 Bigrams along with their frequency 
{('This', 'is'): 3, ('is', 'a'): 2, ('a', 'dog'): 1, ('a', 'cat'): 1, ('I', 'love'): 1, ('love', 'my'): 1, ('my', 'cat'): 1, ('is', 'my'): 1, ('my', 'name'): 1}

 Unigrams along with their frequency 
{'This': 3, 'is': 3, 'a': 2, 'dog': 1, 'cat': 2, 'I': 1, 'love': 1, 'my': 2}

 Bigrams along with their probability 
{('This', 'is'): 1.0, ('is', 'a'): 0.6666666666666666, ('a', 'dog'): 0.5, ('a', 'cat'): 0.5, ('I', 'love'): 1.0, ('love', 'my'): 1.0, ('my', 'cat'): 0.5, ('is', 'my'): 0.3333333333333333, ('my', 'name'): 0.5}

 The bigrams in given sentence are 
[('This', 'is'), ('is', 'my'), ('my', 'cat')]

Probablility of sentence "This is 

In [17]:
import joblib
from nltk import word_tokenize

teste_tagger = joblib.load('..\Datasets\POS_tagger_brill.pkl')
phrase = 'O rato roeu a roupa do rei de Roma'
teste_tagger.tag(word_tokenize(phrase))


[('O', 'ART'),
 ('rato', 'N'),
 ('roeu', 'V'),
 ('a', 'ART'),
 ('roupa', 'N'),
 ('do', 'KS'),
 ('rei', 'N'),
 ('de', 'PREP'),
 ('Roma', 'NPROP')]

In [18]:
import nltk
from nltk import word_tokenize
import numpy as np
import joblib

nltk.download('mac_morpho')

[nltk_data] Downloading package mac_morpho to
[nltk_data]     C:\Users\Masmok\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\mac_morpho.zip.


True

In [19]:
dataset = list(nltk.corpus.mac_morpho.tagged_sents())

In [20]:
dataset[500]

[('Tinha', 'VAUX'),
 ('sido', 'PCP'),
 ('apresentado', 'PCP'),
 ('por', 'PREP|+'),
 ('a', 'ART'),
 ('criadora', 'N'),
 ('Simone', 'NPROP'),
 ('Nowak', 'NPROP')]

In [21]:
tot = len(dataset)
tot_train_samples = int(np.ceil(tot*.8))

train_data = dataset[:tot_train_samples]
test_data = dataset[tot_train_samples:]

In [22]:
t_def = nltk.DefaultTagger('N')
t_affix2 = nltk.AffixTagger(train_data, affix_length=-2, backoff=t_def)
t_affix3 = nltk.AffixTagger(train_data, affix_length=-3, backoff=t_affix2)
t_affix4 = nltk.AffixTagger(train_data, affix_length=-4, backoff=t_affix3)
t_affix5 = nltk.AffixTagger(train_data, affix_length=-5, backoff=t_affix4)
t_affix6 = nltk.AffixTagger(train_data, affix_length=-6, backoff=t_affix5)

acc_def = t_def.evaluate(test_data) * 100
acc_af2 = t_affix2.evaluate(test_data) * 100
acc_af3 = t_affix3.evaluate(test_data) * 100
acc_af4 = t_affix4.evaluate(test_data) * 100
acc_af5 = t_affix5.evaluate(test_data) * 100
acc_af6 = t_affix6.evaluate(test_data) * 100

print('''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%'''.format(acc_def, acc_af2, acc_af3,
                                                          acc_af4, acc_af5, acc_af6))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_def = t_def.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af2 = t_affix2.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af3 = t_affix3.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af4 = t_affix4.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af5 = t_affix5.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af6 = t_affix6.evaluate(test_data) * 100


Performance dos taggers:
         - Default:                     19.68%
         - Sufixo tamanho 2 + Default:  27.29%
         - Sufixo tamanho 3 + Sufixo 2: 32.23%
         - Sufixo tamanho 4 + Sufixo 3: 34.66%
         - Sufixo tamanho 5 + Sufixo 4: 36.24%
         - Sufixo tamanho 6 + Sufixo 5: 36.71%


In [23]:
t_def = nltk.DefaultTagger('N')
t_affix2 = nltk.AffixTagger(train_data, affix_length=-2, backoff=t_def)
t_affix3 = nltk.AffixTagger(train_data, affix_length=-3, backoff=t_affix2)
t_affix4 = nltk.AffixTagger(train_data, affix_length=-4, backoff=t_affix3)
t_affix5 = nltk.AffixTagger(train_data, affix_length=-5, backoff=t_affix4)
t_affix6 = nltk.AffixTagger(train_data, affix_length=-6, backoff=t_affix5)

acc_def = t_def.evaluate(test_data) * 100
acc_af2 = t_affix2.evaluate(test_data) * 100
acc_af3 = t_affix3.evaluate(test_data) * 100
acc_af4 = t_affix4.evaluate(test_data) * 100
acc_af5 = t_affix5.evaluate(test_data) * 100
acc_af6 = t_affix6.evaluate(test_data) * 100

print('''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%'''.format(acc_def, acc_af2, acc_af3,
                                                          acc_af4, acc_af5, acc_af6))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_def = t_def.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af2 = t_affix2.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af3 = t_affix3.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af4 = t_affix4.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af5 = t_affix5.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_af6 = t_affix6.evaluate(test_data) * 100


Performance dos taggers:
         - Default:                     19.68%
         - Sufixo tamanho 2 + Default:  27.29%
         - Sufixo tamanho 3 + Sufixo 2: 32.23%
         - Sufixo tamanho 4 + Sufixo 3: 34.66%
         - Sufixo tamanho 5 + Sufixo 4: 36.24%
         - Sufixo tamanho 6 + Sufixo 5: 36.71%


In [24]:
t_uni = nltk.UnigramTagger(train_data, backoff=t_affix5)

acc_uni = t_uni.evaluate(test_data) * 100

print('''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%
         - Unigrama + Sufixo 6:         {:.2f}%'''.format(acc_def, acc_af2, acc_af3,
                                                          acc_af4, acc_af5, acc_af6,
                                                          acc_uni))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_uni = t_uni.evaluate(test_data) * 100


Performance dos taggers:
         - Default:                     19.68%
         - Sufixo tamanho 2 + Default:  27.29%
         - Sufixo tamanho 3 + Sufixo 2: 32.23%
         - Sufixo tamanho 4 + Sufixo 3: 34.66%
         - Sufixo tamanho 5 + Sufixo 4: 36.24%
         - Sufixo tamanho 6 + Sufixo 5: 36.71%
         - Unigrama + Sufixo 6:         83.70%


In [25]:
t_bi = nltk.BigramTagger(train_data, backoff=t_uni)
t_tri = nltk.TrigramTagger(train_data, backoff=t_bi)

acc_bi = t_bi.evaluate(test_data) * 100
acc_tri = t_tri.evaluate(test_data) * 100

print('''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%
         - Unigrama + Sufixo 6:         {:.2f}%
         - Bigrama + Unigrama:          {:.2f}%
         - Trigrama + Bigrama:          {:.2f}%'''.format(acc_def, acc_af2, acc_af3,
                                                          acc_af4, acc_af5, acc_af6,
                                                          acc_uni, acc_bi, acc_tri))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_bi = t_bi.evaluate(test_data) * 100
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  acc_tri = t_tri.evaluate(test_data) * 100


Performance dos taggers:
         - Default:                     19.68%
         - Sufixo tamanho 2 + Default:  27.29%
         - Sufixo tamanho 3 + Sufixo 2: 32.23%
         - Sufixo tamanho 4 + Sufixo 3: 34.66%
         - Sufixo tamanho 5 + Sufixo 4: 36.24%
         - Sufixo tamanho 6 + Sufixo 5: 36.71%
         - Unigrama + Sufixo 6:         83.70%
         - Bigrama + Unigrama:          85.18%
         - Trigrama + Bigrama:          85.19%


In [27]:
import pickle
pickle.dump(t_bi, open('..\Datasets\POS_tagger_brill.pkl', 'wb'))

In [28]:
#---

def viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]
    for st in states:
        V[0][st] = {"prob": start_p[st] * emit_p[st][obs[0]], "prev": None}
    for t in range(1, len(obs)):
        V.append({})
        for st in states:
            max_tr_prob = max(V[t-1][prev_st]["prob"]*trans_p[prev_st][st] for prev_st in states)
            for prev_st in states:
                if V[t-1][prev_st]["prob"]*trans_p[prev_st][st] == max_tr_prob:
                    max_prob = max_tr_prob * emit_p[st][obs[t]]
                    V[t][st] = {"prob": max_prob, "prev": prev_st}
                    break
    return V

In [29]:
states = ('Rainy', 'Sunny')

observations = ('walk', 'shop', 'clean')

start_probability = {'Rainy': 0.6, 'Sunny': 0.4}

transition_probability = {
'Rainy' : {'Rainy': 0.7, 'Sunny': 0.3},
'Sunny' : {'Rainy': 0.4, 'Sunny': 0.6},
}

emission_probability = {
'Rainy' : {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
'Sunny' : {'walk': 0.6, 'shop': 0.3, 'clean': 0.1},
}



In [30]:
viterbi(observations,
         states,
         start_probability,
         transition_probability,
         emission_probability)

[{'Rainy': {'prob': 0.06, 'prev': None},
  'Sunny': {'prob': 0.24, 'prev': None}},
 {'Rainy': {'prob': 0.038400000000000004, 'prev': 'Sunny'},
  'Sunny': {'prob': 0.043199999999999995, 'prev': 'Sunny'}},
 {'Rainy': {'prob': 0.01344, 'prev': 'Rainy'},
  'Sunny': {'prob': 0.0025919999999999997, 'prev': 'Sunny'}}]

In [31]:
#-----------------
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np

In [32]:
tokenizer = Tokenizer()
data = open('../input/dtspeech/DTSpeech.txt').read()
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

FileNotFoundError: [Errno 2] No such file or directory: '../input/dtspeech/DTSpeech.txt'