# Udemy: NLP - Natural Language Processing with Python (Notes)

https://www.udemy.com/course/nlp-natural-language-processing-with-python/

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
import os
data_folder = os.path.abspath(os.getcwd()).replace("\\", "/") + "/"

#### INTRODUCTION

### 1) PYTHON TEXT BASICS

#### 1.1) Working with Text Files

In [3]:
data_folder_0 = data_folder + '00-Python-Text-Basics'

In [4]:
name = 'Fred'

# Using the old .format() method:
print('His name is {}.'.format(name))

# Using f-strings:
print(f"His name is {name}.")

His name is Fred.
His name is Fred.


In [5]:
%%writefile test.txt
Hello, this is a quick test file.
This is the second line of the file.

Overwriting test.txt


In [6]:
library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]

for book in library:
    print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')

Author     Topic    Pages  
Twain      Rafting      601
Feynman    Physics       95
Hamilton   Mythology     144


In [7]:
len(('Twain', 'Rafting', 601)[1])
len(str(('Twain', 'Rafting', 601)[2]))

3

In [8]:
max([len(str(i[2])) for i in library])

5

In [9]:
%%script false
max0 = max([len(i[0]) for i in library])
max1 = max([len(i[1]) for i in library])
max2 = max([len(str(i[2])) for i in library])

for book in library:
    print(f"{book[0]:{max0} {book[1]:{max1} {book[2]:{max2}")

Couldn't find program: 'false'


In [10]:
for book in library:
    print(book[0])
    
('Author', 'Topic', 'Pages')[1]

Author
Twain
Feynman
Hamilton


'Topic'

In [11]:
myfile = open('test.txt')
myfile

<_io.TextIOWrapper name='test.txt' mode='r' encoding='cp1252'>

In [12]:
myfile.read()

'Hello, this is a quick test file.\nThis is the second line of the file.\n'

In [13]:
myfile.read()

''

In [14]:
open('test.txt').read()

'Hello, this is a quick test file.\nThis is the second line of the file.\n'

#### 1.2) Working with PDF files

In [15]:
import PyPDF2

#### 1.3) Regular Expressions

In [16]:
import re

https://docs.python.org/3/howto/regex.html

### 2) NATURAL LANGUAGE PROCESSING BASICS

In [17]:
data_folder_1 = data_folder + '01-NLP-Python-Basics'

#### 2.1) Spacy basics

In [18]:
# pt_core_news_sm
import spacy
nlp = spacy.load('pt_core_news_sm')

# Create a Doc object
doc = nlp("""Na minha atividade actividade profissional eu, Tiago, consegui aprofundar melhor os meus conhecimentos em 
Python, R e gestão de equipas\n""")

# Print each token separately
for token in doc:
#     print(token.text, token.pos_, token.dep_)
    print(f'{token.text:{10}} {token.pos_:{10}} {token.dep_:{10}}')

Na         PROPN      nsubj     
minha      DET        det       
atividade  NOUN       nmod:npmod
actividade ADJ        flat:name 
profissional ADJ        amod      
eu         PRON       flat:name 
,          PUNCT      punct     
Tiago      PROPN      appos     
,          PUNCT      punct     
consegui   VERB       ROOT      
aprofundar VERB       xcomp     
melhor     ADV        advmod    
os         DET        det       
meus       DET        det       
conhecimentos SYM        obj       
em         ADP        case      

          SPACE                
Python     PROPN      nmod      
,          PUNCT      punct     
R          PROPN      ROOT      
e          CCONJ      cc        
gestão     NOUN       conj      
de         ADP        case      
equipas    NOUN       nmod      

          SPACE                


In [19]:
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(u'Tesla is looking at buying the wonderful U.S. startup Ikari for $6 million')

# Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
the DET det
wonderful ADJ amod
U.S. PROPN compound
startup NOUN compound
Ikari PROPN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


#### 2.2) Tokenization

In [20]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 100})
displacy.render(doc, style='ent', jupyter=True)

#### 2.3) Stemming

In [21]:
# Import the toolkit and the full Porter Stemmer library
import nltk

from nltk.stem.porter import *

In [22]:
p_stemmer = PorterStemmer()

words = ['run','runner','running','ran','runs','easily','fairly']
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [23]:
from nltk.stem.snowball import SnowballStemmer  # the 2nd Porter stemmer - more sofisticated

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

#### 2.4) Lemmatization

In [24]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [25]:
doc2 = nlp(u"I saw eighteen mice today!. Yeah.")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !
.            PUNCT  12646065887601541794   .
Yeah         INTJ   11852442279192850303   yeah
.            PUNCT  12646065887601541794   .


In [26]:
list(doc2.sents)

[I saw eighteen mice today!., Yeah.]

#### 2.5) Stop Words

In [27]:
len(nlp.Defaults.stop_words)

326

#### 2.6) Vocabulary and matching

In [28]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [29]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

#### PART OF SPEECH TAGGING (Pos Tag) & NAMED ENTITY RECOGNITION (NER)

In [30]:
data_folder_2 = data_folder + '02-Parts-of-Speech-Tagging'

In [31]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [32]:
sorted_values = sorted(POS_counts.items(), key=lambda x: x[1], reverse=True)
print(sorted_values, "\n")

for pos, occ in sorted_values:
    print(f'{pos}. {doc.vocab[pos].text:{5}}: {occ}')

[(84, 3), (92, 3), (90, 2), (100, 1), (85, 1), (94, 1), (97, 1)] 

84. ADJ  : 3
92. NOUN : 3
90. DET  : 2
100. VERB : 1
85. ADP  : 1
94. PART : 1
97. PUNCT: 1


In [33]:
doc = nlp("""Can I please borrow 500 dollars from you to buy some Microsoft stock? I'll give it back in 250 hours.
More than a day""")

for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG
250 hours 20 22 91 100 TIME
More than a day 24 28 102 117 DATE


In [34]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [35]:
from spacy.pipeline import SentenceSegmenter

#### TEXT CLASSIFICATION

In [36]:
data_folder_3 = data_folder + '03-Text-Classification'

In [37]:
# tf-idf...
for i in range(1, 10):
    print(np.log(i))

0.0
0.6931471805599453
1.0986122886681098
1.3862943611198906
1.6094379124341003
1.791759469228055
1.9459101490553132
2.0794415416798357
2.1972245773362196


In [38]:
print(np.log(500))
print(np.log(1000))

6.214608098422191
6.907755278982137


In [39]:
phrase = "that ain't good for ya"

print(phrase.split())
print([t for t in nlp(phrase)])

['that', "ain't", 'good', 'for', 'ya']
[that, ai, n't, good, for, ya]


#### SEMANTICS AND SENTIMENT ANALYSIS

In [40]:
data_folder_4 = data_folder + '04-Semantics-and-Sentiment-Analysis'

In [41]:
# softmax
import numpy as np
a = [1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0]
print(np.exp(a))
print(np.sum(np.exp(a)) )

np.exp(a) / np.sum(np.exp(a)) 

[ 2.71828183  7.3890561  20.08553692 54.59815003  2.71828183  7.3890561
 20.08553692]
114.98389973429897


array([0.02364054, 0.06426166, 0.1746813 , 0.474833  , 0.02364054,
       0.06426166, 0.1746813 ])

Up to now we've been using spaCy's smallest English language model, ***en_core_web_sm (35MB)***, which provides vocabulary, syntax, and entities, but not vectors. To take advantage of built-in word vectors we'll need a larger library. We have a few options:

- ***en_core_web_md (116MB) Vectors***: 685k keys, 20k unique vectors (300 dimensions)
or
- ***en_core_web_lg (812MB) Vectors***: 685k keys, 685k unique vectors (300 dimensions)

<br>
If you plan to rely heavily on word vectors, consider using spaCy's largest vector library containing over one million unique vectors:

- **en_vectors_web_lg (631MB) Vectors**: 1.1m keys, 1.1m unique vectors (300 dimensions)

In [42]:
%%time
nlp = spacy.load('en_core_web_md')

Wall time: 34.5 s


In [43]:
print(nlp('what').vector.shape)
print(nlp('what').vector[:10])
print("\n")
print(nlp('fox').vector.shape)
print(nlp('fox').vector[:10])

(300,)
[-0.038548  0.54252  -0.21843  -0.18855   0.073     0.1318   -0.10402
  0.17231  -0.051587  2.8646  ]


(300,)
[-0.34868  -0.07772   0.17775  -0.094953 -0.45289   0.23779   0.20944
  0.037886  0.035064  0.89901 ]


In [44]:
print(nlp('what fox').vector.shape)
print(nlp('what fox').vector[:10])

(300,)
[-0.19361399  0.2324     -0.02034    -0.1417515  -0.18994501  0.18479499
  0.05271     0.10509799 -0.0082615   1.881805  ]


In [45]:
print(np.mean([nlp('what').vector[:10], nlp('fox').vector[:10]], axis=0))

[-0.19361399  0.2324     -0.02034    -0.1417515  -0.18994501  0.18479499
  0.05271     0.10509799 -0.0082615   1.881805  ]


In [46]:
# é a média entre os tokens
np.mean([nlp('what').vector[:10], nlp('fox').vector[:10]], axis=0) == nlp('what fox').vector[:10]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [47]:
print(nlp('lion').similarity(nlp('tiger')), "\n")


# like - hate - depends on context. change
print(nlp('I like pizza.').similarity(nlp('I hate pizza.')))
print("""tirado do DataCamp .- curso com a criadora do spacy 
There's no objective definition of similarity. Depends on the context and on what the application needs to do.

Once you're getting serious about developing NLP applications that leverage semantic similarity, you might want
to train vectors on your own data, or tweak the similarity algorithm.""")

0.7359829457249657 

0.9622129730267112
tirado do DataCamp .- curso com a criadora do spacy 
There's no objective definition of similarity. Depends on the context and on what the application needs to do.

Once you're getting serious about developing NLP applications that leverage semantic similarity, you might want
to train vectors on your own data, or tweak the similarity algorithm.


In [48]:
# aggregate the 300 dimension vectors into a Euclidian (L2) norm - square root of the sum-of-squared-vectors
print(nlp('lion').vector_norm)
print(sum(nlp('lion').vector**2)**0.5)
# np.linalg.norm(nlp('lion').vector)  # igual

6.512089421494053
6.512089421494053


In [49]:
# king - man + woman = queen
kmw = nlp.vocab['king'].vector - nlp.vocab['man'].vector + nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector


from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([kmw], [queen])

array([[0.78808445]], dtype=float32)

In [50]:
%%time

from sklearn.metrics.pairwise import cosine_similarity

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Now we find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity([new_vector], [word.vector])
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']
Wall time: 2min 13s


In [51]:
# passar o vetor para uma palavra (inventada?)

In [52]:
%%time

from sklearn.metrics.pairwise import cosine_similarity

new_vector = nlp.vocab['Python'].vector

# Now we find the closest vector in the vocabulary 
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity([new_vector], [word.vector])
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

['distutils', 'python', 'tkinter', 'nltk', 'numpy', 'scipy', 'capybara', 'reticulated', 'pygtk', 'argparse']
Wall time: 2min 10s


In [53]:
# Sentiment Analysis

In [54]:
#VADER - compound?
a = {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

#### TOPIC MODELING

In [55]:
data_folder_5 = data_folder + '05-Topic-Modeling'

#### DEEP LEARNING FOR NLP

In [56]:
data_folder_6 = data_folder + '06-Deep-Learning'

In [57]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [58]:
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [59]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [60]:
from keras.utils import to_categorical
y_cat = to_categorical(y)
y_cat[:5]

Using TensorFlow backend.


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_cat_train, y_cat_test = train_test_split(X, y_cat, test_size=0.33, random_state=42)

In [62]:
# Usually when using Neural Networks, you will get better performance when you standardize the data. 

from sklearn.preprocessing import MinMaxScaler

scaler_object = MinMaxScaler()

scaler_object.fit(X_train)  # fit apenas ao X_train por motivos de information leak
scaled_X_train = scaler_object.transform(X_train)
scaled_X_test = scaler_object.transform(X_test)  

In [63]:
scaled_X_train[:5]

array([[0.41176471, 0.40909091, 0.55357143, 0.5       ],
       [0.97058824, 0.45454545, 0.98214286, 0.83333333],
       [0.38235294, 0.45454545, 0.60714286, 0.58333333],
       [0.23529412, 0.68181818, 0.05357143, 0.04166667],
       [1.        , 0.36363636, 1.        , 0.79166667]])

In [64]:
scaled_X_test[:5]

array([[0.52941176, 0.36363636, 0.64285714, 0.45833333],
       [0.41176471, 0.81818182, 0.10714286, 0.08333333],
       [1.        , 0.27272727, 1.03571429, 0.91666667],
       [0.5       , 0.40909091, 0.60714286, 0.58333333],
       [0.73529412, 0.36363636, 0.66071429, 0.54166667]])

In [65]:
print(scaled_X_train.max())
print(scaled_X_test.max()) # mas tem que ser por motivos de information leak

1.0
1.0909090909090908


##### Building the Network with Keras

In [66]:
from keras.models import Sequential
from keras.layers import Dense

In [67]:
model = Sequential()
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 27        
Total params: 139
Trainable params: 139
Non-trainable params: 0
_________________________________________________________________


**Fit (Train) the Model**

In [69]:
# Play around with number of epochs as well
model.fit(scaled_X_train, y_cat_train, epochs=200, verbose=2)

Epoch 1/200
 - 0s - loss: 1.1063 - accuracy: 0.2100
Epoch 2/200
 - 0s - loss: 1.1029 - accuracy: 0.3700
Epoch 3/200
 - 0s - loss: 1.0996 - accuracy: 0.3800
Epoch 4/200
 - 0s - loss: 1.0967 - accuracy: 0.4500
Epoch 5/200
 - 0s - loss: 1.0942 - accuracy: 0.5100
Epoch 6/200
 - 0s - loss: 1.0920 - accuracy: 0.5500
Epoch 7/200
 - 0s - loss: 1.0901 - accuracy: 0.5600
Epoch 8/200
 - 0s - loss: 1.0883 - accuracy: 0.5800
Epoch 9/200
 - 0s - loss: 1.0865 - accuracy: 0.5800
Epoch 10/200
 - 0s - loss: 1.0848 - accuracy: 0.5800
Epoch 11/200
 - 0s - loss: 1.0831 - accuracy: 0.5800
Epoch 12/200
 - 0s - loss: 1.0813 - accuracy: 0.5700
Epoch 13/200
 - 0s - loss: 1.0795 - accuracy: 0.5900
Epoch 14/200
 - 0s - loss: 1.0776 - accuracy: 0.6000
Epoch 15/200
 - 0s - loss: 1.0758 - accuracy: 0.6800
Epoch 16/200
 - 0s - loss: 1.0737 - accuracy: 0.6000
Epoch 17/200
 - 0s - loss: 1.0716 - accuracy: 0.6400
Epoch 18/200
 - 0s - loss: 1.0693 - accuracy: 0.6700
Epoch 19/200
 - 0s - loss: 1.0671 - accuracy: 0.6100
Ep

Epoch 155/200
 - 0s - loss: 0.4614 - accuracy: 0.8900
Epoch 156/200
 - 0s - loss: 0.4589 - accuracy: 0.8900
Epoch 157/200
 - 0s - loss: 0.4568 - accuracy: 0.8700
Epoch 158/200
 - 0s - loss: 0.4541 - accuracy: 0.8600
Epoch 159/200
 - 0s - loss: 0.4525 - accuracy: 0.8500
Epoch 160/200
 - 0s - loss: 0.4500 - accuracy: 0.8500
Epoch 161/200
 - 0s - loss: 0.4477 - accuracy: 0.8500
Epoch 162/200
 - 0s - loss: 0.4453 - accuracy: 0.8500
Epoch 163/200
 - 0s - loss: 0.4420 - accuracy: 0.8500
Epoch 164/200
 - 0s - loss: 0.4386 - accuracy: 0.8500
Epoch 165/200
 - 0s - loss: 0.4351 - accuracy: 0.8900
Epoch 166/200
 - 0s - loss: 0.4316 - accuracy: 0.8900
Epoch 167/200
 - 0s - loss: 0.4288 - accuracy: 0.8900
Epoch 168/200
 - 0s - loss: 0.4258 - accuracy: 0.8900
Epoch 169/200
 - 0s - loss: 0.4230 - accuracy: 0.8900
Epoch 170/200
 - 0s - loss: 0.4206 - accuracy: 0.8900
Epoch 171/200
 - 0s - loss: 0.4174 - accuracy: 0.9100
Epoch 172/200
 - 0s - loss: 0.4151 - accuracy: 0.9100
Epoch 173/200
 - 0s - loss: 

<keras.callbacks.callbacks.History at 0x1ad49a96a48>

**Predicting New Unseen Data**

Let's see how we did by predicting on new data. Remember, our model has never seen the test data that we scaled previously! This process is the exact same process you would use on totally brand new data.

In [70]:
scaled_X_test[:5]

array([[0.52941176, 0.36363636, 0.64285714, 0.45833333],
       [0.41176471, 0.81818182, 0.10714286, 0.08333333],
       [1.        , 0.27272727, 1.03571429, 0.91666667],
       [0.5       , 0.40909091, 0.60714286, 0.58333333],
       [0.73529412, 0.36363636, 0.66071429, 0.54166667]])

In [71]:
# Spits out probabilities by default.
model.predict(scaled_X_test)

array([[5.06768227e-02, 7.46325672e-01, 2.02997476e-01],
       [9.92947698e-01, 6.91933557e-03, 1.32957008e-04],
       [3.48246694e-02, 3.63085717e-01, 6.02089584e-01],
       [4.66439389e-02, 6.48943543e-01, 3.04412544e-01],
       [4.70929332e-02, 6.58475935e-01, 2.94431120e-01],
       [9.81330454e-01, 1.80526637e-02, 6.16847188e-04],
       [5.68658002e-02, 7.61135638e-01, 1.81998596e-01],
       [3.48246694e-02, 3.63085717e-01, 6.02089584e-01],
       [3.48592885e-02, 3.77847761e-01, 5.87292969e-01],
       [5.06429709e-02, 7.45329261e-01, 2.04027697e-01],
       [3.73332947e-02, 4.59844649e-01, 5.02822101e-01],
       [9.77085352e-01, 2.22442020e-02, 6.70461624e-04],
       [9.91768479e-01, 8.05437006e-03, 1.77095280e-04],
       [9.80609000e-01, 1.88728124e-02, 5.18248824e-04],
       [9.93060827e-01, 6.82041794e-03, 1.18782533e-04],
       [4.99991253e-02, 7.27292776e-01, 2.22708121e-01],
       [3.48246694e-02, 3.63085717e-01, 6.02089584e-01],
       [5.00011072e-02, 7.27346

In [72]:
# prints classes
model.predict_classes(scaled_X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 1,
       0, 2, 1, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 1, 1, 2], dtype=int64)

In [73]:
len(model.predict(scaled_X_test)) == len(model.predict_classes(scaled_X_test)) == 50

True

**Evaluating Model Performance**

So how well did we do? How do we actually measure "well". Is 95% accuracy good enough? It all depends on the situation. Also we need to take into account things like recall and precision. 

In [74]:
model.metrics_names

['loss', 'accuracy']

In [75]:
model.evaluate(x=scaled_X_test, y=y_cat_test)



[0.3144051456451416, 0.8999999761581421]

In [76]:
from sklearn.metrics import confusion_matrix, classification_report

In [77]:
predictions = model.predict_classes(scaled_X_test)
predictions

array([1, 0, 2, 1, 1, 0, 1, 2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 1,
       0, 2, 1, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 1, 1, 2], dtype=int64)

In [78]:
display(y_cat_test[:5])
y_cat_test.argmax(axis=1)  # para comparar com o mesmo formato das predictions

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 2, 2, 1, 2], dtype=int64)

In [79]:
# confusion matrix
confusion_matrix(y_cat_test.argmax(axis=1), predictions)

array([[19,  0,  0],
       [ 0, 14,  1],
       [ 0,  4, 12]], dtype=int64)

In [80]:
print(classification_report(y_cat_test.argmax(axis=1),predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.78      0.93      0.85        15
           2       0.92      0.75      0.83        16

    accuracy                           0.90        50
   macro avg       0.90      0.89      0.89        50
weighted avg       0.91      0.90      0.90        50



**Saving and Loading Models**

Now that we have a model trained, let's see how we can save and load it.

In [81]:
model.save('myfirstmodel.h5')

In [82]:
from keras.models import load_model

newmodel = load_model('myfirstmodel.h5')

newmodel.predict_classes(scaled_X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 2, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 1,
       0, 2, 1, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 1, 1, 2], dtype=int64)

#### Text Generation

In [83]:
def read_file(filepath):    
    with open(filepath) as f:
        str_text = f.read()    
    return str_text

##############################################################

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger','ner'])
nlp.max_length = 1198623

##############################################################

def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in 
            '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

d = read_file(data_folder_6 + '/melville-moby_dick.txt')
tokens = separate_punc(d)

##############################################################

# organize into sequences of tokens
train_len = 25+1 # 50 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]    
    # Add to list of sequences
    text_sequences.append(seq)


In [84]:
# moby dick trained model (>3hours) - usar o já existente

from keras.models import load_model
from pickle import load

model = load_model(data_folder_6 + '/epochBIG.h5')

tokenizer = load(open(data_folder_6 + '/epochBIG', 'rb'))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [85]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

In [86]:
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)
seed_text

'stranger that stubb vowed he recognised his cutting spade pole entangled in the lines that were knotted round the tail of one of these whales there'

In [87]:
# definido previamente (ver outro ipynb)
seq_len = 25

generate_text(model, tokenizer, seq_len, seed_text=seed_text, num_gen_words=50)

"i thought i did boys tail men wondrous madness he does very laid oh ye fly me much with a greatest brother of jackasses out the latter 's castaways let very fast when well i 'd go a canakin of our walls and what and personally get to go a"