In [1]:
import spacy 

In [2]:
#load model 
model=spacy.load('en_core_web_sm')

In [3]:
# Create a Doc object
doc = model(u'A 12-week intensive training program on Artificial Intelligence and Machine Learning, taught to enthusiasts and prospective data scientist on the startup and corporate level.')

In [4]:
for token in doc :
    print(token.text) #token.pos_ ,pos_token.dep_
    

A
12-week
intensive
training
program
on
Artificial
Intelligence
and
Machine
Learning
,
taught
to
enthusiasts
and
prospective
data
scientist
on
the
startup
and
corporate
level
.


In [61]:
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

A          DET      DT     determiner
12-week    NUM      CD     cardinal number
intensive  ADJ      JJ     adjective
training   NOUN     NN     noun, singular or mass
program    NOUN     NN     noun, singular or mass
on         ADP      IN     conjunction, subordinating or preposition
Artificial PROPN    NNP    noun, proper singular
Intelligence PROPN    NNP    noun, proper singular
and        CCONJ    CC     conjunction, coordinating
Machine    PROPN    NNP    noun, proper singular
Learning   PROPN    NNP    noun, proper singular
,          PUNCT    ,      punctuation mark, comma
taught     VERB     VBD    verb, past tense
to         ADP      IN     conjunction, subordinating or preposition
enthusiasts NOUN     NNS    noun, plural
and        CCONJ    CC     conjunction, coordinating
prospective ADJ      JJ     adjective
data       NOUN     NN     noun, singular or mass
scientist  NOUN     NN     noun, singular or mass
on         ADP      IN     conjunction, subordinating or prepositi

In [5]:
for sent in doc.sents:
    print(sent)

A 12-week intensive training program on Artificial Intelligence and Machine Learning, taught to enthusiasts and prospective data scientist on the startup and corporate level.


In [6]:
model.pipeline 

[('tagger', <spacy.pipeline.pipes.Tagger at 0x15e66eb0988>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x15e66d9e348>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x15e66d9e408>)]

In [7]:
model.pipe_names

['tagger', 'parser', 'ner']

In [9]:
doc2=model(u'Alongside the course work, the cohort is presented with realtime case studies that help them better explore the applicability of the curriculum.')

In [10]:
for token in doc2 :
    print(token.text) #token.pos_ ,pos_token.dep_

Alongside
the
course
work
,
the
cohort
is
presented
with
realtime
case
studies
that
help
them
better
explore
the
applicability
of
the
curriculum
.


In [11]:
doc2[0]

Alongside

In [12]:
doc2[0].pos_

'ADP'

In [17]:
doc3 = model(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [18]:
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [19]:
type(life_quote)

spacy.tokens.span.Span

In [20]:
for sent in doc3.sents:
    print(sent)

Although commmonly attributed to John Lennon from his song "Beautiful Boy", the phrase "Life is what happens to us while we are making other plans" was written by cartoonist Allen Saunders and published in Reader's Digest in 1957, when Lennon was 17.


# Tokenization 

In [21]:
sent1 = '"We\'re moving to L.A.!"'
print(sent1 )

"We're moving to L.A.!"


In [22]:
sent1 =model(sent1 ) 

In [23]:
for token in sent1 :
    print(token.text) #token.pos_ ,pos_token.dep_
    

"
We
're
moving
to
L.A.
!
"


In [42]:
doc2 = model(u"We're here to help! Send your-mail, email addmision@africadataschol.com or visit us at http://www.africadataschool.com! Usd 350")

for t in doc2:
    print(t)

We
're
here
to
help
!
Send
your
-
mail
,
email
addmision@africadataschol.com
or
visit
us
at
http://www.africadataschool.com
!
Usd
350


In [25]:
len(doc2)

24

In [26]:
doc2[2:5]

course work,

# Named Entities

In [30]:
doc7 = model(u'Africa Data School is located in Nairobi')

for token in doc7:
    print(token.text, end=' | ')

print('\n----')



Africa | Data | School | is | located | in | Nairobi | 
----


In [31]:
for ent in doc7.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

Africa Data School - ORG - Companies, agencies, institutions, etc.
Nairobi - GPE - Countries, cities, states


# Visualizers

In [32]:
from spacy import displacy

displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})

In [33]:
displacy.render(doc7, style='ent', jupyter=True)

In [62]:
doc = model(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)

# Steamming 

In [35]:

import nltk

from nltk.stem.porter import *

In [36]:
p_stemmer = PorterStemmer()

In [37]:
words = ['run','runner','running','ran','runs','easily','fairly']

In [38]:
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [39]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball Stemmer requires that you pass a language parameter
s_stemmer = SnowballStemmer(language='english')

In [40]:
words = ['run','runner','running','ran','runs','easily','fairly']
# words = ['generous','generation','generously','generate']

In [41]:
for word in words:
    print(word+' --> '+s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


Stemming has its drawbacks. If given the token saw, stemming might always return saw, whereas lemmatization would likely return either see or saw depending on whether the use of the token was as a verb or a noun. As an example, consider the following:

# Lemmatization 

In [44]:
doc1 = model(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [46]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [48]:
doc2 = model(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop words 

In [51]:
print(model.Defaults.stop_words)

{'wherever', 'herself', 'either', 'himself', 'been', 'such', 'up', 'whither', 'further', 'be', 'this', 'him', 'therefore', 'own', 'beside', 'was', 'yourselves', 'every', 'anyone', 'who', 'ever', 'first', 'often', 'during', 'go', 'thereupon', 'before', 'serious', 'when', 'eight', 'along', 'thence', 'say', 'see', 'n’t', 'if', 'must', 'make', '‘s', 'these', 'doing', 'herein', 'side', 'enough', 'become', 'that', 'same', 'have', 'another', 'about', 'are', 'upon', 'into', 'rather', 'thus', '’re', 'throughout', 'whole', 'something', 'ca', 'twenty', 'which', 'more', 'also', 'four', '’d', 'behind', 'he', 'anyhow', 'due', '‘re', 'is', 'among', 'keep', 'hence', '’m', 'whereby', 'became', 'five', 'third', 'latterly', 'above', 'their', 'somehow', 'else', 'whereupon', 'thereby', 'not', 'against', 'around', 'via', 'did', 'sometimes', 'am', 'otherwise', 'empty', 'why', 'least', 'us', 'wherein', 'someone', 'becoming', 'next', 'various', 'each', 'ours', 'just', 'to', 'show', 'six', 'there', 'so', 'using

In [52]:
len(model.Defaults.stop_words)

326

In [54]:
# check  if is a stop  word 
model.vocab['myself'].is_stop

True

In [56]:
# add new stop word 
# Add the word to the set of stop words. Use lowercase!
model.Defaults.stop_words.add('niajewasee')

# Set the stop_word tag on the lexeme
model.vocab['niajewasee'].is_stop = True

In [60]:
len(model.Defaults.stop_words)

326

In [58]:
# Remove the word from the set of stop words
model.Defaults.stop_words.remove('beyond')

# Remove the stop_word tag from the lexeme
model.vocab['beyond'].is_stop = False

In [59]:
nlp.vocab['beyond'].is_stop

False

# Practical exercise ( Text spam )

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('data/smsspamcollection.tsv', sep='\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [3]:
from sklearn.model_selection import train_test_split

X = df['message']  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [4]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [5]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

### Scikit-learn's CountVectorizer
* Text preprocessing, tokenizing and the ability to filter out stopwords are all included in CountVectorizer, which builds a dictionary of features and transforms documents to feature vectors.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(3733, 7082)

In [7]:
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [9]:
#Downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.

# This downscaling is called **tf–idf** for “Term Frequency times Inverse Document Frequency”.

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3733, 7082)

In [10]:
X_train_tfidf

<3733x7082 sparse matrix of type '<class 'numpy.float64'>'
	with 49992 stored elements in Compressed Sparse Row format>

### TfidVectorizer

* Combine the CountVectorizer and TfidTransformer steps into one using TfidVectorizer:

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train) # remember to use the original X_train set
X_train_tfidf.shape

(3733, 7082)

# Modelling 

In [12]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

LinearSVC()

# Pipeline

In [13]:
from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)  

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [14]:
# Form a prediction set
predictions = text_clf.predict(X_test)

In [15]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[1586    7]
 [  12  234]]


In [16]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [17]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.989668297988037
