In [1]:
#importing spacy analysis library
import spacy
#loading the languace dictionary
nlp = spacy.load('en_core_web_sm')

In [2]:
len(nlp.vocab)

478

In [3]:
#assign the word in the vocabulary to an array so that we can use that
words = [t.text for t in nlp.vocab]
print(words[::10])
len(words)

['nuthin', 'N.C.', 'it', 'is', ':-*', 'Ma’am', 'v_v', ":'(", 'Rev.', 'q.', '’’', '<333', 'v.v', "'s", 'somethin', ':-))', '10', '(._.)', "nuthin'", 'Ai', 'Tenn.', ':]', 'Ariz.', 'Ind.', ';-D', 'Conn.', 'Did', "Nothin'", 'was', 'ol', ':’)', 'May', "'ve", 'ಠ_ಠ', 'ä.', 'y.', 'Nebr.', ';)', '’ll', 'goin’', '-_-', 'Sen.', 'O_o', 'r.', 's.', ':-]', "Somethin'", 'Ol']


478

In [4]:
#inserting new element into the vocabulary
doc = nlp("\"Let's go to N.Y!\"")
#if we now look at the length of the vocabulary we will see a growth
len(nlp.vocab)

483

In [5]:
#We can now see what are the new words doing so
words2 = [t.text for t in nlp.vocab]
set(words2)-set(words)

{'!', '"', 'N.Y', 'go', 'to'}

# Iterate over tokenizate elements

In [6]:
for t in doc:
    print(t)

    
#or we can do so by obtaining a list
tokens = list(doc)
print(tokens)

#the doc object can also be indexed i.e doc[5] returns the 5th token
doc[5]
#this is not a string but a Token object, we can get the string by .text
doc[5].text

#we can also use slicing [i:j] to get to the ith element to the jth 
doc[2:4]

#we can not reassign tokens

"
Let
's
go
to
N.Y
!
"
[", Let, 's, go, to, N.Y, !, "]


's go

# Domanda 1

In [7]:
s = "\"Let's go to N.Y!\""
s.split(' ')

['"Let\'s', 'go', 'to', 'N.Y!"']

it is very useful since we cannot divide the verbs if the form is shorted, whereas using spacy we can distinguish the meaning

# Stemming

In [8]:
import nltk
from nltk.stem.porter import *

p_stemmer = PorterStemmer()

words = ['go','goes','went','wish','wishes','wished','runner','ran','running', 'did', 'does']

for w in words:
    print("{} -> {}".format(w, p_stemmer.stem(w)))

go -> go
goes -> goe
went -> went
wish -> wish
wishes -> wish
wished -> wish
runner -> runner
ran -> ran
running -> run
did -> did
does -> doe


# Domanda 2

Stemming doesn't always work because there are some irregular forms wich does not follow grammar rules.

# Lemmatization

In [9]:
st = nlp("I will meet you in the meeting after the meeting after meeting the runner that will run away because his mother ran when he was born and did that so.")
for t in st:
    print("{} -> {}".format(t.text,t.lemma_))

I -> -PRON-
will -> will
meet -> meet
you -> -PRON-
in -> in
the -> the
meeting -> meeting
after -> after
the -> the
meeting -> meeting
after -> after
meeting -> meet
the -> the
runner -> runner
that -> that
will -> will
run -> run
away -> away
because -> because
his -> -PRON-
mother -> mother
ran -> run
when -> when
he -> -PRON-
was -> be
born -> bear
and -> and
did -> do
that -> that
so -> so
. -> .


# Domanda 3
As we can see the lemmatizer is thousand times better than the stemmatizer since it does a sort of analysis and is able to get even the irregular forms

# Stop words

In [10]:
#we can access the stop words of a vocabulary as follows
print(list(nlp.Defaults.stop_words)[:10])

#we can check if a word is a stop word
"the" in nlp.Defaults.stop_words

#to make things faster we can use this sintax
for t in st:
    print("{} -> {}".format(t, t.is_stop))

['when', 'otherwise', 'hers', 'would', 'back', 'call', 'against', 'how', 'towards', 'others']
I -> True
will -> True
meet -> False
you -> True
in -> True
the -> True
meeting -> False
after -> True
the -> True
meeting -> False
after -> True
meeting -> False
the -> True
runner -> False
that -> True
will -> True
run -> False
away -> False
because -> True
his -> True
mother -> False
ran -> False
when -> True
he -> True
was -> True
born -> False
and -> True
did -> True
that -> True
so -> True
. -> False


# Remove stop words

In [11]:
#In order to remove stop words (if we need to) we can use
nlp.Defaults.stop_words.remove('go')

#Also we can add stop words as follows
nlp.Defaults.stop_words.add('!')

# Domanda 4

We want to get rid of very frequent words because, as information theory says, they have very low content of information and thus are not very useful to us

# Distinguish Part Of Speech

In [12]:
for t in st:
    print("{} -> {}".format(t, t.pos_))

I -> PRON
will -> VERB
meet -> VERB
you -> PRON
in -> ADP
the -> DET
meeting -> NOUN
after -> ADP
the -> DET
meeting -> NOUN
after -> ADP
meeting -> VERB
the -> DET
runner -> NOUN
that -> DET
will -> VERB
run -> VERB
away -> ADV
because -> SCONJ
his -> DET
mother -> NOUN
ran -> VERB
when -> ADV
he -> PRON
was -> AUX
born -> VERB
and -> CCONJ
did -> AUX
that -> DET
so -> ADV
. -> PUNCT


In [13]:
#we can obtain more information as
for t in st:
    print("{} -> {}".format(t.text, spacy.explain(t.tag_)))

I -> pronoun, personal
will -> verb, modal auxiliary
meet -> verb, base form
you -> pronoun, personal
in -> conjunction, subordinating or preposition
the -> determiner
meeting -> noun, singular or mass
after -> conjunction, subordinating or preposition
the -> determiner
meeting -> noun, singular or mass
after -> conjunction, subordinating or preposition
meeting -> verb, gerund or present participle
the -> determiner
runner -> noun, singular or mass
that -> wh-determiner
will -> verb, modal auxiliary
run -> verb, base form
away -> adverb
because -> conjunction, subordinating or preposition
his -> pronoun, possessive
mother -> noun, singular or mass
ran -> verb, past tense
when -> wh-adverb
he -> pronoun, personal
was -> verb, past tense
born -> verb, past participle
and -> conjunction, coordinating
did -> verb, past tense
that -> determiner
so -> adverb
. -> punctuation mark, sentence closer


# Domanda 5

The difference is that one is faster to read and less informative, whereas the other is slower but more informative. An useful application for the coarse grained tags is a grammatical analysis program.

# Named Entity Recognition

It allows us to recognize a company, character and so on

In [14]:
cp = nlp("Silvio Berlusconi is to offer EU leaders a historic grand bargain on Italy and Brexit- he is the owner of Amazon.com")
cp. ents

(Silvio Berlusconi, EU, Italy, Amazon.com)

In [15]:
for t in cp.ents:
    print("{} -> {} - {} - {}".format(t, t.text, t.label_, spacy.explain(t.label_)))

Silvio Berlusconi -> Silvio Berlusconi - PERSON - People, including fictional
EU -> EU - ORG - Companies, agencies, institutions, etc.
Italy -> Italy - GPE - Countries, cities, states
Amazon.com -> Amazon.com - ORG - Companies, agencies, institutions, etc.


# Spacy has a visualizer for ents labels

In [16]:
from spacy import displacy
displacy.render(cp, style="ent", jupyter=True)

# Domanda 6

POS and NER are not the same thing. Knowing that a word is a noun and knowing that a word is a noun wich represents a politician, a company or an organization is totatally different and less informative.

# Sentence Segmentation

In [17]:
fr = nlp("I ain't mad. I'mm just asking why the fuck you don't answer fans? You could have signed an autograph for Matthew")
list(fr.sents)

#we can check if a token is the first token of a sequence as follows
for t in fr:
    print("{} -> {}".format(t.text, t.is_sent_start))

I -> True
ai -> None
n't -> None
mad -> None
. -> None
I'mm -> True
just -> None
asking -> None
why -> None
the -> None
fuck -> None
you -> None
do -> None
n't -> None
answer -> None
fans -> None
? -> None
You -> True
could -> None
have -> None
signed -> None
an -> None
autograph -> None
for -> None
Matthew -> None


# Domanda 7

In [18]:
x = "I ain't mad. I'mm just asking why the fuck you don't answer fans? You could have signed an autograph for Matthew"
x.split('.')

["I ain't mad",
 " I'mm just asking why the fuck you don't answer fans? You could have signed an autograph for Matthew"]

As we can see it's not the same thing. It does not just not split the sentence with other punctuation but if we have some number it would have split the phrase giving it no sense

# Bag of Words Representation

In [19]:
#load the csv file with pandas as DataFrame
import pandas as pd

spam = pd.read_csv('spam.csv', encoding='latin-1')

#we are going to use just the first two columns
spam = spam[['v1','v2']]
spam = spam.rename(columns={'v1':'class', 'v2':'text'})
spam.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
#Inspect some information
print(spam.iloc[0]['class'], '----', spam.iloc[0]['text'])

ham ---- Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


# Splitting the dataset into training and test set

In [21]:
#we can split the dataset in order to work with machine learning as follows
from sklearn.model_selection import train_test_split
import numpy as np

np.random.seed(1234) #never put the seed, just for instructionally purpose
train_set, test_set = train_test_split(spam, test_size=0.25) #25% for test set
print(train_set.head())
print(test_set.head())

     class                                               text
5062   ham               Ok i also wan 2 watch e 9 pm show...
39     ham  Hello! How's you and how did saturday go? I wa...
4209   ham  No da:)he is stupid da..always sending like th...
4500   ham                              So wat's da decision?
3578   ham  Multiply the numbers independently and count d...
     class                                               text
1537   ham  All sounds good. Fingers . Makes it difficult ...
963    ham  Yo chad which gymnastics class do you wanna ta...
4421   ham            MMM ... Fuck .... Merry Christmas to me
46     ham      Didn't you get hep b immunisation in nigeria.
581    ham     Ok anyway no need to change with what you said


In [22]:
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
class    5572 non-null object
text     5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [23]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4179 entries, 5062 to 2863
Data columns (total 2 columns):
class    4179 non-null object
text     4179 non-null object
dtypes: object(2)
memory usage: 97.9+ KB


In [24]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1393 entries, 1537 to 4118
Data columns (total 2 columns):
class    1393 non-null object
text     1393 non-null object
dtypes: object(2)
memory usage: 32.6+ KB


In [25]:
print(spam.head())
print('------------------')
print(train_set.head())
print('-------------')
print(test_set.head())

  class                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
------------------
     class                                               text
5062   ham               Ok i also wan 2 watch e 9 pm show...
39     ham  Hello! How's you and how did saturday go? I wa...
4209   ham  No da:)he is stupid da..always sending like th...
4500   ham                              So wat's da decision?
3578   ham  Multiply the numbers independently and count d...
-------------
     class                                               text
1537   ham  All sounds good. Fingers . Makes it difficult ...
963    ham  Yo chad which gymnastics class do you wanna ta...
4421   ham            MMM ... Fuck .... Merry Christmas

# Domanda 8

The indexes have been randomly sorted in order to reduce the possibility that the algorithm will work for every input and not just for those of the train set

# Check how many elements belong to each category

In [26]:
from matplotlib import pyplot as plt
train_set.groupby('class').count().plot.bar()
plt.show()

<Figure size 640x480 with 1 Axes>

# Domanda 9

This is something we have to check in order to be able to give the algorithm a good accuracy because if we have many examples from a class and very low examples of the other class it could say that everything is just from the biggest class

# Domanda 10

x_train is represented as a sparse matrix because we will only have a small amount of non zero elements for each word. If we'd represent the dense matrix we'd need an nxn matrix 

# Domanda 11

This is not a good approach, in fact since our dataset is very unbalanced it could deviate our accuracy. If we'd take a classifier which classifies messages as ham we'd still have a good accuracy since our dataset is unbalanced.

# Domanda 12

We are learning that we have rightly classified 1188 + 130 emails but we got wrong in 75 cases.

# Domanda 14

The 1NN worked better than the 5NN

# Domanda 15

It seems that the MAP works the best for this dataset. The margin is actually large since we have a 10% increase. This means that both recall and precision are higher in value, so the algorithm works better.

# Tokenizating and counting words with CountVectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer() #an object which automatically create a bag of word representation

#we can add the vocabulary using the fit method

count_vect.fit(['this is', 'a list of', 'short messages'])
count_vect.vocabulary_ #a dictionary which maps each word to a unique identifier. It has also removed stop words

#transform text using transform
features = count_vect.transform(['this is', 'a list of', 'short messages'])
# it gives a sparse matrix where each row corresponds to a document and avery index represent the index of the word
features = features.todense()
print('this is', features[0])

this is [[1 0 0 0 0 1]]


In [28]:
train_count = count_vect.fit_transform(train_set['text'])
len(count_vect.vocabulary_)

7398

# KNeighbor and MAP Naive Bayes

In [29]:
#import knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)

In [30]:
knn.fit(train_count, train_set['class'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [31]:
message = test_set.iloc[260]['text']
print([message])

['FREE MSG:We billed your mobile number by mistake from shortcode 83332.Please call 08081263000 to have charges refunded.This call will be free from a BT landline']


In [32]:
x_test = count_vect.transform(test_set['text'])
y_test = knn.predict(x_test)
y_test

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [33]:
acc = knn.score(x_test, test_set['class'])

In [34]:
acc

0.9461593682699211

In [35]:
from sklearn.metrics import f1_score

score = f1_score(test_set['class'], y_test, average=None, labels=['ham','spam'])
score

array([0.96940024, 0.7761194 ])

In [36]:
score.mean()

0.8727598238915582

In [37]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(train_count, train_set['class'])
x_test_nb = nb.predict(x_test)
score_nav = f1_score(test_set['class'], x_test_nb, average=None, labels=['ham','spam'])
score_nav

array([0.99204022, 0.95238095])

In [38]:
score_nav.mean()

0.972210585113811

# Weight words frequency with TF-IDF

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer
c_vect = CountVectorizer()
tf_transformer = TfidfTransformer()

train_count = c_vect.fit_transform(train_set['text'])
tf_train_count = tf_transformer.fit_transform(train_count)

test_count = c_vect.transform(test_set['text'])
tf_test_count = tf_transformer.fit_transform(test_count)

classifier  = KNeighborsClassifier(n_neighbors=1)
classifier.fit(tf_train_count, train_set['class'])

y_tf_pred = classifier.predict(tf_test_count)
f1_scores = f1_score(test_set['class'], y_tf_pred, average=None, labels=['ham', 'spam'])
f1_scores.mean()
f1_scores

array([0.97061224, 0.78571429])

# Domanda 16

In [41]:
from sklearn.feature_extraction.text import TfidfTransformer
c_vect = CountVectorizer()
tf_transformer = TfidfTransformer()

train_count = c_vect.fit_transform(train_set['text'])
tf_train_count = tf_transformer.fit_transform(train_count)

test_count = c_vect.transform(test_set['text'])
tf_test_count = tf_transformer.fit_transform(test_count)

classifier  = MultinomialNB()
classifier.fit(tf_train_count, train_set['class'])

y_tf_pred = classifier.predict(tf_test_count)
f1_scores = f1_score(test_set['class'], y_tf_pred, average=None, labels=['ham', 'spam'])
f1_scores.mean()
f1_scores

array([0.97538966, 0.82758621])

We obtain the best results using the word count with the multinomial classifier

# N-Grams

In [48]:
count_vect = CountVectorizer(ngram_range=(2,2))
count_vect.fit(train_set['text'])
list(count_vect.vocabulary_.keys())[:10]

['ok also',
 'also wan',
 'wan watch',
 'watch pm',
 'pm show',
 'hello how',
 'how you',
 'you and',
 'and how',
 'how did']

In [55]:
count_vect = CountVectorizer(ngram_range=(1,3))

x_train = count_vect.fit_transform(train_set['text'])
x_test = count_vect.transform(test_set['text'])

classifier = MultinomialNB()
classifier.fit(x_train, train_set['class'])
y_pred = classifier.predict(x_test)

f1score = f1_score(test_set['class'], y_test, average=None, labels=['ham','spam'])
f1score

array([0.96940024, 0.7761194 ])

# Custom Tokenization

In [59]:
#sklearn doesn't provide tool for stemming etc. we need to
#create a custo countvectorizer

class POSTokenizer(object):
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
    def __call__(self, doc):
        return [t.pos_ for t in self.nlp(doc)]

In [60]:
tokenizer = POSTokenizer()
tokenizer("Hi motherfucker, how are you?")

['INTJ', 'NOUN', 'PUNCT', 'ADV', 'AUX', 'PRON', 'PUNCT']

In [61]:
# we ca create the countvectorizer as follows

count_vect = CountVectorizer(tokenizer = POSTokenizer())

x_train = count_vect.fit_transform(train_set['text'])
x_test = count_vect.transform(test_set['text'])

classifier = MultinomialNB()
classifier.fit(x_train, train_set['class'])

y_test = classifier.predict(x_test)

f1score = f1_score(test_set['class'], y_test, average=None, labels=['ham','spam'])

f1score

array([0.94273504, 0.69955157])

In [62]:
# we now have 18 features (POS)
x_train.shape

#let's combine those with the previous word counts

(4179, 18)

In [63]:
from scipy.sparse import hstack

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(train_set['text'])
x_train = hstack([x_train_counts, x_train])
print(x_train_counts.shape, x_train.shape)

(4179, 7398) (4179, 7416)


In [None]:
class POSTokenizer(object):
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
    def __call__(self, doc):
        return [t.pos_ for t in self.nlp(doc)]

count_vect = CountVectorizer()
count_vect_pos = CountVectorizer(tokenizer=POSTokenizer())

x_train_count = count_vect.fit_transform(train_set['text']) #FACCIAMO FIT IN MODO DA INSERIRE LE PAROLE NEL VOCABOLARIO
x_train_pos = count_vect_pos.fit_transform(train_set['text'])

x_test_count = count_vect.transform(test_set['text'])
x_test_pos = count_vect_pos.transform(test_set['text'])

x_train = hstack([x_train_count, x_train_pos])
x_test = hstack([x_test_count, x_test_pos])

classifier = MultinomialNB()
classifier.fit(x_train, train_set['class'])

y_test = classifier.predict(x_test)
f1score = f1_score(test_set['class'], y_test, average=None, labels=['ham','spam'])
f1score
