In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: four short English sentences, one document per list entry.
corpus = [
    "All my cats in a row",
    "When my cat sits down, she looks like a Furby toy!",
    "The cat from outer space",
    "Sunshine loves to sit like this for some reason.",
]

In [254]:
# CountVectorizer converts a collection of text documents into a matrix of token counts.
# Without an a-priori dictionary (vocabulary=None) and without an analyzer that performs
# some kind of feature selection, the number of features equals the vocabulary size
# found by analyzing the data.
vectorizer = CountVectorizer()
# Bare expression: the notebook displays the estimator's repr with its parameters.
vectorizer


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [255]:
# Learn a vocabulary dictionary of all tokens in the raw documents.
# As the last expression of the cell, the call's result is displayed: the output is
# the repr of a CountVectorizer.
vectorizer.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [256]:
# Access the result of the tokenization: a dict mapping each token to its column
# index in the document-term matrix.
# In the displayed result all tokens are lowercase and the one-letter word "a" is
# absent (compare lowercase=True and token_pattern in the estimator's repr).
vectorizer.vocabulary_

{'all': 0,
 'my': 11,
 'cats': 2,
 'in': 7,
 'row': 14,
 'when': 25,
 'cat': 1,
 'sits': 17,
 'down': 3,
 'she': 15,
 'looks': 9,
 'like': 8,
 'furby': 6,
 'toy': 24,
 'the': 21,
 'from': 5,
 'outer': 12,
 'space': 19,
 'sunshine': 20,
 'loves': 10,
 'to': 23,
 'sit': 16,
 'this': 22,
 'for': 4,
 'some': 18,
 'reason': 13}

In [257]:
# Transform the documents into a document-term matrix (one row per document).
vector = vectorizer.transform(corpus)
# Displayed as a SciPy sparse matrix: only the non-zero entries are stored.
vector

<4x26 sparse matrix of type '<class 'numpy.int64'>'
	with 29 stored elements in Compressed Sparse Row format>

In [258]:
# Summarize the encoded matrix: shape is (number of documents, vocabulary size).
print(vector.shape)
# --> 4 documents and 26 tokens
# --> a vector of 26 elements (or features) for each document (4 documents in total)
# --> a matrix of 4 rows and 26 columns
# The printed type shows which sparse-matrix class transform() produced.
print(type(vector))

(4, 26)
<class 'scipy.sparse.csr.csr_matrix'>


In [259]:
# Dense array representation of the sparse matrix.
# Position in each row = index (id) of the token in vectorizer.vocabulary_.
vector.toarray()

array([[1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 1],
       [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 1, 0, 0]])

In [8]:
# Apply the same, already fitted vectorizer to new input.
# In the displayed result only columns 1 ("cat") and 21 ("the") are set: the words
# "cute" and "kitty" are not in the learned vocabulary and contribute nothing.
corpus2 = ["the cute kitty cat"]
vectorizer.transform(corpus2).toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0]])

In [9]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances


In [233]:
# Fit the vectorizer on `corpus` and encode it in a single call. Printing the sparse
# result lists one "(row, column)  value" line per stored element.
# NOTE(review): the saved output below contains float weights and a row index 4
# (i.e. five documents), which does not match the CountVectorizer and the 4-document
# corpus defined earlier in this notebook. It looks like this cell was executed after
# the TF-IDF cell further down rebound `vectorizer` and `corpus` (execution count 233
# vs. 204) - confirm, then restart the kernel and run all cells in order.
features = vectorizer.fit_transform(corpus)
print(features)


  (0, 1)	0.6279137616509933
  (0, 10)	0.7782829228046183
  (1, 0)	0.2965698850220162
  (1, 12)	0.4428321995085722
  (1, 6)	0.4428321995085722
  (1, 5)	0.35727423026525224
  (1, 2)	0.4428321995085722
  (1, 15)	0.4428321995085722
  (2, 0)	0.42799292268317357
  (2, 8)	0.6390704413963749
  (2, 13)	0.6390704413963749
  (3, 5)	0.3741047724501572
  (3, 14)	0.4636932227319092
  (3, 7)	0.4636932227319092
  (3, 11)	0.4636932227319092
  (3, 9)	0.4636932227319092
  (4, 1)	0.45827018116532225
  (4, 0)	0.38040564760664297
  (4, 4)	0.5680140774328015
  (4, 3)	0.5680140774328015


In [236]:
# Same fit-and-encode call, but the sparse result is converted to a dense matrix, so
# every cell (zeros included) is materialized and printed as a full table.
# NOTE(review): the distance loops further down index and iterate `features` and pass
# the pieces straight to sklearn's pairwise functions; presumably they rely on
# todense() yielding 2-D rows - verify before replacing it with toarray().
features = vectorizer.fit_transform(corpus).todense()
print(features)

[[0.         0.62791376 0.         0.         0.         0.
  0.         0.         0.         0.         0.77828292 0.
  0.         0.         0.         0.        ]
 [0.29656989 0.         0.4428322  0.         0.         0.35727423
  0.4428322  0.         0.         0.         0.         0.
  0.4428322  0.         0.         0.4428322 ]
 [0.42799292 0.         0.         0.         0.         0.
  0.         0.         0.63907044 0.         0.         0.
  0.         0.63907044 0.         0.        ]
 [0.         0.         0.         0.         0.         0.37410477
  0.         0.46369322 0.         0.46369322 0.         0.46369322
  0.         0.         0.46369322 0.        ]
 [0.38040565 0.45827018 0.         0.56801408 0.56801408 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]]


# Expliquer la différence entre
`vectorizer.fit_transform(corpus)`
<br>
`vectorizer.fit_transform(corpus).todense()`

In [260]:
# Compare every ordered pair of document vectors with three pairwise measures.
for i in range(len(features)):
    reference = features[i]
    for counter, f in enumerate(features):
        pair = (i, counter)
        print("euclidean distance between vector {} and vector {}".format(*pair))
        print(euclidean_distances(reference, f))
        print("cosine similarity between vector {} and vector {}".format(*pair))
        print(cosine_similarity(reference, f))
        print("manhattan distance between vector {} and vector {}".format(*pair))
        print(manhattan_distances(reference, f))
        print("----")

euclidean distance between vector 0 and vector 0
[[0.]]
cosine similarity between vector 0 and vector 0
[[1.]]
manhattan distance between vector 0 and vector 0
[[0.]]
----
euclidean distance between vector 0 and vector 1
[[1.41421356]]
cosine similarity between vector 0 and vector 1
[[0.]]
manhattan distance between vector 0 and vector 1
[[3.8313696]]
----
euclidean distance between vector 0 and vector 2
[[1.41421356]]
cosine similarity between vector 0 and vector 2
[[0.]]
manhattan distance between vector 0 and vector 2
[[3.11233049]]
----
euclidean distance between vector 0 and vector 3
[[1.41421356]]
cosine similarity between vector 0 and vector 3
[[0.]]
manhattan distance between vector 0 and vector 3
[[3.63507435]]
----
euclidean distance between vector 0 and vector 4
[[1.19352071]]
cosine similarity between vector 0 and vector 4
[[0.28775415]]
manhattan distance between vector 0 and vector 4
[[2.46436031]]
----
euclidean distance between vector 1 and vector 0
[[1.41421356]]
cosin

#### Rappel
##### Euclidean distance : dissimilarity measure
On a deux points dans un espace donné : quel est le chemin le plus court pour aller d'un point à l'autre ? => Une droite.
Cette droite, c'est la distance euclidienne : elle permet de savoir à quel point deux points sont éloignés l'un de l'autre.
Pour calculer la longueur de cette droite, on prend la racine carrée de la somme des carrés des différences entre les coordonnées des deux points.

<br>
exemple
<br>
On a deux points q et p
<br>
q = (1,3)
<br>
p = (2,5)
<br>
$$
\begin{aligned}
\mathrm{dist}(q, p) &= \sqrt{(q_1 - p_1)^2 + (q_2 - p_2)^2} \\
                    &= \sqrt{(1 - 2)^2 + (3 - 5)^2} \\
                    &= \sqrt{(-1)^2 + (-2)^2} \\
                    &= \sqrt{1 + 4} \\
                    &= \sqrt{5} \approx 2.24
\end{aligned}
$$
<br>
Autre exemple avec 3 dimensions pour chaque point :
<br>
$$
\begin{aligned}
\mathrm{dist}\big((2, -1, 3), (-2, 2, 0)\big) &= \sqrt{(2 - (-2))^2 + ((-1) - 2)^2 + (3 - 0)^2} \\
                                              &= \sqrt{(2 + 2)^2 + (-1 - 2)^2 + (3 - 0)^2} \\
                                              &= \sqrt{4^2 + (-3)^2 + 3^2} \\
                                              &= \sqrt{16 + 9 + 9} \\
                                              &= \sqrt{34} \approx 5.83
\end{aligned}
$$

# Même chose pour manhattan et cosine
définitions et exemples

In [261]:
from math import sqrt # => pour square root

def euclidian_dist(p1, p2):
    """Return the Euclidean (straight-line) distance between two points.

    Args:
        p1: Iterable with the numeric coordinates of the first point.
        p2: Iterable with the numeric coordinates of the second point; it must
            have as many coordinates as ``p1``.

    Returns:
        float: square root of the sum of the squared coordinate differences.

    Raises:
        ValueError: if the two points do not have the same number of coordinates.
    """
    # Materialize first so that one-shot iterables can be measured and then paired.
    coords1, coords2 = tuple(p1), tuple(p2)
    if len(coords1) != len(coords2):
        # zip() would silently drop the extra coordinates and return a wrong distance.
        raise ValueError(
            "points must have the same number of coordinates: {} != {}".format(
                len(coords1), len(coords2)
            )
        )
    return sqrt(sum((a - b) ** 2 for a, b in zip(coords1, coords2)))

# A pair of 2-D points, then a pair of 3-D points, measured with euclidian_dist().
plot1 = (1, 3)
plot2 = (2, 5)
distance_2d = euclidian_dist(plot1, plot2)
print(distance_2d)  # 2.23606797749979
distance_3d = euclidian_dist((2, -1, 3), (-2, 2, 0))
print(distance_3d)  # 5.830951894845301

print('---------------')
# Equivalent to the calls above, with the formula written out per coordinate.
dx, dy = plot1[0] - plot2[0], plot1[1] - plot2[1]
print(sqrt(dx ** 2 + dy ** 2))  # 2.23606797749979
squared_diffs = [(2 - (-2)) ** 2, ((-1) - 2) ** 2, (3 - 0) ** 2]
print(sqrt(sum(squared_diffs)))  # 5.830951894845301


2.23606797749979
5.830951894845301
---------------
2.23606797749979
5.830951894845301


# Même chose pour manhattan et cosine

fonctions from scratch


In [278]:
from sklearn.metrics.pairwise import euclidean_distances
from scipy import spatial
import numpy

# Compute the Euclidean distance for every ordered pair of document vectors with three
# libraries (scikit-learn, SciPy, NumPy) and print each value next to its Python type.
for i in range(len(features)):
    for counter, f in enumerate(features):
        # Each distance is computed once and reused for both the value and its type.
        print("euclidean distance between vector {} and vector {} using sklearn".format(i,counter))
        sklearn_dist = euclidean_distances(features[i], f)
        print(sklearn_dist)
        print(type(sklearn_dist))

        print("euclidean distance between vector {} and vector {} using scipy".format(i,counter))
        # SciPy's euclidean() is documented for 1-D vectors, while the rows used here are
        # also handed to sklearn's pairwise API (which works on 2-D input). Flatten them
        # for SciPy; this is a no-op if the rows are already 1-D.
        scipy_dist = spatial.distance.euclidean(
            numpy.asarray(features[i]).ravel(), numpy.asarray(f).ravel()
        )
        print(scipy_dist)
        print(type(scipy_dist))

        print("euclidean distance between vector {} and vector {} using numpy".format(i,counter))
        numpy_dist = numpy.sqrt(numpy.sum((numpy.array(features[i]) - numpy.array(f)) ** 2))
        print(numpy_dist)
        print(type(numpy_dist))

        # Exercise material, intentionally left commented out:
        ####### print("euclidean distance between vector {} and vector {} using math".format(i,counter))
        #print(sqrt(sum(((features[i]-f)**2))))

        #print("*** math sum ***")
        #print(sum(((numpy.array(features[i])-numpy.array(f))**2)))

        #print("*** numpy sum ***")
        #print(numpy.sum(((numpy.array(features[i])-numpy.array(f))**2)))

        print("----")
        
        
      
        
        
        

euclidean distance between vector 0 and vector 0 using sklearn
[[0.]]
<class 'numpy.ndarray'>
euclidean distance between vector 0 and vector 0 using scipy
0.0
<class 'float'>
euclidean distance between vector 0 and vector 0 using numpy
0.0
<class 'numpy.float64'>
----
euclidean distance between vector 0 and vector 1 using sklearn
[[1.41421356]]
<class 'numpy.ndarray'>
euclidean distance between vector 0 and vector 1 using scipy
1.414213562373095
<class 'float'>
euclidean distance between vector 0 and vector 1 using numpy
1.414213562373095
<class 'numpy.float64'>
----
euclidean distance between vector 0 and vector 2 using sklearn
[[1.41421356]]
<class 'numpy.ndarray'>
euclidean distance between vector 0 and vector 2 using scipy
1.414213562373095
<class 'float'>
euclidean distance between vector 0 and vector 2 using numpy
1.414213562373095
<class 'numpy.float64'>
----
euclidean distance between vector 0 and vector 3 using sklearn
[[1.41421356]]
<class 'numpy.ndarray'>
euclidean distance 

# Bonus
Expliquer et corriger l'erreur produite par la ligne
`print(sqrt(sum(((numpy.array(features[i])-numpy.array(f))**2))))`

# Même chose pour manhattan et cosine
utilisation de librairies 

In [204]:
# TF-IDF: weights a word by its occurrences in a document and by how rare that word
# is across the corpus.
# IDF = the ratio between the number of documents in the corpus and the number of
#      documents of the corpus in which the word appears,
#      followed by the logarithm of that ratio.
#      If a word appears in all n texts, the ratio is n/n = 1 and ln(1) = 0,
#      so terms that appear everywhere are neutralized.
# NOTE(review): the idf_ values printed by this cell are all >= 1 (e.g. 1.405 for
#      "cat", which occurs in 3 of the 5 documents), so the estimator evidently applies
#      a smoothed/offset variant of the plain formula above - check the
#      TfidfVectorizer documentation.

# TfidfVectorizer: instead of a vector of token counts, each document becomes a vector
# holding the tf-idf score of every word.
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this rebinds `corpus` (now five documents) for every cell executed afterwards.
corpus = [
'All my cats in a row',
'When my cat sits down, she looks like a Furby toy!',
'The cat from outer space',
'Sunshine loves to sit like this for some reason.',
'All kittens are cats but not every cat is a kitten.'
]

# NOTE: this also rebinds `vectorizer`, which was a CountVectorizer in earlier cells.
vectorizer = TfidfVectorizer()
# Learn the vocabulary and the per-token idf weights from the corpus.
vectorizer.fit(corpus)
# token -> column index
print(vectorizer.vocabulary_)
# one idf weight per vocabulary column
print(vectorizer.idf_)
# Encode the documents as tf-idf weighted rows.
vector = vectorizer.transform(corpus)
# (number of documents, vocabulary size)
print(vector.shape)
# Dense view of the tf-idf matrix.
print(vector.toarray())

{'all': 0, 'my': 17, 'cats': 4, 'in': 10, 'row': 21, 'when': 32, 'cat': 3, 'sits': 24, 'down': 5, 'she': 22, 'looks': 15, 'like': 14, 'furby': 9, 'toy': 31, 'the': 28, 'from': 8, 'outer': 19, 'space': 26, 'sunshine': 27, 'loves': 16, 'to': 30, 'sit': 23, 'this': 29, 'for': 7, 'some': 25, 'reason': 20, 'kittens': 13, 'are': 1, 'but': 2, 'not': 18, 'every': 6, 'is': 11, 'kitten': 12}
[1.69314718 2.09861229 2.09861229 1.40546511 1.69314718 2.09861229
 2.09861229 2.09861229 2.09861229 2.09861229 2.09861229 2.09861229
 2.09861229 2.09861229 1.69314718 2.09861229 2.09861229 1.69314718
 2.09861229 2.09861229 2.09861229 2.09861229 2.09861229 2.09861229
 2.09861229 2.09861229 2.09861229 2.09861229 2.09861229 2.09861229
 2.09861229 2.09861229 2.09861229]
(5, 33)
[[0.40580082 0.         0.         0.         0.40580082 0.
  0.         0.         0.         0.         0.50297966 0.
  0.         0.         0.         0.         0.         0.40580082
  0.         0.         0.         0.50297966 0. 

# Commenter les lignes précédentes et expliquer les prints

# Définition word2vec
(pas de copier-coller d'internet)

In [279]:
from gensim.models import word2vec

# Five-sentence toy corpus (rebinds `corpus`).
corpus = [
'All my cats in a row',
'When my cat sits down, she looks like a Furby toy!',
'The cat from outer space',
'Sunshine loves to sit like this for some reason.',
'All kittens are cats but not every cat is a kitten.'
]
# Naive whitespace tokenization: case and punctuation are kept as-is, so tokens such
# as "down," "toy!" or "kitten." include their punctuation and "All" stays capitalized.
corpus_tokens = [sentence.split() for sentence in corpus]
# Build a Word2Vec model from the token lists. min_count=1 presumably keeps words that
# occur only once (verify against the gensim documentation); other settings are defaults.
model = word2vec.Word2Vec(corpus_tokens, min_count=1)
# Prints a one-line summary of the model (vocabulary size, vector size, alpha).
print(model)

Word2Vec(vocab=34, size=100, alpha=0.025)


# Comment fonctionne word2vec dans gensim ?
 Que fait `word2vec.Word2Vec()` ?
 <br>
 Quels sont les paramètres de `word2vec.Word2Vec()` ?
 <br>
 Expliquer le résultat de `print(model)`.


In [277]:
# Look up the vectors stored in the trained model for the tokens 'cat' and 'cats'.
vector1 = model.wv['cat']
vector2 = model.wv['cats']
# Exercise material, intentionally left commented out:
#print(vector1)
#print('---')
#print(vector2)
#print("euclidean distance between vector {} and vector {} using sklearn".format("vector1","vector2"))
#print(euclidean_distances(vector1, vector2))
# Hint:
# (row, column)

# Expliquer tout le code
Que fait `model.wv[]` ?
<br>
Qu'affichent `print(vector1)` et `print(vector2)` ?
<br>
Expliquer l'erreur provoquée par `euclidean_distances(vector1, vector2)`.
<br>
Corriger l'erreur.
<br>
Utiliser manhattan et cosine.