# Série TP 6 - Analyse Sémantique 

## Word Embedding - Vectorisation de mots: techniques traditionnelles (Frequency Based). 


### Imports

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords

from numpy.linalg import norm
from numpy import argmax

### 1 - One-Hot Encoding

#### Text Data

In [2]:
doc1 = "they are playing football"
doc2 = "they are playing cricket"

In [3]:
vocab1 = word_tokenize(doc1)
vocab2 = word_tokenize(doc2)

In [4]:
vocab = list(set(vocab1 + vocab2))
print(vocab)

['they', 'are', 'cricket', 'football', 'playing']


#### Label Encoding - Creating an integer encoding of words

In [5]:
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(vocab)
print(vocab, integer_encoded)

['they', 'are', 'cricket', 'football', 'playing'] [4 0 1 2 3]


In [6]:
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
print(integer_encoded)

[[4]
 [0]
 [1]
 [2]
 [3]]


#### One-Hot Encoding

In [7]:
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

print(onehot_encoded)

[[0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [8]:
df = pd.DataFrame(onehot_encoded, index=vocab)
df

Unnamed: 0,0,1,2,3,4
they,0.0,0.0,0.0,0.0,1.0
are,1.0,0.0,0.0,0.0,0.0
cricket,0.0,1.0,0.0,0.0,0.0
football,0.0,0.0,1.0,0.0,0.0
playing,0.0,0.0,0.0,1.0,0.0


### 2 - Count Vectorizer - Term-Doc Matrix

#### Text Data

In [9]:
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document? No, this is not the first one. That one is.',
]

#### Count Vectorizer

In [10]:
cv = CountVectorizer()
count_vector = cv.fit_transform(corpus)

In [11]:
# Corpus Shape : 5 docs and 43 unique words
count_vector.shape

(4, 12)

In [12]:
# Show resulting vocabulary : the numbers are not counts, they are the position in the sparse vector
cv.vocabulary_

{'this': 11,
 'is': 3,
 'the': 9,
 'first': 2,
 'document': 1,
 'second': 7,
 'and': 0,
 'third': 10,
 'one': 6,
 'no': 4,
 'not': 5,
 'that': 8}

In [13]:
cv.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'no', 'not', 'one', 'second',
       'that', 'the', 'third', 'this'], dtype=object)

In [14]:
count_vector.toarray()

array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1],
       [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1],
       [0, 1, 2, 3, 1, 1, 2, 0, 1, 2, 0, 2]], dtype=int64)

In [15]:
df = pd.DataFrame(count_vector.toarray(), columns=cv.get_feature_names_out())
df

Unnamed: 0,and,document,first,is,no,not,one,second,that,the,third,this
0,0,1,1,1,0,0,0,0,0,1,0,1
1,0,2,0,1,0,0,0,1,0,1,0,1
2,1,0,0,1,0,0,1,0,0,1,1,1
3,0,1,2,3,1,1,2,0,1,2,0,2


#### CountVect without stopwords 

In [16]:
sw = stopwords.words('english')

In [17]:
sw_cv = CountVectorizer(stop_words=sw)
sw_count_vector = sw_cv.fit_transform(corpus)

In [18]:
print(sw_cv.stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
sw_count_vector.shape

(4, 5)

In [20]:
sw_cv.get_feature_names_out()

array(['document', 'first', 'one', 'second', 'third'], dtype=object)

In [21]:
sw_count_vector.toarray()

array([[1, 1, 0, 0, 0],
       [2, 0, 0, 1, 0],
       [0, 0, 1, 0, 1],
       [1, 2, 2, 0, 0]], dtype=int64)

### 3 - N-grams - Term-Doc Matrix

In [22]:
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document? No, this is not the first one. That one is.',
]

#### Bi-grams Only : ngram_range=(2, 2)

In [23]:
cv_ngrams = CountVectorizer(analyzer='word', ngram_range=(2, 2))
count_vector_ngrams = cv_ngrams.fit_transform(corpus)

In [24]:
cv_ngrams.get_feature_names_out()

array(['and this', 'document is', 'document no', 'first document',
       'first one', 'is not', 'is the', 'is this', 'no this', 'not the',
       'one is', 'one that', 'second document', 'that one', 'the first',
       'the second', 'the third', 'third one', 'this document', 'this is',
       'this the'], dtype=object)

In [25]:
count_vector_ngrams.shape

(4, 21)

In [26]:
count_vector_ngrams.toarray()

array([[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 0, 1, 1]],
      dtype=int64)

#### Unigrams and Bi-grams : ngram_range=(1,  2)

In [27]:
cv_ngrams2 = CountVectorizer(analyzer='word', ngram_range=(1, 2)) # unigrams and bi-gram 
count_vector_ngrams2 = cv_ngrams2.fit_transform(corpus)
cv_ngrams2.get_feature_names_out()

array(['and', 'and this', 'document', 'document is', 'document no',
       'first', 'first document', 'first one', 'is', 'is not', 'is the',
       'is this', 'no', 'no this', 'not', 'not the', 'one', 'one is',
       'one that', 'second', 'second document', 'that', 'that one', 'the',
       'the first', 'the second', 'the third', 'third', 'third one',
       'this', 'this document', 'this is', 'this the'], dtype=object)

### 4 - TF-IDF Vectorizer - Term-Doc Matrix

In [28]:
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document? No, this is not the first one. That one is.',
]

In [29]:
tfidf = TfidfVectorizer(norm = None)
tfidf_vector = tfidf.fit_transform(corpus)

In [30]:
tfidf_vector.shape

(4, 12)

In [31]:
# Computation of IDF(term, Corpus) in sklearn - default : idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1
tfidf.idf_

array([1.91629073, 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.91629073, 1.51082562, 1.91629073, 1.91629073, 1.        ,
       1.91629073, 1.        ])

In [32]:
tfidf.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'no', 'not', 'one', 'second',
       'that', 'the', 'third', 'this'], dtype=object)

In [33]:
for word, idf in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    print(word, ':', idf)

and : 1.916290731874155
document : 1.2231435513142097
first : 1.5108256237659907
is : 1.0
no : 1.916290731874155
not : 1.916290731874155
one : 1.5108256237659907
second : 1.916290731874155
that : 1.916290731874155
the : 1.0
third : 1.916290731874155
this : 1.0


In [34]:
tfidf_vector.toarray()

array([[0.        , 1.22314355, 1.51082562, 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 1.        ],
       [0.        , 2.4462871 , 0.        , 1.        , 0.        ,
        0.        , 0.        , 1.91629073, 0.        , 1.        ,
        0.        , 1.        ],
       [1.91629073, 0.        , 0.        , 1.        , 0.        ,
        0.        , 1.51082562, 0.        , 0.        , 1.        ,
        1.91629073, 1.        ],
       [0.        , 1.22314355, 3.02165125, 3.        , 1.91629073,
        1.91629073, 3.02165125, 0.        , 1.91629073, 2.        ,
        0.        , 2.        ]])

In [35]:
df = pd.DataFrame(tfidf_vector.toarray(), columns=tfidf.get_feature_names_out())
df

Unnamed: 0,and,document,first,is,no,not,one,second,that,the,third,this
0,0.0,1.223144,1.510826,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,2.446287,0.0,1.0,0.0,0.0,0.0,1.916291,0.0,1.0,0.0,1.0
2,1.916291,0.0,0.0,1.0,0.0,0.0,1.510826,0.0,0.0,1.0,1.916291,1.0
3,0.0,1.223144,3.021651,3.0,1.916291,1.916291,3.021651,0.0,1.916291,2.0,0.0,2.0


#### Hand computation : tf.idf of the term 'first' in doc1

In [36]:
TF_first_doc4 = 2 # term 'first' in the 4th doc
print(TF_first_doc4)

2


In [37]:
from math import log
IDF_first_corpus = log ( (1 + 4) / (1 + 2) ) + 1   # idf(t) = log [ (1 + N) / (1 + df(t)) ] + 1
print(IDF_first_corpus, 'or same as : ', tfidf.idf_[2])

1.5108256237659907 or same as :  1.5108256237659907


In [38]:
TF_IDF_first_doc4 = TF_first_doc4 * IDF_first_corpus
print(TF_IDF_first_doc4, ' same as ', tfidf_vector.toarray()[3, 2])

3.0216512475319814  same as  3.0216512475319814


### 5 - Term-Term Co-occurrence Matrix

In [39]:
from collections import defaultdict

def co_occurrence(sentences, window_size):
    d = defaultdict(int)
    vocab = set()
    for text in sentences:
        # preprocessing (use tokenizer instead)
        text = text.lower().split()
        # iterate over sentences
        for i in range(len(text)):
            token = text[i]
            vocab.add(token)  # add to vocab
            next_token = text[i+1 : i+1+window_size]
            for t in next_token:
                key = tuple( sorted([t, token]) )
                d[key] += 1
    
    # formulate the dictionary into dataframe
    vocab = sorted(vocab) # sort vocab
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16),
                      index=vocab,
                      columns=vocab)
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df

In [40]:
docs = ['I like deep learning',
        'I like NLP',
        'I enjoy flying']
df = co_occurrence(docs, 1)
df

Unnamed: 0,deep,enjoy,flying,i,learning,like,nlp
deep,0,0,0,0,1,1,0
enjoy,0,0,1,1,0,0,0
flying,0,1,0,0,0,0,0
i,0,1,0,0,0,2,0
learning,1,0,0,0,0,0,0
like,1,0,0,2,0,0,1
nlp,0,0,0,0,0,1,0


In [41]:
docs = ['He is not lazy He is intelligent He is smart']
df = co_occurrence(docs, 2)
df

Unnamed: 0,he,intelligent,is,lazy,not,smart
he,0,2,4,1,2,1
intelligent,2,0,2,0,0,0
is,4,2,0,2,1,1
lazy,1,0,2,0,1,0
not,2,0,1,1,0,0
smart,1,0,1,0,0,0


### Cosine Similarity

In [42]:
corpus = [
     'This is the first document.',
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document? No, this is not the first one. That one is.',
]

In [43]:
tfidf = TfidfVectorizer(norm = None)
tfidf_vector = tfidf.fit_transform(corpus)

In [44]:
df = pd.DataFrame(tfidf_vector.toarray(), columns=tfidf.get_feature_names_out())
df

Unnamed: 0,and,document,first,is,no,not,one,second,that,the,third,this
0,0.0,1.182322,1.405465,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,1.182322,1.405465,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,2.364643,0.0,1.0,0.0,0.0,0.0,2.098612,0.0,1.0,0.0,1.0
3,2.098612,0.0,0.0,1.0,0.0,0.0,1.693147,0.0,0.0,1.0,2.098612,1.0
4,0.0,1.182322,2.81093,3.0,2.098612,2.098612,3.386294,0.0,2.098612,2.0,0.0,2.0


#### Between two documents

In [45]:
doc1_vect = tfidf_vector.toarray()[0]
doc2_vect = tfidf_vector.toarray()[1]

In [46]:
cosine = np.dot(doc1_vect, doc2_vect) / (norm(doc1_vect) * norm(doc2_vect))
print("Cosine Similarity:", cosine)

Cosine Similarity: 1.0


In [47]:
doc5_vect = tfidf_vector.toarray()[4]

cosine = np.dot(doc1_vect, doc5_vect) / (norm(doc1_vect) * norm(doc5_vect))
print("Cosine Similarity:", cosine)

Cosine Similarity: 0.685081287716366


#### Between two words

In [48]:
vect_of_the = tfidf_vector.toarray()[:, 9]
vect_of_first = tfidf_vector.toarray()[:, 2]

cosine = np.dot(vect_of_the, vect_of_first) / (norm(vect_of_the) * norm(vect_of_first))
print("Cosine Similarity between the and first:", cosine)

cosine = np.dot(vect_of_first, vect_of_first) / (norm(vect_of_first) * norm(vect_of_first))
print("Cosine Similarity between first and itself:", cosine)

Cosine Similarity between the and first: 0.8660254037844386
Cosine Similarity between first and itself: 1.0000000000000002
