## Coding Exercise #0710

In [1]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import pairwise_distances

### 1. TF IDF representation:

#### 1.1. Create a TF IDF matrix: 

In [29]:
# Toy corpus: seven short English sentences, one document per entry.
my_docs = [
    "The economic slowdown is becoming more severe",
    "The movie was simply awesome",
    "I like cooking my own food",
    "Samsung is announcing a new technology",
    "Machine Learning is an example of awesome technology",
    "All of us were excited at the movie",
    "We have to do more to reverse the economic slowdown",
]

In [30]:
# Minimal pre-processing: lower-case every document.
my_docs = list(map(str.lower, my_docs))

In [31]:
my_docs #docs converted to lower

['the economic slowdown is becoming more severe',
 'the movie was simply awesome',
 'i like cooking my own food',
 'samsung is announcing a new technology',
 'machine learning is an example of awesome technology',
 'all of us were excited at the movie',
 'we have to do more to reverse the economic slowdown']

TfidfVectorizer() arguments: <br>
- *max_features* : maximum number of features (distinct words). <br>
- *min_df* : The minimum DF. Integer value means count and real number (0~1) means proportion. <br> 
- *max_df* : The maximum DF. Integer value means count and real number (0~1) means proportion. Helps to filter out the stop words. <br> 

In [32]:
# Make sure the NLTK stop-word corpus is installed: on a fresh environment
# stopwords.words('english') raises LookupError if it has never been downloaded.
nltk.download('stopwords', quiet=True)

# TF-IDF vectorizer: drop English stop words, keep terms that occur in at least
# 1 and at most 3 documents, and cap the vocabulary at 10 terms.
vectorizer = TfidfVectorizer(max_features = 10, min_df = 1, max_df = 3, stop_words = stopwords.words('english'))

In [33]:
vectorizer

TfidfVectorizer(max_df=3, max_features=10,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [34]:
# Fit the vectorizer on the corpus and turn the resulting sparse matrix
# into a dense NumPy array via toarray().
X = (
    vectorizer
    .fit_transform(my_docs)
    .toarray()
)

In [35]:
# Output the features.
print(vectorizer.get_feature_names_out())

['announcing' 'awesome' 'economic' 'movie' 'reverse' 'samsung' 'severe'
 'simply' 'slowdown' 'technology']


In [36]:
# Size of the X matrix (m x n).
X.shape

(7, 10)

In [37]:
X

array([[0.        , 0.        , 0.53828134, 0.        , 0.        ,
        0.        , 0.64846464, 0.        , 0.53828134, 0.        ],
       [0.        , 0.53828134, 0.        , 0.53828134, 0.        ,
        0.        , 0.        , 0.64846464, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.60981929, 0.        , 0.        , 0.        , 0.        ,
        0.60981929, 0.        , 0.        , 0.        , 0.50620239],
       [0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.53828134, 0.        , 0.64846464,
        0.        , 0.        , 0.        , 0.53828134, 0.        ]])

In [38]:
# Output a vector corresponding to a document.
print(X[0])

[0.         0.         0.53828134 0.         0.         0.
 0.64846464 0.         0.53828134 0.        ]


#### 1.2. Calculate the cosine similarity:

In [12]:
# The cosine similarity matrix: similarity = 1 - cosine distance.
# Round *after* the subtraction: subtracting an already-rounded distance can
# leave float artefacts in the result (e.g. 1 - 0.9 == 0.09999999999999998).
# Note: row 2 of X is all zeros (none of its words made it into the vocabulary),
# so its only non-zero similarity is the diagonal entry.
np.round(1 - pairwise_distances(X, metric="cosine"), 3)

array([[1.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.579],
       [0.   , 1.   , 0.   , 0.   , 0.381, 0.538, 0.   ],
       [0.   , 0.   , 1.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 1.   , 0.358, 0.   , 0.   ],
       [0.   , 0.381, 0.   , 0.358, 1.   , 0.   , 0.   ],
       [0.   , 0.538, 0.   , 0.   , 0.   , 1.   , 0.   ],
       [0.579, 0.   , 0.   , 0.   , 0.   , 0.   , 1.   ]])

In [13]:
# Cosine similarity between documents 0 and 6 as a plain dot product.
# This equals the cosine only because the non-zero rows of X have unit L2 norm
# (TfidfVectorizer normalises rows with norm='l2' by default).
# X[6] is 1-D, so no transpose is needed.
np.dot(X[0], X[6])

0.5794936078209331

In [14]:
# Cosine similarity between documents 3 and 4 as a plain dot product.
# This equals the cosine only because the non-zero rows of X have unit L2 norm
# (TfidfVectorizer normalises rows with norm='l2' by default).
# X[4] is 1-D, so no transpose is needed.
np.dot(X[3], X[4])

0.35793913951147677

#### Sample

In [15]:
# Second toy corpus, used for the bag-of-words (CountVectorizer) examples below.
# CountVectorizer itself is imported in the notebook's top import cell.
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?']

In [16]:
# Bag-of-words vectorizer with default settings.
# NOTE: this rebinds `vectorizer`, replacing the TfidfVectorizer created in
# section 1.1; earlier cells re-run after this one will see this object instead.
vectorizer = CountVectorizer()

In [17]:
vectorizer

CountVectorizer()

In [18]:
# Learn the vocabulary from `corpus` and build the document-term count matrix
# (left in sparse form here).
# NOTE: this rebinds `X`, so the TF-IDF matrix from section 1.1 is no longer
# reachable under that name.
X = vectorizer.fit_transform(corpus)
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [19]:
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [20]:
print(X.toarray()) 

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [22]:
# Count vector of the first sentence. Columns follow the order of
# vectorizer.get_feature_names_out(): 'and' does not occur in the sentence -> 0,
# 'document', 'first' and 'is' do occur -> 1, and so on for the remaining terms.
print(X[0].toarray())

[[0 1 1 1 0 0 1 0 1]]


In [23]:
# Word-level vectorizer restricted to bigrams: ngram_range=(2, 2) keeps only
# sequences of exactly two consecutive words as features.
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
vectorizer2

CountVectorizer(ngram_range=(2, 2))

In [24]:
# Fit the bigram vectorizer on the same corpus; the result is a sparse
# document-by-bigram count matrix.
X2 = vectorizer2.fit_transform(corpus)
X2

<4x13 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [25]:
vectorizer2.get_feature_names_out()

array(['and this', 'document is', 'first document', 'is the', 'is this',
       'second document', 'the first', 'the second', 'the third',
       'third one', 'this document', 'this is', 'this the'], dtype=object)

In [26]:
print(X2.toarray())

[[0 0 1 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 1]]
