In [1]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer


In [15]:
# Toy corpus: three short texts; the third combines the first two and adds more words.
text_list = [
    "The sun is shining",
    "The weather is sweet",
    "The sun is shining, the weather is sweet and one and one is two",
]

# Keep the individual texts available under their own names as well.
text_1, text_2, text_3 = text_list

text_list

['The sun is shining',
 'The weather is sweet',
 'The sun is shining, the weather is sweet and one and one is two']

## Bag-of-words model
### Two steps:
1. Build a vocabulary of the unique words that appear across the whole set of texts.
2. Construct a feature vector for each text that contains the number of times each vocabulary word occurs in it.

In [3]:
# Step 1: build the vocabulary of unique words with a default CountVectorizer
count = CountVectorizer().fit(text_list)

# Step 2: encode every text as a row of word counts over that vocabulary
# (fit followed by transform yields the same matrix as fit_transform)
doc = count.transform(text_list)

### Print the number of unique words in the list of texts

In [4]:
# Vocabulary learned by the fitted CountVectorizer; reused below as column labels.
col_names = count.get_feature_names_out()

# Report the vocabulary size, then display the words themselves.
print(f'The number of unique words is {len(col_names)}')
col_names

The number of unique words is 9


array(['and', 'is', 'one', 'shining', 'sun', 'sweet', 'the', 'two',
       'weather'], dtype=object)

### Print the feature vectors (the count of each unique word in each text)

In [5]:
# Materialise the bag-of-words matrix as a dense array:
# one row per text, one column per vocabulary word.
vectorised_text = doc.toarray()

vectorised_text

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]], dtype=int64)

### Visualize in a pandas DataFrame

In [6]:
# Tabulate the raw word counts: one row per text, one column per vocabulary word.
# Row labels (text_1, text_2, ...) are generated from the number of rows instead
# of being hard-coded, so the cell keeps working if texts are added or removed.
df1 = pd.DataFrame(
    vectorised_text,
    columns=col_names,
    index=[f'text_{i}' for i in range(1, len(vectorised_text) + 1)],
)
df1

Unnamed: 0,and,is,one,shining,sun,sweet,the,two,weather
text_1,0,1,0,1,1,0,1,0,0
text_2,0,1,0,0,0,1,1,0,1
text_3,2,3,2,1,1,1,2,1,1


In [17]:
# TfidfVectorizer goes from raw text straight to tf-idf weights in one estimator.
tfidf_vect = TfidfVectorizer()

# Fit on the texts and encode them in one call.
x = tfidf_vect.fit_transform(raw_documents=text_list)

In [19]:
# Show the tf-idf weights as a dense array. Round explicitly so the displayed
# precision does not depend on the global np.set_printoptions(precision=2)
# call, which only appears in a later cell of this notebook.
x.toarray().round(2)

array([[0.  , 0.43, 0.  , 0.56, 0.56, 0.  , 0.43, 0.  , 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.  , 0.56, 0.43, 0.  , 0.56],
       [0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19]])

In [20]:
# Tabulate the tf-idf weights produced by TfidfVectorizer.
# - Column labels come from the fitted tfidf_vect itself, so they are guaranteed
#   to match the columns of x (rather than borrowing the CountVectorizer's names).
# - Row labels are generated from the number of rows instead of being hard-coded.
df2 = pd.DataFrame(
    x.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
    index=[f'text_{i}' for i in range(1, x.shape[0] + 1)],
)

# Display rounded to two decimals; df2 itself keeps full precision. This avoids
# relying on pd.options.display.float_format, which is only set in a later cell.
df2.round(2)

Unnamed: 0,and,is,one,shining,sun,sweet,the,two,weather
text_1,0.0,0.43,0.0,0.56,0.56,0.0,0.43,0.0,0.0
text_2,0.0,0.43,0.0,0.0,0.0,0.56,0.43,0.0,0.56
text_3,0.5,0.45,0.5,0.19,0.19,0.19,0.3,0.25,0.19


## Lemmatization & Stop-words

### Lemmatization converts words into their root form (lemma), taking context into account

### Stop-words are words that are extremely common in texts and have little value for distinguishing between types of text/documents

In [7]:
# English stop-word list shipped with NLTK.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally — confirm for fresh environments.
stop = stopwords.words('english')

# Report the size of the list, then preview its first ten entries.
print(f'There are a total of {len(stop)} words in the list.')
stop[:10]

There are a total of 179 words in the list.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

## Term frequency-inverse document frequency (tf-idf)

In [8]:
# Re-weight the raw count matrix `doc` with tf-idf: smoothed idf, L2-normalised rows.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

# Fit on the counts and convert the result to a dense array in one step.
a = tfidf.fit_transform(doc).toarray()

In [9]:
# Limit NumPy's printed precision to two decimals. This is a global setting,
# so it affects every array printed after this cell runs.
np.set_printoptions(precision=2)

# Print the tf-idf matrix computed by TfidfTransformer.
print(a)

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [10]:
# Render floats in DataFrames with two decimals (global pandas display option).
pd.options.display.float_format = '{:,.2f}'.format

# Tabulate the TfidfTransformer weights: one row per text, one column per word.
# Row labels are generated from the number of rows instead of being hard-coded,
# so the cell keeps working if texts are added or removed.
df = pd.DataFrame(
    a,
    columns=col_names,
    index=[f'text_{i}' for i in range(1, len(a) + 1)],
)
df

Unnamed: 0,and,is,one,shining,sun,sweet,the,two,weather
text_1,0.0,0.43,0.0,0.56,0.56,0.0,0.43,0.0,0.0
text_2,0.0,0.43,0.0,0.0,0.0,0.56,0.43,0.0,0.56
text_3,0.5,0.45,0.5,0.19,0.19,0.19,0.3,0.25,0.19
