In [1]:
%%time
import numpy as np
import pandas as pd

CPU times: total: 1.36 s
Wall time: 10.4 s


In [2]:
# Toy corpus used throughout the notebook: one string per document.
# NOTE(review): "Gurgoan" is presumably a misspelling of "Gurgaon"; it is kept
# verbatim because the vocabulary and counts shown below are derived from it.
corpus = [
    "Ashi is happy to work in Gurgoan. Work brings happiness to Ashi.",
    "Happy people work better in Gurgoan. Ashi enjoys the work culture here.",
    "The work environment in her company makes Ashi very happy.",
    "Happiness comes from working hard and work life balance.",
]
print(corpus)

['Ashi is happy to work in Gurgoan. Work brings happiness to Ashi.', 'Happy people work better in Gurgoan. Ashi enjoys the work culture here.', 'The work environment in her company makes Ashi very happy.', 'Happiness comes from working hard and work life balance.']


In [3]:
import re

# Strip every character that is not an ASCII letter, a digit or a space.
# Punctuation is deleted outright (not replaced by a space), so the words on
# either side of it stay separated only if a space was already present.
corpus_an = [re.sub(r'[^a-zA-Z0-9 ]', '', text) for text in corpus]
print(corpus_an)

['Ashi is happy to work in Gurgoan Work brings happiness to Ashi', 'Happy people work better in Gurgoan Ashi enjoys the work culture here', 'The work environment in her company makes Ashi very happy', 'Happiness comes from working hard and work life balance']


In [4]:
%%time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Build a bag-of-words representation of the cleaned corpus.
count_vectorizer = CountVectorizer() # default settings; the vocabulary printed further down is all lowercase
# Learn the vocabulary from corpus_an and count each term per document in one step.
X_count = count_vectorizer.fit_transform(corpus_an)
# The result is a SciPy sparse matrix (see the cell output), not a dense array.
type(X_count)

CPU times: total: 1.48 s
Wall time: 19.4 s


scipy.sparse._csr.csr_matrix

In [5]:
# Unique terms learned by the vectorizer; each one labels a column of X_count.
vocabulary = count_vectorizer.get_feature_names_out()
# Dense copy of the sparse count matrix, purely for readable printing.
dense_counts = X_count.toarray()

# The empty string yields the blank line that separates the two sections.
print("Vocabulary:", vocabulary, "", sep="\n")
print("Count Vectorizer Matrix:", dense_counts, sep="\n")

Vocabulary:
['and' 'ashi' 'balance' 'better' 'brings' 'comes' 'company' 'culture'
 'enjoys' 'environment' 'from' 'gurgoan' 'happiness' 'happy' 'hard' 'her'
 'here' 'in' 'is' 'life' 'makes' 'people' 'the' 'to' 'very' 'work'
 'working']

Count Vectorizer Matrix:
[[0 2 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 2 0 2 0]
 [0 1 0 1 0 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 0 2 0]
 [0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0]
 [1 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1]]


In [6]:
# Merge all cleaned documents into one lowercase, space-separated string so
# that term frequencies can be computed over the whole corpus at once.
corpus_combined = ' '.join(document.lower() for document in corpus_an)
corpus_combined

'ashi is happy to work in gurgoan work brings happiness to ashi happy people work better in gurgoan ashi enjoys the work culture here the work environment in her company makes ashi very happy happiness comes from working hard and work life balance'

In [7]:
%%time
from nltk.probability import FreqDist

# Whitespace-tokenise the combined corpus and tally how often each token occurs.
corpus_tokens = corpus_combined.split()
freq_dist = FreqDist(corpus_tokens)

# most_common(1) returns a one-element list holding the (term, count) pair
# with the highest frequency.
top_term = freq_dist.most_common(1)
print("Frequency Distribution:", top_term)

Frequency Distribution: [('work', 6)]
CPU times: total: 1.03 s
Wall time: 10 s


In [8]:
# Show the dense count matrix one document (row) at a time.
document_term_counts = X_count.toarray()
for term_counts in document_term_counts:
    print(term_counts)

[0 2 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 2 0 2 0]
[0 1 0 1 0 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 0 2 0]
[0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0]
[1 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1]


In [9]:
# Goal: report every vocabulary term that occurs in ALL documents.
# After transposing, each row holds one term's counts across the documents.
X_count_array_transposed = X_count.toarray().transpose()
# Loop-invariant, so fetch the vocabulary once instead of once per reported term.
feature_names = count_vectorizer.get_feature_names_out()

# Multiply each term's per-document counts: counts are non-negative, so the
# product is non-zero exactly when the term is present in every document.
# .tolist() converts the counts to plain Python ints first. Multiplying the
# NumPy fixed-width integers directly would wrap around silently on larger
# corpora (possibly to 0), which would wrongly hide a term that is in fact
# present everywhere.
product_list = []
for term_counts in X_count_array_transposed.tolist():
    product = 1
    for count in term_counts:
        product *= count
    product_list.append(product)

# A non-zero product => the term appears in ALL documents.
for i, j in enumerate(product_list):
    if j != 0:
        print('The term "%s" occurs in all sentences' % feature_names[i])

The term "work" occurs in all sentences


In [10]:
# Display the learned vocabulary as an array; a term's position here is the
# index of its column in X_count (same terms as printed under "Vocabulary:" above).
count_vectorizer.get_feature_names_out()

array(['and', 'ashi', 'balance', 'better', 'brings', 'comes', 'company',
       'culture', 'enjoys', 'environment', 'from', 'gurgoan', 'happiness',
       'happy', 'hard', 'her', 'here', 'in', 'is', 'life', 'makes',
       'people', 'the', 'to', 'very', 'work', 'working'], dtype=object)

In [11]:
from nltk.stem import PorterStemmer

ps = PorterStemmer() ## stemmer instance; not called anywhere in this cell yet
# Placeholder for stemmed terms; nothing in this cell appends to it.
feature_names_stemmed = []

# For now, only list each original (uncleaned) document with its index.
for idx, document in enumerate(corpus):
    print(idx, document)
# NOTE(review): the disabled draft below passes the whole token list to
# ps.stem(); that method presumably expects a single word (string), so the
# tokens should be stemmed one by one, e.g. [ps.stem(tok) for tok in document.split()].
#     print(ps.stem(document.split()))

0 Ashi is happy to work in Gurgoan. Work brings happiness to Ashi.
1 Happy people work better in Gurgoan. Ashi enjoys the work culture here.
2 The work environment in her company makes Ashi very happy.
3 Happiness comes from working hard and work life balance.
