## Word Frequency Distribution

In [None]:
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'punkt_tab' NLTK tokenizer data package before tokenizing
# (presumably the resource word_tokenize relies on — confirm against NLTK docs).
nltk.download('punkt_tab')

def word_frequency(text):
    """Count how often each token occurs in ``text``, ignoring case.

    The text is lower-cased and split with ``word_tokenize``; every token
    the tokenizer emits is counted, so punctuation marks such as ``.`` get
    their own entries alongside the words.

    Args:
        text: The string to analyse.

    Returns:
        A ``Counter`` mapping each lower-cased token to its occurrence count.
    """
    tokens = word_tokenize(text.lower())
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies

# Two short sentences in which "NLP" (and the full stop) occur twice.
text = "NLP is amazing. NLP makes machines understand text."
word_counts = word_frequency(text)
print("Word Frequencies:", word_counts)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Word Frequencies: Counter({'nlp': 2, '.': 2, 'is': 1, 'amazing': 1, 'makes': 1, 'machines': 1, 'understand': 1, 'text': 1})


## Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: one string per document. It is reused by the later cells.
corpus = [
    "NLP is fun and exciting",
    "Machines understand NLP and text",
    "Text processing is part of NLP",
]

# Learn the vocabulary from the corpus and build the document-term count
# matrix in a single step (one row per document, one column per term).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print("Feature Names:", vectorizer.get_feature_names_out())
# sep="\n" puts the dense matrix on the line after the label.
print("BoW Representation:", X.toarray(), sep="\n")

Feature Names: ['and' 'exciting' 'fun' 'is' 'machines' 'nlp' 'of' 'part' 'processing'
 'text' 'understand']
BoW Representation:
[[1 1 1 1 0 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 0 1 1]
 [0 0 0 1 0 1 1 1 1 1 0]]


## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the same corpus again, this time with TF-IDF weights.
# NOTE: this rebinds `vectorizer` and `X`, replacing the bag-of-words
# objects created in the earlier cell; later cells see the TF-IDF versions.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

print("Feature Names:", vectorizer.get_feature_names_out())
# sep="\n" puts the dense matrix on the line after the label.
print("TF-IDF Representation:", X.toarray(), sep="\n")

Feature Names: ['and' 'exciting' 'fun' 'is' 'machines' 'nlp' 'of' 'part' 'processing'
 'text' 'understand']
TF-IDF Representation:
[[0.40619178 0.53409337 0.53409337 0.40619178 0.         0.31544415
  0.         0.         0.         0.         0.        ]
 [0.40619178 0.         0.         0.         0.53409337 0.31544415
  0.         0.         0.         0.40619178 0.53409337]
 [0.         0.         0.         0.35829137 0.         0.27824521
  0.4711101  0.4711101  0.4711101  0.35829137 0.        ]]


## NLP Task - Keyword Extraction

In [None]:
import numpy as np

# Rank vocabulary terms by their TF-IDF weight summed over every document.
# A row-wise argsort that is flattened and reversed only walks the last
# document's row first, so it would rank that single document's terms
# instead of the corpus as a whole.
feature_array = np.array(vectorizer.get_feature_names_out())
term_scores = X.toarray().sum(axis=0)  # one aggregate score per term (column)

# Stable sort on the negated scores: highest score first, with ties kept in
# vocabulary order so the result is deterministic.
importance = np.argsort(-term_scores, kind="stable")

# Slicing tolerates vocabularies with fewer than five terms.
keywords = feature_array[importance[:5]]
print("Top Keywords:", keywords)

Top Keywords: ['processing' 'of' 'part' 'is' 'text']
