In [None]:
!pip install spacy gensim
from spacy.cli import download

download("en_core_web_sm")

/media/scientist-anand/volume/mr_document/all_venv/chatterbox_venv/bin/python: No module named spacy


In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
text = "Natural Language Processing helps computers understand human language."
doc = nlp(text)
print(doc)

Natural Language Processing helps computers understand human language.


In [10]:
clean_data = []
for token in doc:
    if token.is_alpha and not token.is_stop:
        print(token)
        print("lemma==>", token.lemma_.lower())
        clean_data.append(token.lemma_.lower())

print(clean_data)

Natural
lemma==> natural
Language
lemma==> language
Processing
lemma==> processing
helps
lemma==> help
computers
lemma==> computer
understand
lemma==> understand
human
lemma==> human
language
lemma==> language
['natural', 'language', 'processing', 'help', 'computer', 'understand', 'human', 'language']


In [None]:
# Very simple representation

# Loses order/semantic meaning → so weak but baseline


from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    "Natural language processing makes machines understand text",
    "Machines learn language using models",
]
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(bow.toarray())

['language' 'learn' 'machines' 'makes' 'models' 'natural' 'processing'
 'text' 'understand' 'using']
[[1 0 1 1 0 1 1 1 1 0]
 [1 1 1 0 1 0 0 0 0 1]]


In [None]:
# ✔ Why TF–IDF?

# Better than BoW
# Reduces weight of common words
# Still no semantic meaning

from sklearn.feature_extraction.text import TfidfVectorizer
tfvectorizer = TfidfVectorizer()
tf_idf = tfvectorizer.fit_transform(corpus)
print(tfvectorizer.get_feature_names_out())
print(tf_idf.toarray())


['language' 'learn' 'machines' 'makes' 'models' 'natural' 'processing'
 'text' 'understand' 'using']
[[0.29017021 0.         0.29017021 0.4078241  0.         0.4078241
  0.4078241  0.4078241  0.4078241  0.        ]
 [0.35520009 0.49922133 0.35520009 0.         0.49922133 0.
  0.         0.         0.         0.49922133]]


In [None]:
# spaCy Built-in Word Embeddings
# spaCy models embed each token into a dense vector.
# Already pre-trained
# Fast, good for similarity tasks

for token in doc:
    print(token.text, token.vector)

Natural [ 0.09233066 -1.4107585   1.2357942   0.8190626  -0.6822275  -0.52685237
  0.24148856  1.1131383  -0.69569486 -1.3829342   0.5967282   0.5573212
 -0.17091261 -0.18343106 -0.76795894  0.2000229  -0.7856187  -0.05777684
 -0.07475362 -1.1156871  -0.8612213   0.57850575 -0.45532492  0.79654884
  0.01946539 -0.53069335 -0.06473133  0.5699377   0.677717    1.318459
 -0.00259918 -0.18492317  0.08142059 -0.66779655 -0.78136814  0.3002373
 -0.7488482   0.04717466 -0.39417383 -0.00236714 -0.07755528 -0.14904977
 -0.52217436  0.9544612   0.40662533  1.1361792  -0.31644675 -0.53252393
 -0.10778171 -0.02959064 -0.12958586  1.2995785   0.66378194 -0.5272245
 -0.32479405  0.95076203 -0.10416352  0.15172255 -0.6867532   0.8802191
 -0.99727404 -0.16225106 -0.15909408 -1.0744048  -0.79363453 -0.90394163
  0.99174035  1.707526    0.11501172 -0.30774158 -0.14780864  0.09372749
  0.6520758  -0.8216307   0.54177415  0.12289012 -0.58877665 -0.69574475
  0.78675866 -0.28117856 -1.3316567   0.11586311 

In [21]:
# Train Word2Vec From Scratch (gensim)
# Word2Vec learns semantic meaning through context.
from gensim.models import Word2Vec
sentences = [
    ["natural", "language", "processing", "helps", "computers"],
    ["computers", "learn", "language", "models"],
]
w2v_model = Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    workers=4,
    sg=1
)
print(w2v_model.wv)

KeyedVectors<vector_size=50, 7 keys>


In [32]:
# Produces semantic vectors
# Words with similar meaning have similar vectors
# You can compute analogies, similarities

print(w2v_model.wv['language'])


[-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.00177605  0.01106732
 -0.00548595  0.00452013  0.01091159  0.01669191 -0.00290748 -0.01841629
  0.0087411   0.00114357  0.01488382 -0.00162657 -0.00527683 -0.01750602
 -0.00171311  0.00565313  0.01080286  0.01410531 -0.01140624  0.00371764
  0.01217773 -0.0095961  -0.00621452  0.01359526  0.00326295  0.00037983
  0.00694727  0.00043555  0.01923765  0.01012121 -0.01783478 -0.01408312
  0.00180291  0.01278507]


In [36]:
# Train FastText From Scratch
# FastText uses character n-grams → handles OOV words.
from gensim.models import FastText
fasttext_model = FastText(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,
    workers=4
)

print(fasttext_model.wv["language"][:5])
print(fasttext_model.wv["langauge"][:5])  # misspelled word

[-0.0046083   0.00370444  0.00056316 -0.00397093 -0.0032009 ]
[-1.5339542e-03 -8.6634536e-04  1.1936483e-03  6.0992584e-06
  1.3281086e-03]


In [37]:
print("Word2Vec similarity:", w2v_model.wv.similarity("language", "computers"))
print("FastText similarity:", fasttext_model.wv.similarity("language", "computers"))

Word2Vec similarity: 0.042373005
FastText similarity: -0.040806167
