# Syntactic Vs Semantic NLP

Comparing BoW , TF-IDF , Word2Vec

In [1]:
import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
docs = [
    "The cat sat on the mat.",
    "The dog lay on the mat.",
    "The cat and dog played together."
]

In [3]:
# Bag of Words

bow_vectorizer = CountVectorizer()

bow_matrix = bow_vectorizer.fit_transform(docs)

bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())

bow_df

Unnamed: 0,and,cat,dog,lay,mat,on,played,sat,the,together
0,0,1,0,0,1,1,0,1,2,0
1,0,0,1,1,1,1,0,0,2,0
2,1,1,1,0,0,0,1,0,1,1


In [4]:
# TF-IDF

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df

Unnamed: 0,and,cat,dog,lay,mat,on,played,sat,the,together
0,0.0,0.374207,0.0,0.0,0.374207,0.374207,0.0,0.492038,0.581211,0.0
1,0.0,0.0,0.374207,0.492038,0.374207,0.374207,0.0,0.0,0.581211,0.0
2,0.47111,0.358291,0.358291,0.0,0.0,0.0,0.47111,0.0,0.278245,0.47111


In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 3.1 MB/s eta 0:00:04
     ----- ---------------------------------- 1.8/12.8 MB 4.0 MB/s eta 0:00:03
     --------- ------------------------------ 3.1/12.8 MB 4.3 MB/s eta 0:00:03
     ------------- -------------------------- 4.2/12.8 MB 4.7 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 5.0 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 5.1 MB/s eta 0:00:02
     ------------------------- -------------- 8.1/12.8 MB 5.2 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 5.3 MB/s eta 0:00:01
     ------------------------------- -------- 10

In [7]:
# Word2Vec using spacy

nlp = spacy.load('en_core_web_sm')

def document_vector(doc):

    return nlp(doc).vector

word2vec_matrix = np.array([document_vector(doc) for doc in docs])

print("Shape of word2vec matrix : ", word2vec_matrix.shape)

word2vec_df = pd.DataFrame(word2vec_matrix)

word2vec_df

Shape of word2vec matrix :  (3, 96)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,0.324354,-0.359387,0.05284,-0.15793,-0.24777,-0.240909,0.333646,-0.235756,0.047481,0.33926,...,0.520471,-0.308991,-0.049926,0.417244,-0.44304,-0.126179,0.327501,-0.271217,0.131557,0.095498
1,0.350598,-0.568526,-0.012747,-0.149368,-0.373014,-0.210992,0.20808,-0.102724,0.021061,0.192631,...,0.528918,-0.210021,-0.144755,0.404209,-0.475391,-0.057115,0.385845,-0.195486,-0.034276,0.108231
2,0.220732,-0.367771,-0.261826,-0.285697,-0.056045,0.180542,0.291718,0.339174,0.051279,0.273173,...,-0.182754,0.288142,-0.259799,0.334715,-0.597405,0.036915,0.234475,0.311296,0.226123,-0.060963


In [8]:
# Document similirity using word2vec

print(
    pd.DataFrame(
        cosine_similarity(word2vec_matrix),
        columns=[f'Doc {i+1}' for i in range(len(docs))],
        index=[f'Doc {i+1}' for i in range(len(docs))]
    )
)

          Doc 1     Doc 2     Doc 3
Doc 1  1.000000  0.937243  0.538770
Doc 2  0.937243  1.000000  0.552911
Doc 3  0.538770  0.552911  1.000000


In [9]:
# Word Similirity Cat VS Dog

# Using BoW

bow_vocab = bow_vectorizer.vocabulary_

cat_bow = np.zeros(len(bow_vocab))

dog_bow = np.zeros(len(bow_vocab))

if "cat" in bow_vocab:
    cat_bow[bow_vocab["cat"]] = 1

if "dog" in bow_vocab:
    dog_bow[bow_vocab["dog"]] = 1

sim_bow = cosine_similarity([cat_bow], [dog_bow])[0][0]

print("Cosine Similarity using BoW : ", sim_bow)

Cosine Similarity using BoW :  0.0


In [10]:
# Using TF-IDF

tfidf_vocab = tfidf_vectorizer.vocabulary_

cat_tfidf = np.zeros(len(tfidf_vocab))

dog_tfidf = np.zeros(len(tfidf_vocab))

if "cat" in tfidf_vocab:
    cat_tfidf[tfidf_vocab["cat"]] = 1

if "dog" in tfidf_vocab:
    dog_tfidf[tfidf_vocab["dog"]] = 1

sim_tfidf = cosine_similarity([cat_tfidf], [dog_tfidf])[0][0]

print("Cosine Similarity using TF-IDF : ", sim_tfidf)

Cosine Similarity using TF-IDF :  0.0


In [11]:
# Using word2vec

cat_vec = nlp("cat").vector

dog_vec = nlp("dog").vector

sim_word2vec = cosine_similarity([cat_vec], [dog_vec])[0][0]

print("Cosine Similarity using word2vec : ", sim_word2vec)

Cosine Similarity using word2vec :  0.74227285


In [12]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 0.8/400.7 MB 4.8 MB/s eta 0:01:24
     ---------------------------------------- 1.8/400.7 MB 4.8 MB/s eta 0:01:24
     ---------------------------------------- 2.6/400.7 MB 4.3 MB/s eta 0:01:33
     ---------------------------------------- 3.4/400.7 MB 4.2 MB/s eta 0:01:35
     ---------------------------------------- 3.9/400.7 MB 3.9 MB/s eta 0:01:42
     ---------------------------------------- 4.2/400.7 MB 3.5 MB/s eta 0:01:52
     ---------------------------------------- 4.7/400.7 MB 3.4 MB/s eta 0:01:56
      --------------------------------------- 5.2/400.7 MB 3.4 MB/s eta 0:01:58
      --------------------------------------- 6.0/400.7 MB 3.3 MB/s eta 0:02:01
      ------------------------

In [13]:
# Find vector using Word2vec for each word from the pretrained model

import spacy

nlp = spacy.load("en_core_web_lg")

sentence_1 = "He deposited cash in bank."
sentence_2 = "She sat by the river bank."

doc1 = nlp(sentence_1)
doc2 = nlp(sentence_2)

print("Word vectors for sentence 1 : ")

for token in doc1:

    print(f"{token.text} : {token.vector}")


print("\nWord vectors for sentence 2 : ")

for token in doc2:

    print(f"{token.text} : {token.vector}")

Word vectors for sentence 1 : 
He : [ 8.5181e-02  5.0892e-01 -8.8280e-02 -3.9785e-01  2.5251e-01  5.7932e-02
 -1.5804e-01 -4.0127e-01 -1.3023e-01  3.9609e+00 -1.5790e-01  4.0404e-01
 -2.9674e-01 -1.7273e-01 -4.2699e-01  2.0168e-01  1.6489e-01 -2.7982e-02
  7.6168e-02  4.9308e-02  3.2306e-01  4.5925e-02 -9.0130e-02 -9.4453e-02
  4.2531e-02 -1.7858e-01  4.3216e-02 -1.3998e-01 -6.1144e-02 -4.0905e-01
 -6.0096e-01  1.0798e-01 -1.7421e-01 -6.9579e-02 -1.2241e-01 -2.9401e-01
 -2.1665e-01 -1.5557e-01 -1.9010e-01  1.2889e-01  1.2175e-01 -6.3826e-02
  4.1555e-01 -2.6567e-01  2.6044e-01  1.1329e-01 -2.5144e-01 -1.5775e-01
 -1.5733e-01  1.4483e-01  5.5324e-02  6.6655e-03 -9.1495e-02  1.2651e-01
  2.6622e-02  2.1174e-01  4.7914e-02  1.6410e-01  2.4683e-01 -4.7167e-01
 -2.3645e-01 -1.2938e-02  1.7560e-01  4.0160e-01 -1.7476e-01 -1.3261e-01
  1.2886e-01 -3.4884e-02 -1.4946e-02  2.8423e-01 -2.1608e-01 -1.9089e-01
  2.9524e-02 -3.1676e-01 -6.7708e-01 -1.1262e-01  3.4284e-01 -3.0276e-01
 -1.4569e-01  2

In [14]:
# Using Bert

# Get BERT embeddings using Hugging Face
# Pip install transformers torch

In [15]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # Should return True if CUDA is working

2.7.1+cpu
False


In [16]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

model = AutoModel.from_pretrained('bert-base-uncased')

sentences = ["He deposited cash in the bank.",
             "She sat by the river bank."]

bank_vector = []

for sent in sentences:

    inputs = tokenizer(sent, return_tensors = "pt")

    with torch.no_grad():

        outputs = model(**inputs)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    try:

        bank_idx = tokens.index("bank")

    except ValueError:

        raise Exception(f"'bank' not found in tokens: {token}")

    bank_vec = outputs.last_hidden_state[0][bank_idx].numpy()

    bank_vector.append(bank_vec)

print("Bank (finance) vector : ", bank_vector[0])

print("Bank (river) vector : ", bank_vector[1])


Bank (finance) vector :  [ 3.38825822e-01 -4.81747299e-01 -2.08177626e-01  1.66002646e-01
  9.79713976e-01  1.74404293e-01 -5.12154520e-01  7.76752055e-01
 -9.10247937e-02 -1.84646606e-01  4.41605687e-01 -2.76178658e-01
 -3.61095846e-01  1.60314694e-01 -6.13402069e-01 -2.81178504e-01
  3.98036867e-01  1.37110770e-01  1.14339435e+00  4.41531837e-02
 -5.98254025e-01  4.72648554e-02  4.52181906e-01  1.83068931e-01
  1.11062370e-01  4.82495010e-01  1.09142713e-01  3.67950886e-01
 -4.16943729e-01 -3.93802643e-01  6.22381032e-01  8.63099217e-01
  2.09493786e-01  1.76882088e-01  7.98804909e-02 -1.34072587e-01
  1.28842413e-01 -1.37331277e-01 -1.42057133e+00 -7.58751631e-02
 -1.40125379e-01 -7.00906277e-01 -4.63446826e-01  2.35611230e-01
 -1.31731942e-01 -4.82020438e-01  4.42483008e-01  2.38655716e-01
 -6.54097974e-01 -4.76936638e-01 -3.52407873e-01  6.82572424e-01
  3.14005464e-01 -4.13273662e-01  9.06799808e-02  6.28960550e-01
 -8.22424650e-01 -5.72398603e-01 -6.71776175e-01  3.46927941e-02


In [17]:
# Word2vec part using spaCy

import spacy
from transformers import AutoTokenizer, AutoModel
import torch

print("Word2vec (spaCy) Static vector : ")

nlp = spacy.load("en_core_web_lg")

sentences = ["He deposited cash in the bank.",
             "She sat by the river bank."]

bank_finance_vec = nlp(sentences[0]).vector

bank_river_vec = nlp(sentences[1]).vector

print("bank vector (finance context) : ", bank_finance_vec)

print("bank vector (river context) : ", bank_river_vec)

print("Cosine Similarity (finance Vs river) : ",
      np.dot(bank_finance_vec, bank_river_vec) /
      (np.linalg.norm(bank_finance_vec) * np.linalg.norm(bank_river_vec)))


Word2vec (spaCy) Static vector : 
bank vector (finance context) :  [-1.58615991e-01  1.14979997e-01 -4.24642898e-02 -1.42722428e-01
  1.10434003e-01  6.20458908e-02 -2.76157141e-01 -2.02987149e-01
  1.63861424e-01  2.51839995e+00 -4.53351438e-01  3.40545863e-01
  1.94364578e-01 -9.94372927e-03 -1.67635709e-01  9.79028493e-02
  3.47567126e-02  1.15213108e+00  9.12164301e-02  6.30144328e-02
 -1.70335695e-02  4.35558101e-03 -1.89931855e-01  7.24638626e-02
  4.38669734e-02 -8.12228695e-02 -1.08481847e-01  1.03053145e-01
 -2.09487557e-01 -7.71319792e-02 -3.06118336e-02 -2.00236008e-01
  2.75484286e-02  1.23468712e-01 -6.62985817e-02 -2.46678554e-02
 -8.42538550e-02  1.05736338e-01  5.07591590e-02 -4.03182879e-02
  1.30418539e-02  3.56870890e-03  3.75741422e-01 -3.34944278e-01
  8.76632854e-02  1.52679875e-01  6.88785762e-02 -2.41802722e-01
 -3.34836878e-02 -1.83128603e-02  6.32295758e-02 -1.78576410e-02
 -2.45982155e-01  8.89191478e-02  5.78984134e-02 -2.49181427e-02
 -3.71184275e-02 -1.064

In [18]:
# BERT part

print("BERT contextural vectors : ")

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

model = AutoModel.from_pretrained('bert-base-uncased')

bert_bank_vector = []

for sent in sentences:

    inputs = tokenizer(sent, return_tensors = "pt")

    with torch.no_grad():

        outputs = model(**inputs)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    try:

        bank_idx = tokens.index("bank")

    except ValueError:

        raise Exception(f"'bank' not found in tokens: {token}")

    bank_vec = outputs.last_hidden_state[0][bank_idx].numpy()

    bert_bank_vector.append(bank_vec)

print("Bank (finance) vector : ", bert_bank_vector[0])

print("Bank (river) vector : ", bert_bank_vector[1])

print("Cosine Similarity (finance Vs river) : ",
      np.dot(bert_bank_vector[0], bert_bank_vector[1]) /
      (np.linalg.norm(bert_bank_vector[0]) * np.linalg.norm(bert_bank_vector[1])))

BERT contextural vectors : 
Bank (finance) vector :  [ 3.38825822e-01 -4.81747299e-01 -2.08177626e-01  1.66002646e-01
  9.79713976e-01  1.74404293e-01 -5.12154520e-01  7.76752055e-01
 -9.10247937e-02 -1.84646606e-01  4.41605687e-01 -2.76178658e-01
 -3.61095846e-01  1.60314694e-01 -6.13402069e-01 -2.81178504e-01
  3.98036867e-01  1.37110770e-01  1.14339435e+00  4.41531837e-02
 -5.98254025e-01  4.72648554e-02  4.52181906e-01  1.83068931e-01
  1.11062370e-01  4.82495010e-01  1.09142713e-01  3.67950886e-01
 -4.16943729e-01 -3.93802643e-01  6.22381032e-01  8.63099217e-01
  2.09493786e-01  1.76882088e-01  7.98804909e-02 -1.34072587e-01
  1.28842413e-01 -1.37331277e-01 -1.42057133e+00 -7.58751631e-02
 -1.40125379e-01 -7.00906277e-01 -4.63446826e-01  2.35611230e-01
 -1.31731942e-01 -4.82020438e-01  4.42483008e-01  2.38655716e-01
 -6.54097974e-01 -4.76936638e-01 -3.52407873e-01  6.82572424e-01
  3.14005464e-01 -4.13273662e-01  9.06799808e-02  6.28960550e-01
 -8.22424650e-01 -5.72398603e-01 -6.7

In [19]:
# Cosine similarity using spacy word2vec

import spacy

nlp = spacy.load("en_core_web_lg")

word1 = nlp("friend")
word2 = nlp("enemy")

# word1 = nlp("I am your friend")
# word2 = nlp("I am your enemy")

# word1 = nlp("Enemy's enemy is friend")
# word2 = nlp("Friend's friend is not enemy")

vec1 = word1.vector
vec2 = word2.vector

print("Cosine Similarity : ", word1.similarity(word2))

Cosine Similarity :  0.2666360139846802


In [20]:
word1 = nlp("I am your friend")
word2 = nlp("I am your enemy")

vec1 = word1.vector
vec2 = word2.vector

print("Cosine Similarity : ", word1.similarity(word2))

Cosine Similarity :  0.9120745658874512


In [21]:
word1 = nlp("Enemy's enemy is friend")
word2 = nlp("Friend's friend is not enemy")

vec1 = word1.vector
vec2 = word2.vector

print("Cosine Similarity : ", word1.similarity(word2))

Cosine Similarity :  0.9241560101509094


In [22]:
# Cosine similarity using BERT

from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertModel.from_pretrained('bert-base-uncased')

word1 = "friend"
word2 = "enemy"

# word1 = "I am your friend"
# word2 = "I am your enemy"

# word1 = "Enemy's enemy is friend"
# word2 = "Friend's friend is not enemy"

inputs1 = tokenizer(word1, return_tensors = "pt")

inputs2 = tokenizer(word2, return_tensors = "pt")

with torch.no_grad():

    outputs1 = model(**inputs1)

    outputs2 = model(**inputs2)

vec1 = outputs1.last_hidden_state[0][1]

vec2 = outputs2.last_hidden_state[0][1]

similarity = cosine_similarity(vec1.unsqueeze(0).numpy(),
                               vec2.unsqueeze(0).numpy())[0][0]

print("Cosine similarity (BERT) : ", similarity)

Cosine similarity (BERT) :  0.6918429


In [23]:
!pip install tensorflow



In [24]:
# Training a CBOW model using keras and tensorflow

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K

In [25]:
text = """Neural networks power modern deep learning systems.
          Machine learning is a subset of artificial intelligence.
          Natural language processing allows computers to understand human language.
          Convolutional networks are widely used in Computer vision tasks.
          Tramsformers have changed the landscape of NLP and AI."""

In [26]:
text = text.lower().replace(".","")

tokenizer = Tokenizer()

tokenizer.fit_on_texts([text])

word_index = tokenizer.word_index

index_word = {v: k for k, v in word_index.items()}

vocab_size = len(word_index)

words = text.split()

window_size = 2

X = []
y = []

for i in range(window_size, len(words) - window_size):

    context = [words[j] for j in range(i - window_size, i + window_size + 1) if j != i]

    target = words[i]

    context_ids = [word_index[w] for w in context]

    target_id = word_index[target]

    X.append( context_ids)

    y.append(to_categorical(target_id - 1, num_classes = vocab_size))

X = np.array(X)
y = np.array(y)


embedding_dim = 10



In [27]:
X.shape, y.shape

((38, 4), (38, 38))

In [28]:
model = Sequential()

model.add(Embedding(input_dim = vocab_size+1,
                    output_dim = embedding_dim,
                    input_length = 2*window_size))

model.add(Lambda(lambda x: K.mean(x, axis = 1)))

model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate = 0.01))

model.fit(X, y, epochs = 100, verbose = 0)






<keras.src.callbacks.history.History at 0x1b190f7fd50>

In [29]:
print(X)
print(y)

[[ 5  1  7  8]
 [ 1  6  8  2]
 [ 6  7  2  9]
 [ 7  8  9 10]
 [ 8  2 10  2]
 [ 2  9  2 11]
 [ 9 10 11 12]
 [10  2 12 13]
 [ 2 11 13  3]
 [11 12  3 14]
 [12 13 14 15]
 [13  3 15 16]
 [ 3 14 16  4]
 [14 15  4 17]
 [15 16 17 18]
 [16  4 18 19]
 [ 4 17 19 20]
 [17 18 20 21]
 [18 19 21 22]
 [19 20 22  4]
 [20 21  4 23]
 [21 22 23  1]
 [22  4  1 24]
 [ 4 23 24 25]
 [23  1 25 26]
 [ 1 24 26 27]
 [24 25 27 28]
 [25 26 28 29]
 [26 27 29 30]
 [27 28 30 31]
 [28 29 31 32]
 [29 30 32 33]
 [30 31 33 34]
 [31 32 34 35]
 [32 33 35  3]
 [33 34  3 36]
 [34 35 36 37]
 [35  3 37 38]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


In [30]:
def predict_target_word(context_words):

    print("Input Context : ", context_words)

    try:

        context_ids = [word_index[w.lower()] for w in context_words]

    except KeyError:

        print(f"Context word not in vocabulary : {e}")

        return

    context_ids = np.array(context_ids).reshape(1, -1)

    prediction = model.predict(context_ids, verbose=0)[0]

    predicted_id = np.argmax(prediction)

    predicted_word = index_word[predicted_id]

    print("Predicted target word : ", predicted_word)

In [31]:
predict_target_word(["neural", "power","deep","learning"])

Input Context :  ['neural', 'power', 'deep', 'learning']
Predicted target word :  power


In [32]:
predict_target_word(["machine", "is","of","artificial"])

Input Context :  ['machine', 'is', 'of', 'artificial']
Predicted target word :  a


In [33]:
predict_target_word(["language", "power","deep","learning"])

Input Context :  ['language', 'power', 'deep', 'learning']
Predicted target word :  power


In [34]:
# Step 2

window_size = 2

sequences = []

for line in text.split("\n"):

    words = line.split()

    for i, word in enumerate(words):

        target_word = word_index.get(word)

        if target_word is None:

            continue

        context_words = [
            word_index.get(words[j]) for j in range(i - window_size, i + window_size + 1)
        if j != i and j >= 0 and j < len(words)
        ]

        for context_word in context_words:

            if context_word is not None:

                sequences.append([target_word, context_word])

sequences = np.array(sequences)

vocab_size = len(word_index) + 1

embedding_dim = 10

In [35]:
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              input_length=1,
              name = "word_embedding"),

    Flatten(),

    Dense(vocab_size, activation="softmax")
])

model.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate = 0.001))

In [36]:
X = sequences[:,0]
y = to_categorical(sequences[:, 1], num_classes=vocab_size)

model.fit(X, y, epochs = 10, verbose = 2)

Epoch 1/10
5/5 - 0s - 72ms/step - loss: 3.6629
Epoch 2/10
5/5 - 0s - 7ms/step - loss: 3.6590
Epoch 3/10
5/5 - 0s - 7ms/step - loss: 3.6561
Epoch 4/10
5/5 - 0s - 7ms/step - loss: 3.6531
Epoch 5/10
5/5 - 0s - 7ms/step - loss: 3.6503
Epoch 6/10
5/5 - 0s - 6ms/step - loss: 3.6475
Epoch 7/10
5/5 - 0s - 7ms/step - loss: 3.6447
Epoch 8/10
5/5 - 0s - 7ms/step - loss: 3.6418
Epoch 9/10
5/5 - 0s - 7ms/step - loss: 3.6389
Epoch 10/10
5/5 - 0s - 7ms/step - loss: 3.6360


<keras.src.callbacks.history.History at 0x1b18d64ec90>

In [37]:
def get_context_words(target_word, model, word_index, window_size):

    target_idx = word_index.get(target_word)

    if target_idx is None:

        return []

    context_words = []

    for word, idx in word_index.items():

        if idx != target_idx:

            context_words.append(word)

    return context_words

#Test: Predict context words for a target word

target_word = 'processing'

print(f"""Target word: (target_word) => Context words:
        {get_context_words (target_word, model, word_index, window_size)}""")

Target word: (target_word) => Context words:
        ['networks', 'learning', 'of', 'language', 'neural', 'power', 'modern', 'deep', 'systems', 'machine', 'is', 'a', 'subset', 'artificial', 'intelligence', 'natural', 'allows', 'computers', 'to', 'understand', 'human', 'convolutional', 'are', 'widely', 'used', 'in', 'computer', 'vision', 'tasks', 'tramsformers', 'have', 'changed', 'the', 'landscape', 'nlp', 'and', 'ai']


In [38]:
!pip install vaderSentiment

Collecting vaderSentiment
  Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [39]:
# VADER

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()


sentences = [

    "I love this product! It's amazing.",

    "The weather is terrible today.",

    "I'm not sure if I like this movie.",

    "Overall, it was an average performance.",

    "Not too bad.",

    "I did somewhat ok in the test.",

    "I did ok in the test.",

    "Doing this will be TERRIBLE!!!",

    "Doing this will be terrible.",

    "Spacy is a great library for NLP." ]



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [40]:
for sentence in sentences:

    sentiment_scores = sid.polarity_scores(sentence)

    print(f"Sentence : {sentence}")

    print(f"Sentiment Score : {sentiment_scores}")

    if sentiment_scores['compound'] >= 0.05:

        print("Sentiment : Positive\n")

    elif sentiment_scores['compound'] <= -0.05:

        print("Sentiment : Negative\n")

    else:

        print("Sentiment : Neutral\n")



Sentence : I love this product! It's amazing.
Sentiment Score : {'neg': 0.0, 'neu': 0.266, 'pos': 0.734, 'compound': 0.8516}
Sentiment : Positive

Sentence : The weather is terrible today.
Sentiment Score : {'neg': 0.437, 'neu': 0.563, 'pos': 0.0, 'compound': -0.4767}
Sentiment : Negative

Sentence : I'm not sure if I like this movie.
Sentiment Score : {'neg': 0.449, 'neu': 0.551, 'pos': 0.0, 'compound': -0.4717}
Sentiment : Negative

Sentence : Overall, it was an average performance.
Sentiment Score : {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Sentiment : Neutral

Sentence : Not too bad.
Sentiment Score : {'neg': 0.0, 'neu': 0.412, 'pos': 0.588, 'compound': 0.431}
Sentiment : Positive

Sentence : I did somewhat ok in the test.
Sentiment Score : {'neg': 0.0, 'neu': 0.724, 'pos': 0.276, 'compound': 0.228}
Sentiment : Positive

Sentence : I did ok in the test.
Sentiment Score : {'neg': 0.0, 'neu': 0.645, 'pos': 0.355, 'compound': 0.296}
Sentiment : Positive

Sentence : Doing t