In [None]:
import gensim.downloader as api
from scipy.spatial.distance import cosine

# Pre-trained Word2Vec vectors, fetched through the gensim downloader.
# NOTE: a later cell rebinds `model` to a BERT model, so re-run this cell
# before re-running any of the Word2Vec cells.
WORD2VEC_MODEL_NAME = "word2vec-google-news-300"
model = api.load(WORD2VEC_MODEL_NAME)

def get_word_embedding(word):
    """Return the vector stored for `word` in the module-level `model`.

    The lookup is keyed on the word alone, so the result does not depend on
    any surrounding sentence. A later cell in this notebook redefines this
    name with a two-argument (sentence, word) version.
    """
    vector = model[word]
    return vector

# Static lookup of 'bank' with no sentence involved at all.
embedding = get_word_embedding('bank')

# Two sentences that use 'bank' in different senses.
sentence1 = "I sat by the river bank."
sentence2 = "I deposited money in the bank."

# The lookup is keyed on the word alone -- neither sentence is passed in --
# so the embedding of 'bank' "from" each sentence is the very same vector.
embedding_from_sentence1, embedding_from_sentence2 = (
    get_word_embedding('bank'),
    get_word_embedding('bank'),
)

# scipy's `cosine` is a distance; one minus that distance is the similarity.
cosine_distance = cosine(embedding_from_sentence1, embedding_from_sentence2)
similarity = 1 - cosine_distance

# Expected to report a similarity of 1, since the two vectors are identical.
print(f"Cosine similarity between the embeddings: {similarity}")


Cosine similarity between the embeddings: 1


In [None]:
# Show the raw Word2Vec vector looked up for 'bank'.
print(f"Embedding for the word 'bank': {embedding}")


In [None]:
# Container type first, then the number of components in the vector.
for fact in (type(embedding), len(embedding)):
    print(fact)

<class 'numpy.ndarray'>
300


In [None]:
# Element type of the Word2Vec vector.
element_dtype = embedding.dtype
print(element_dtype)

In [None]:
pip install transformers



In [None]:
from transformers import BertTokenizer, BertModel
import torch

# One checkpoint name drives both the tokenizer and the encoder.
# NOTE: this rebinds `model`, which earlier in the notebook held the Word2Vec vectors.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_word_embedding(sentence, word):
    """Return the contextual embedding of `word` as it appears in `sentence`.

    Uses the module-level `tokenizer` and `model`. This redefines the earlier
    one-argument Word2Vec helper of the same name.

    Args:
        sentence: Text that contains `word`.
        word: The word to look up. It is tokenized with the same tokenizer as
            the sentence, so casing and sub-word splitting are treated the
            same way on both sides.

    Returns:
        numpy.ndarray: The last-hidden-state vector at the first occurrence of
        `word`. If the word is split into several tokens, the mean of those
        tokens' vectors is returned.

    Raises:
        ValueError: If `word` yields no tokens or does not occur in `sentence`.
    """
    inputs = tokenizer(sentence, return_tensors="pt")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Tokenize the word itself rather than looking the raw string up in the
    # vocabulary, so "Bank" or a multi-piece word still matches the sentence.
    piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
    if not piece_ids:
        raise ValueError(f"{word!r} produced no tokens")

    sentence_ids = inputs["input_ids"][0].tolist()
    span = len(piece_ids)
    start = next(
        (
            i
            for i in range(len(sentence_ids) - span + 1)
            if sentence_ids[i:i + span] == piece_ids
        ),
        None,
    )
    if start is None:
        raise ValueError(f"{word!r} was not found in sentence {sentence!r}")

    hidden_states = outputs["last_hidden_state"][0]
    # For a single-token word the mean over one row is that row unchanged.
    return hidden_states[start:start + span].mean(dim=0).detach().numpy()

# The same surface word, 'bank', used in two different senses.
sentence1 = "I sat by the river bank."
sentence2 = "I deposited money in the bank."

embedding1 = get_word_embedding(sentence1, "bank")
embedding2 = get_word_embedding(sentence2, "bank")

# Compare the two contextual vectors with cosine similarity (not a raw dot
# product). Each vector is wrapped as a one-row batch before the comparison.
vector1 = torch.tensor(embedding1).unsqueeze(0)
vector2 = torch.tensor(embedding2).unsqueeze(0)
similarity = torch.nn.functional.cosine_similarity(vector1, vector2)

print(f"Cosine similarity between the embeddings: {similarity.item()}")



Cosine similarity between the embeddings: 0.5257285833358765


In [None]:
# Show the contextual BERT vector for 'bank' taken from the first sentence.
print(embedding1)

[ 3.79149020e-01 -5.13639987e-01  2.04258025e-01 -3.35932910e-01
 -3.92088473e-01  2.65460938e-01  1.79102048e-01  1.28376985e+00
 -1.52784362e-01 -5.30052245e-01  7.23883212e-01  2.17351019e-01
  2.28066310e-01  2.17148572e-01 -2.46859416e-01  3.17278415e-01
  6.68315589e-02 -8.06532055e-02  9.75921810e-01 -5.21805622e-02
  6.47305787e-01  3.78125846e-01  9.39771608e-02  1.27783582e-01
  5.06222308e-01  2.62110591e-01  7.34253585e-01 -4.33873355e-01
 -1.36379763e-01  2.23566309e-01  1.29379857e+00  4.36596423e-01
 -1.51691899e-01  1.30738884e-01 -3.35072577e-01  7.58766904e-02
  1.21622741e-01 -5.28269827e-01 -3.32096636e-01  6.20918632e-01
 -5.65218687e-01 -8.92570496e-01 -5.35947382e-01  8.75462294e-01
  4.00355071e-01 -2.04721138e-01 -2.30198011e-01 -1.49468750e-01
 -4.66228962e-01  3.58305246e-01 -4.75095838e-01  8.03174794e-01
 -3.76682669e-01 -5.61426818e-01  1.93526015e-01  8.07532191e-01
 -5.56862235e-01 -9.54556525e-01 -2.49630630e-01  1.68071002e-01
  6.89454019e-01  5.88459

In [None]:
# Number of components first, then the element type of the BERT vector.
for fact in (len(embedding1), embedding1.dtype):
    print(fact)

768
float32


**Sentiment Analysis Using BERT**

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax


In [None]:
# Sentiment checkpoint: per the original author's note, this model is trained
# for sentiment analysis and gives ratings from 1 to 5.
# NOTE: this rebinds `model_name`, `model` and `tokenizer` from the embedding cells.
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def predict_sentiment(text):
    """Rate the sentiment of `text` on a 1-to-5 scale.

    Tokenizes `text` (truncated to at most 512 tokens), runs the module-level
    classification `model`, and picks the most probable class.

    Args:
        text: The string to score.

    Returns:
        int: The predicted rating from 1 to 5. Class indices are zero-based,
        so the winning index is shifted up by one; this assumes the
        checkpoint's five classes are ordered "1 star" .. "5 stars", as its
        model card describes.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs)[0]

    # Convert the logits to probabilities and take the best class for the
    # single input sequence.
    probs = softmax(logits, dim=1)
    class_index = torch.argmax(probs[0]).item()

    # Map the zero-based class index (0-4) onto the 1-5 rating scale.
    return class_index + 1


In [None]:
# Score one example sentence and report the result out of 5.
text = "The nature is so beautiful."
sentiment = predict_sentiment(text)
rating_line = f"Sentiment rating: {sentiment}/5"
print(rating_line)

Sentiment rating: 4/5
