# NLP Lecture -05: Word2Vec

## Assignment-05:
Use the following dataset: https://www.kaggle.com/datasets/divyansh22/friends-tv-show-script

- Take Friends show script dataset
- Apply Word2Vec over that

## 1. Using a built-in library (pre-trained model)

In [None]:
# Install the packages used below: gensim (Word2Vec) and wget (file download)
!pip install gensim
!pip install wget

In [None]:
# Importing library
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Visual divider only. Kept as a comment: as bare code this line is an
# expression over the undefined names `xxx` and `check`, so the cell fails
# when it is executed.
# --------------xxx-------------------xxx------------check-------------------xxx

In [None]:
import wget
from urllib.error import HTTPError, URLError

url = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'

# Bind `filename` up front so that code running after a failed download sees
# None instead of raising NameError on an unbound name.
filename = None
try:
    filename = wget.download(url)
except URLError as e:
    # URLError is the base class of HTTPError, so this handles HTTP status
    # errors as before and additionally covers connection-level failures
    # (DNS lookup, refused connection, truncated transfer).
    print(f"Error downloading file: {e}")

In [None]:
# Fetch the pre-trained Google News vectors through gensim's downloader API
from gensim import downloader as api

wv = api.load('word2vec-google-news-300')

# Look up a single word to check that the vectors are usable
vec_king = wv['king']

In [None]:
# Disabled alternative: download the archive, decompress it by hand, and load
# the result with a cap of 100,000 vectors (`limit=100000`).
# NOTE(review): re-enabling this snippet also requires `import gzip`, which it
# does not include.
# import wget
# url = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
# filename = wget.download(url)

# f_in = gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb')
# f_out = open('GoogleNews-vectors-negative300.bin', 'wb')
# f_out.writelines(f_in)

# model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit=100000)

In [None]:
# Download the pre-trained GoogleNews vectors with the shell `wget` command.
# The `!` line runs the command-line tool; it does not use the Python `wget`
# module imported here.
import wget
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
# Load the downloaded word2vec binary into a KeyedVectors object named `model`
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [None]:
# Visual divider only. Kept as a comment: as bare code this line is an
# expression over the undefined name `xxx`, so the cell fails when it is
# executed.
# --------------xxx--------------------xxx---------------------------xxx

In [None]:
# Let's see the vector representation of some words.
# Printed explicitly: a notebook cell echoes only its last bare expression,
# so without print() the vector for 'man' would never be shown.
print(model['man'])
print(model['cricket'])

# Task: Try to give color encoding for each feature.

In [None]:
# Finding most similar words.
# Each result is printed: a notebook cell echoes only its last bare
# expression, so the neighbours of 'man' and 'cricket' were being discarded.
for word in ('man', 'cricket', 'facebook'):
    print(word, '->', model.most_similar(word))

In [None]:
# Getting similarity between two words.
# Both scores are printed: a notebook cell echoes only its last bare
# expression, so the ('man', 'woman') score was being discarded.
print('man / woman:', model.similarity('man', 'woman'))
print('man / PHP:', model.similarity('man', 'PHP'))

In [None]:
# Find the odd one out: ask the model which word of the list does not match the others
model.doesnt_match(['PHP','java','monkey'])

In [None]:
# Vector arithmetic: build king - man + woman, then list the words whose
# vectors are most similar to the resulting vector
vec = model['king'] - model['man'] + model['woman']
model.most_similar([vec])

In [None]:
# Same analogy arithmetic for currencies: INR - India + England
vec = (
    model['INR']
    - model['India']
    + model['England']
)
model.most_similar([vec])

## 2. Training our own Word2Vec model:
- Corpus: the Game of Thrones books
- Dataset Link: https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books/

In [None]:
# Importing library
import numpy as np  
import pandas as pd 
import os

In [None]:
# Additional import
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Build the training corpus: every file under /kaggle/input is read, split
# into sentences, and each sentence is stored as a list of preprocessed tokens.

Game_of_thrones = []                                   # One token list per sentence
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        with open(os.path.join(root, name), 'r', encoding='latin-1') as handle:
            text = handle.read()

        # The file is already closed here; only the in-memory text is tokenised
        Game_of_thrones.extend(
            simple_preprocess(sentence) for sentence in sent_tokenize(text)
        )

In [None]:
# Total number of tokenised sentences collected in the corpus
len(Game_of_thrones)

In [None]:
# Preview the first five tokenised sentences, one per line
for tokens in Game_of_thrones[:5]:
    print(tokens)

In [None]:
# Create our own Word2Vec model object; no sentences are passed here, so the
# vocabulary and the training are handled by separate calls.

model = gensim.models.Word2Vec(
    vector_size=100,
    window=10,
    min_count=2,
    workers=3,
)

# Other parameters
# gensim.models.Word2Vec(sentences=None,corpus_file=None,vector_size=100,alpha=0.025,window=5,min_count=5,max_vocab_size=None,
#                        sample=0.001,seed=1,workers=3,min_alpha=0.0001,sg=0,hs=0,negative=5,ns_exponent=0.75,cbow_mean=1,
#                        hashfxn=<built-in function hash>,epochs=5,null_word=0,trim_rule=None,sorted_vocab=1,batch_words=10000,
#                        compute_loss=False,callbacks=(),comment=None,max_final_vocab=None,shrink_windows=True,)

In [None]:
# Build the model's vocabulary from the tokenised sentences
model.build_vocab(Game_of_thrones)

In [None]:
# Train the model on the tokenised sentences; the example count and the
# number of epochs are read back from the model object itself
model.train(Game_of_thrones, total_examples=model.corpus_count, epochs=model.epochs)

# Number of words in the model's vocabulary
len(model.wv.index_to_key)

In [None]:
# Disabled helper: averages the vectors of a document's in-vocabulary words.
# NOTE(review): `df` is not defined in the visible part of this notebook —
# confirm where it comes from before re-enabling.
# # Defining function
# def document_vector(doc):
#     # Remove out-of-vocabulary words
#     doc = [word for word in doc.split() if word in model.wv.index_to_key]
#     return np.mean(model.wv[doc], axis=0)

# # Checking 1st review in vector with its dimension
# document_vector(df['review'].values[0])
# len(document_vector(df['review'].values[0]))

In [None]:
# Similarity of several character names to 'sansa'.
# Each score is printed: a notebook cell echoes only its last bare
# expression, so the 'arya' and 'bran' scores were being discarded.
for other in ('arya', 'bran', 'tywin'):
    print(other, '/ sansa:', model.wv.similarity(other, 'sansa'))

In [None]:
# Words the trained model considers most similar to 'history'
model.wv.most_similar('history')

In [None]:
# Finding odd one out.
# Character names are spelled as in the books ('rickon', 'cersei'); a
# misspelled name is presumably absent from the trained vocabulary and would
# not take part in the comparison — confirm against gensim's doesnt_match docs.
# Both answers are printed because a notebook cell echoes only its last bare
# expression.
print(model.wv.doesnt_match(['jon', 'rickon', 'robb', 'arya', 'sansa', 'bran']))
print(model.wv.doesnt_match(['cersei', 'jaime', 'broom', 'tyrion']))

In [None]:
# Vector representation of a single word from the trained model
model.wv['arya']

In [None]:
# Getting the normalised vector representation of all the words.
# The matrix is computed once and reused (the original called
# get_normed_vectors() twice), and both the matrix and its shape are printed
# because a notebook cell echoes only its last bare expression.
normed_vectors = model.wv.get_normed_vectors()
print(normed_vectors)
print(normed_vectors.shape)

# shape: (17453, 100) = (unique_words, dimension)

In [None]:
# List of vocabulary words; presumably position i here corresponds to row i
# of the vector matrix (confirm against gensim's KeyedVectors docs)
x = model.wv.index_to_key
len(x)

In [None]:
# Set up PCA to reduce the word vectors to 3 components
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

In [None]:
# Fit PCA on the normalised word vectors and project them onto the 3 components
y = pca.fit_transform(model.wv.get_normed_vectors())
y.shape

In [None]:
import plotly.express as px

# Plotting first 100 words: rows of the PCA output, coloured by their word.
# The integers passed as x/y/z select columns of `y[:100]`.
fig = px.scatter_3d(
    y[:100],
    x=1,
    y=0,
    z=2,
    color=x[:100],
)
fig.show()