# ***Embeddings - JSON Data*** 

## ***1. Importing Libraries***

In [1]:
import numpy as np
import json
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors

## ***2. Loading the Data*** 

In [2]:
with open('data/entries.json', 'r') as f:
    data = json.load(f)

## ***3. Extracting Text from Data*** 

In [3]:
def extract_text(json_data):
    text = ""  

    # Recursive function to traverse the JSON data
    def traverse_json(node):
        nonlocal text
        if isinstance(node, dict):
            for key, value in node.items():
                if key in ["type", "field", "value"]:
                    text += str(value) + " "
                traverse_json(value)
        elif isinstance(node, list):
            for item in node:
                traverse_json(item)

    traverse_json(json_data)
    return text.strip()

## ***4. Preprocess the Data*** 

In [4]:
def preprocess_text(text):
    return simple_preprocess(text)

## ***5. Generate Embeddings*** 

In [5]:
def generate_embeddings(text, embedding_model):
    # Assuming `embedding_model` is a pre-trained Word2Vec model
    embeddings = []
    for word in text:
        if word in embedding_model:
            embeddings.append(embedding_model[word])
    return embeddings

## ***6. Loading the Pre - Trained Model*** 

#### ***Model Used : GoogleNews-vectors-negative-300 (Open Source Model)***

*The GoogleNews-vectors-negative-300 model, employing Word2Vec on Google News data, creates word embeddings by predicting words based on context. It captures semantic relationships, representing words as vectors in a continuous space. These embeddings, facilitating tasks like translation and sentiment analysis, imbue natural language processing with nuanced understanding. They enable algorithms to interpret linguistic nuances, improve language-related tasks, and enhance various applications requiring semantic comprehension.*

In [6]:
word2vec_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

## ***7. Results*** 

In [7]:
# Extract text from the JSON data
text = extract_text(data)

# Preprocess text
preprocessed_text = preprocess_text(text)

# Generate embeddings
embeddings = generate_embeddings(preprocessed_text, word2vec_model)
print("Embeddings:", embeddings)  # Print embeddings

Embeddings: 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### ***Jupyter's I/O limit exceeded, so the embeddings cannot be printed. So, the first 5 embeddings are alone printed***

### ***i) Total No. of Embeddings***

In [8]:
print("Number of Embeddings:", len(embeddings)) 

Number of Embeddings: 3701


### ***ii) Displaying Subset of Embeddings*** 

In [9]:
num_embeddings_to_print = min(5, len(embeddings))  # Print at most 5 embeddings
print("Subset of Embeddings:")
for i in range(num_embeddings_to_print):
    print(f"Embedding {i + 1}: {embeddings[i]}")

Subset of Embeddings:
Embedding 1: [-1.83593750e-01  2.07031250e-01 -1.72851562e-01 -7.61718750e-02
 -4.71191406e-02  1.11816406e-01  1.77734375e-01  7.08007812e-02
  6.12792969e-02 -1.68945312e-01 -9.27734375e-02  1.70898438e-02
  5.22613525e-04  1.59179688e-01 -2.68554688e-02  1.49414062e-01
  2.79541016e-02  1.24023438e-01  5.07812500e-02 -8.39843750e-02
 -1.41601562e-02 -4.54101562e-02 -2.87109375e-01  1.43554688e-01
  7.42187500e-02  1.93359375e-01 -2.69531250e-01 -2.23632812e-01
  2.14843750e-01 -1.98242188e-01  3.05175781e-02  3.61328125e-01
  8.93554688e-02  8.39843750e-02  7.95898438e-02 -3.84521484e-03
 -1.37695312e-01  3.61328125e-02 -1.34765625e-01  1.73828125e-01
  9.61914062e-02  1.49414062e-01  7.17773438e-02 -6.03027344e-02
  1.73339844e-02 -2.28515625e-01 -8.83789062e-02  8.83789062e-02
  1.04003906e-01  1.03027344e-01 -1.09863281e-01 -1.85546875e-01
 -1.81640625e-01 -1.41601562e-01  6.29882812e-02 -1.08642578e-02
  6.17675781e-02 -1.24023438e-01  1.58203125e-01 -1.080