In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding, Dot, Flatten, Input
from keras.models import Model
import numpy as np
import pandas as pd
from google.colab import files
import csv
from transformers import pipeline
from gensim.models import KeyedVectors

In [None]:
# Ask google.colab for a file upload; whatever files.upload() returns is kept in `uploaded`.
uploaded = files.upload()

Saving word_vectors.csv to word_vectors.csv


In [None]:
# Resolve the vectors CSV from this session's upload instead of relying on a
# hard-coded "word_vectors (4).csv", which only exists when the same file has
# already been uploaded several times into the same runtime.
if uploaded:
    # files.upload() returns a {filename: bytes} mapping; the first uploaded
    # file is written to a fixed, predictable location.
    file_path = "/content/word_vectors.csv"
    with open(file_path, "wb") as _vectors_file:
        _vectors_file.write(next(iter(uploaded.values())))
else:
    # Nothing was uploaded in this session: fall back to the previous location.
    file_path = "/content/word_vectors (4).csv"


In [None]:
# Load the CSV at `file_path` into {word: [float, ...]}.
# Layout used by this cell: first column is the word, remaining columns are
# the vector components; the first row is skipped (assumed to be a header).
existing_vectors = {}
with open(file_path, 'r', newline='') as f:  # newline='' as recommended for the csv module
    reader = csv.reader(f)
    next(reader, None)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; row[0] would raise IndexError.
            continue
        word, *components = row
        existing_vectors[word] = [float(component) for component in components]

In [None]:
# Show a small preview instead of dumping the whole vocabulary into the output.
print(dict(list(existing_vectors.items())[:3]))
print(f"{len(existing_vectors)} word vectors loaded")

{'a': [-0.2733841, 0.2790021, 0.4245956, -0.41518036, -0.3309673, 0.23452602, 0.2712199, 0.18993817, 0.14520799, 0.30728018, 0.02408807, -0.10806114, 0.08950882, 0.15373066, 0.03151122, -0.029455438, -0.06451302, -0.004259415, -0.37409458, 0.32723752], 'about': [0.16557759, 0.33586347, 0.3136804, 0.074343905, 0.19548532, -0.0063670366, -0.23307136, 0.32695872, 0.22831121, 0.34889546, 0.23847836, -0.31397557, 0.20190758, -0.15291815, 0.059496723, 0.2784835, -0.2647325, 0.2904488, 0.2849893, -0.036167014], 'above': [0.28216794, 0.24578346, 0.35207635, -0.292255, 0.14559951, 0.0035475283, -0.22823995, 0.3058255, 0.32098946, 0.21896842, 0.23097683, -0.3049532, 0.21149442, -0.24232428, 0.023454938, 0.24659736, -0.103685886, 0.23764953, 0.32246116, 0.056603033], 'across': [0.08429302, 0.33226448, 0.36186197, -0.12147009, -0.17822048, -0.17191552, -0.5100895, 0.118918344, 0.36263686, 0.46164203, 0.40659523, -0.08113879, 0.3653414, 0.07696749, 0.34670225, 0.016465025, 0.048011173, 0.25255048, 

In [None]:
# Parallel lists in dict order: words[i] is the key whose value is vectors[i].
words = [*existing_vectors]
vectors = [*existing_vectors.values()]

In [None]:
def generate_vectors_within_radius(base_vector, radius=0.2, step_size=0.1, max_dim=10,
                                   n_samples=50, rng=None):
    """Return randomly perturbed copies of ``base_vector``.

    Only the first ``min(len(base_vector), max_dim)`` components are perturbed;
    each of them gets independent uniform noise drawn from [-radius, radius).
    The remaining components are copied unchanged. Because the bound applies
    per component, the Euclidean distance between a result and the base vector
    can exceed ``radius``.

    Args:
        base_vector: Sequence of numbers to perturb.
        radius: Half-width of the uniform noise added to each perturbed component.
        step_size: Unused; accepted so existing positional callers keep working.
        max_dim: Maximum number of leading components to perturb.
        n_samples: Number of perturbed vectors to generate (default 50).
        rng: Optional object with a ``uniform(low, high, size)`` method, e.g.
            ``np.random.default_rng(seed)``, for reproducible output. When
            omitted, NumPy's global random state is used.

    Returns:
        A list of ``n_samples`` float arrays, each the same length as ``base_vector``.

    Raises:
        ValueError: If ``base_vector`` is None.
    """
    if base_vector is None:
        raise ValueError("Base vector is None. Please provide a valid word to get its vector.")

    base = np.asarray(base_vector, dtype=float)
    dimensions = min(len(base), max_dim)
    sampler = np.random if rng is None else rng

    # Split once: `head` receives noise, `tail` is appended untouched.
    head, tail = base[:dimensions], base[dimensions:]
    return [
        np.concatenate([head + sampler.uniform(-radius, radius, dimensions), tail])
        for _ in range(n_samples)
    ]


In [None]:
def find_closest_vector(base_vector, radius=0.2, step_size=0.1):
    """Pick the generated perturbation nearest (Euclidean) to ``base_vector``.

    Candidates come from ``generate_vectors_within_radius(base_vector, radius,
    step_size)``. A candidate equal to the base vector is ignored. On a tie the
    earliest candidate wins. Returns None when no candidate qualifies.
    """
    candidates = generate_vectors_within_radius(base_vector, radius, step_size)
    origin = np.array(base_vector)

    best_candidate, best_distance = None, float('inf')
    for candidate in candidates:
        candidate_array = np.array(candidate)
        if not np.array_equal(candidate_array, origin):
            gap = np.linalg.norm(candidate_array - origin)
            if gap < best_distance:
                best_candidate, best_distance = candidate, gap

    return best_candidate


In [None]:
import gensim.downloader as api

# Keep the word2vec vectors under their own name: a later cell rebinds the
# global `model` to the FastText vectors, which would otherwise silently
# change which model find_synonyms() queries.
word2vec_model = api.load('word2vec-google-news-300')
model = word2vec_model  # alias kept for any cell that still reads `model`

def find_synonyms(word, top_n=5):
    """Return up to ``top_n`` words most similar to ``word`` in the word2vec model.

    Args:
        word: Query word.
        top_n: Number of neighbours to request from ``most_similar``.

    Returns:
        A list of neighbour words, most similar first. If the lookup raises
        KeyError, a notice is printed and an empty list is returned, so callers
        that index or iterate the result never walk over the characters of an
        error message.
    """
    try:
        neighbours = word2vec_model.most_similar(word, topn=top_n)
    except KeyError:
        print(f"No synonyms found for {word}.")
        return []
    return [neighbour for neighbour, _similarity in neighbours]

In [None]:
import gensim.downloader as api

# Load the FastText model under its own name so this cell's function does not
# depend on whichever cell last assigned the shared global `model`.
fasttext_model = api.load('fasttext-wiki-news-subwords-300')
model = fasttext_model  # alias kept for any cell that still reads `model`

def find_synonyms_fastText(word, top_n=5):
    """Return up to ``top_n`` words most similar to ``word`` in the FastText model.

    Args:
        word: Query word.
        top_n: Number of neighbours to request from ``most_similar``.

    Returns:
        A list of neighbour words, most similar first. If the lookup raises
        KeyError, a notice is printed and an empty list is returned instead of
        a message string.
    """
    try:
        # Find the most similar words (synonyms)
        neighbours = fasttext_model.most_similar(word, topn=top_n)
    except KeyError:
        print(f"No synonyms found for {word}.")
        return []
    return [neighbour for neighbour, _similarity in neighbours]



Average time taken to load and use the word2vec model: 7 minutes

In [None]:
def write_vector_to_file(word, required_vector, output_file_path):
    """Append one CSV row: the top synonym of ``word`` followed by the vector.

    The row label is ``find_synonyms(word)[0]``, not ``word`` itself.
    NOTE(review): a caller that already passes a synonym gets that synonym's
    own top neighbour as the label; confirm this is intended.

    Args:
        word: Word whose top synonym labels the row.
        required_vector: List or tuple of vector components.
        output_file_path: CSV file to append to.

    Raises:
        ValueError: If ``required_vector`` is None.
        TypeError: If ``required_vector`` is not a list or tuple.

    Any other failure (lookup or file I/O) is reported with a printed message
    rather than raised.
    """
    if required_vector is None:
        raise ValueError("The required vector is None. Please provide a valid vector.")

    if not isinstance(required_vector, (list, tuple)):
        raise TypeError("The required vector should be a list or tuple.")

    try:
        # Look the synonym up before touching the file so a failed lookup
        # leaves the output untouched.
        synonyms = find_synonyms(word)
        if isinstance(synonyms, str) or not synonyms:
            # A message string or an empty result means there is no synonym;
            # indexing it would write a stray character or raise IndexError.
            print(f"No synonym available for '{word}'; nothing written to {output_file_path}.")
            return

        with open(output_file_path, 'a', newline='') as f:
            # Unpack instead of `list + vector`, which fails for tuples.
            csv.writer(f).writerow([synonyms[0], *required_vector])
        print(f"Vector {required_vector} successfully written to {output_file_path}.")
    except Exception as e:
        print(f"An error occurred while writing the vector to the file: {e}")

In [None]:
# Destination CSV for the synonym rows.
output_file_path = "synonyms.csv"

In [None]:
# Pair each synonym of every vocabulary word with a perturbed copy of that
# word's vector and append the pairs to `output_file_path`.
radius = 0.2
step_size = 0.1

# The file is opened once for the whole run instead of once per word.
# Mode 'a' is kept, so re-running this cell appends duplicate rows.
with open(output_file_path, 'a', newline='') as f:
    writer = csv.writer(f)

    for word in words:
        required_word_vector = existing_vectors[word]
        if required_word_vector is None:
            print(f"Word '{word}' not found in vocabulary.")
            continue

        # Generate perturbations (potential synonym embeddings)
        parent = np.asarray(required_word_vector, dtype=float)
        all_vectors = generate_vectors_within_radius(required_word_vector, radius, step_size)
        # Closest-to-parent first. Sorting by the plain norm would rank by
        # distance from the origin, which says nothing about closeness to
        # the parent vector.
        all_vectors.sort(key=lambda vec: np.linalg.norm(vec - parent))

        # Find synonyms
        synonyms = find_synonyms(word)
        if isinstance(synonyms, str):
            # A miss may come back as a message string; indexing it would
            # write its individual characters as "synonyms".
            print(synonyms)
            continue

        # Write only synonym words and embeddings; zip stops at the shorter list.
        for synonym_word, synonym_vector in zip(synonyms, all_vectors):
            synonym_embedding = synonym_vector.tolist()
            writer.writerow([synonym_word] + synonym_embedding)

            print(f"Written to file: {synonym_word} → {synonym_embedding}")


Written to file: A → [-0.20253517402990084, 0.15772108940995033, 0.3961926491245066, -0.3244696053548872, -0.416071290429584, 0.29040753600652014, 0.19510678158590677, 0.09666599004271689, 0.033948316430202824, 0.11078808282568153, 0.02408807, -0.10806114, 0.08950882, 0.15373066, 0.03151122, -0.029455438, -0.06451302, -0.004259415, -0.37409458, 0.32723752]
Written to file: - → [-0.13143763292427674, 0.2864395617740069, 0.3523180886027323, -0.28105403947883717, -0.4242640381900489, 0.2119585568475357, 0.11244363551278572, 0.011048026242142184, 0.291134750206492, 0.16467096851675647, 0.02408807, -0.10806114, 0.08950882, 0.15373066, 0.03151122, -0.029455438, -0.06451302, -0.004259415, -0.37409458, 0.32723752]
Written to file: " → [-0.25036641816370175, 0.0798697433127688, 0.44734641527006136, -0.22088614989959587, -0.28111398993983927, 0.2846700627253198, 0.11080424679743434, 0.08314480548356393, 0.08900524620560962, 0.3955944870096103, 0.02408807, -0.10806114, 0.08950882, 0.15373066, 0.0

In [None]:
# For every vocabulary word: print one nearby perturbation, then hand each
# synonym together with a perturbed vector to write_vector_to_file().
radius = 0.2
step_size = 0.1

for word in words:
    required_word_vector = existing_vectors[word]
    if required_word_vector is None:
        print(f"Word '{word}' not found in vocabulary.")
        continue

    # NOTE(review): find_closest_vector draws its own random candidates, so the
    # printed vector is not one of the `all_vectors` written below.
    required_vector = find_closest_vector(required_word_vector, radius, step_size)

    parent = np.asarray(required_word_vector, dtype=float)
    all_vectors = generate_vectors_within_radius(required_word_vector, radius, step_size)
    # Closest-to-parent first; the plain norm would rank by distance from the origin.
    all_vectors.sort(key=lambda vec: np.linalg.norm(vec - parent))

    print(required_vector)
    synonym = find_synonyms(word)
    if isinstance(synonym, str):
        # A miss may come back as a message string; iterating it would pass
        # single characters on as "synonyms".
        print(synonym)
        continue

    # zip bounds the loop by the shorter list, so a long synonym list can
    # never index past the generated vectors.
    # NOTE(review): write_vector_to_file looks up the top synonym of the word
    # it receives, so the row label is a synonym of `synonym_word` - confirm.
    for synonym_word, candidate_vector in zip(synonym, all_vectors):
        write_vector_to_file(synonym_word, candidate_vector.tolist(), output_file_path)

[ 0.31934239 -0.27205637 -0.22424105  0.32003761 -0.00864874  0.1410218
  0.00457637 -0.19252857  0.02259171 -0.19854945  0.1383709  -0.13435909
 -0.19038898 -0.2459652  -0.31585613 -0.3304083   0.2585063   0.33775625
 -0.32571685  0.2370348 ]
Vector [0.208137025590905, 0.016264893951066617, -0.07736779604919519, 0.3050621634156117, -0.10527605172276545, 0.27972690685995827, 0.08884650549319527, -0.19918012532915227, -0.14933884239732165, -0.16419278552156735, 0.1383709, -0.13435909, -0.19038898, -0.2459652, -0.31585613, -0.3304083, 0.2585063, 0.33775625, -0.32571685, 0.2370348] successfully written to synonyms.csv.
Vector [0.08902685186119047, -0.07412513001028265, -0.13803530315661927, 0.15373268114674046, -0.2676136227114414, 0.3159856677467978, 0.16951498134178752, -0.10384733266785137, -0.22272627181885407, -0.13922833356026584, 0.1383709, -0.13435909, -0.19038898, -0.2459652, -0.31585613, -0.3304083, 0.2585063, 0.33775625, -0.32571685, 0.2370348] successfully written to synonyms.

In [None]:
# Destination CSV for the parent/synonym embedding table.
embedding_output_file = "parent-synonym-embedding.csv"

In [None]:
# Write one row per vocabulary word: the parent vector followed by five
# perturbed vectors. Each cell holds the string form of a whole list.
radius = 0.2
step_size = 0.1

# Header and data rows go through a single handle; mode 'w' starts the file
# afresh on every run, exactly as the header write did before.
with open(embedding_output_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Parent Embedding"] + [f"Synonym-{i+1} Embedding" for i in range(5)])

    for word in words:
        required_word_vector = existing_vectors[word]
        if required_word_vector is None:
            print(f"Word '{word}' not found in vocabulary.")
            continue

        parent = np.asarray(required_word_vector, dtype=float)
        all_vectors = generate_vectors_within_radius(required_word_vector, radius, step_size)
        # Closest-to-parent first; the plain norm would rank by distance from the origin.
        all_vectors.sort(key=lambda vec: np.linalg.norm(vec - parent))

        # No synonym lookup here: the rows contain embeddings only, so the
        # similarity query's result was never used.
        synonym_vectors = all_vectors[:5]
        row_data = [required_word_vector] + [vec.tolist() for vec in synonym_vectors]
        writer.writerow(row_data)


In [None]:
# Download both result files. Each file is handled on its own so that a
# missing file is reported by its real name and does not block the other.
for download_path in (output_file_path, embedding_output_file):
    try:
        files.download(download_path)
    except FileNotFoundError:
        print(f"The file '{download_path}' was not found. Check if it was created successfully.")
    else:
        print(f"Word vectors saved to {download_path}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Word vectors saved to synonyms.csv
