In [1]:
import numpy as np

In [2]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return sorted(words), word_to_vec_map

In [3]:
vocab, word_to_vec_dict=read_glove_vecs('glove.6B.300d.txt')

In [4]:
print(len(vocab),type(vocab))
print(vocab[100000:100010])

400000 <class 'list'>
['chording', 'chordoma', 'chordophones', 'chords', 'chore', 'chorea', 'chorene', 'choreograph', 'choreographed', 'choreographer']


In [5]:
vocab=['<SOS>','<EOS>']+vocab

In [6]:
print(len(vocab),type(vocab))
print(vocab[0:3])

400002 <class 'list'>
['<SOS>', '<EOS>', '!']


In [7]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

vectorize_layer = TextVectorization(
    output_mode='int',
    vocabulary=vocab)

2025-06-01 14:01:41.019032: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-01 14:01:41.191121: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748766701.253770   20629 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748766701.274559   20629 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748766701.411289   20629 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [8]:
vector_vocab=vectorize_layer.get_vocabulary()

In [9]:
print(len(vector_vocab),type(vector_vocab))

400004 <class 'list'>


In [10]:
print(vector_vocab[0:10])

['', '[UNK]', '<SOS>', '<EOS>', '!', '!!', '!!!', '!!!!', '!!!!!', '!?']


In [11]:
def create_dict(vocab):
    i = 0
    words_to_index = {}
    index_to_words = {}
    for word in vocab:
        words_to_index[word] = i
        index_to_words[i] = word
        i = i + 1
    print("completed")
    return words_to_index,index_to_words

In [12]:
words_to_index_dict,index_to_words_dict=create_dict(vector_vocab)

completed


In [13]:
print(len(words_to_index_dict),len(index_to_words_dict))

400004 400004


In [15]:
words_to_index_dict

{'': 0,
 '[UNK]': 1,
 '<SOS>': 2,
 '<EOS>': 3,
 '!': 4,
 '!!': 5,
 '!!!': 6,
 '!!!!': 7,
 '!!!!!': 8,
 '!?': 9,
 '!?!': 10,
 '"': 11,
 '#': 12,
 '##': 13,
 '###': 14,
 '#a': 15,
 '#aabccc': 16,
 '#b': 17,
 '#c': 18,
 '#cc': 19,
 '#ccc': 20,
 '#cccccc': 21,
 '#ccccff': 22,
 '#d': 23,
 '#daa': 24,
 '#dcdcdc': 25,
 '#e': 26,
 '#f': 27,
 '#faf': 28,
 '#ff': 29,
 '#ffffff': 30,
 '#m': 31,
 '#p': 32,
 '#s': 33,
 '#the': 34,
 '#ukqa': 35,
 '#ukqaqfqs': 36,
 '#ukqaqs': 37,
 '#ukqaqsqf': 38,
 '#ukqaqsqm': 39,
 '#ukqaqtqszbzszr': 40,
 '#ukqec': 41,
 '#ukqeqtqszb': 42,
 '$': 43,
 '%': 44,
 '&': 45,
 '&#8211;': 46,
 '&#8217;': 47,
 '&#8220;': 48,
 '&#8221;': 49,
 '&amp;': 50,
 "'": 51,
 "''": 52,
 "'00": 53,
 "'01": 54,
 "'02": 55,
 "'03": 56,
 "'04": 57,
 "'05": 58,
 "'06": 59,
 "'07": 60,
 "'08": 61,
 "'09": 62,
 "'10": 63,
 "'11": 64,
 "'12": 65,
 "'13": 66,
 "'14": 67,
 "'15": 68,
 "'20": 69,
 "'20s": 70,
 "'25": 71,
 "'27": 72,
 "'28": 73,
 "'29": 74,
 "'30": 75,
 "'30s": 76,
 "'32": 77,
 "'3

In [16]:
index_to_words_dict

{0: '',
 1: '[UNK]',
 2: '<SOS>',
 3: '<EOS>',
 4: '!',
 5: '!!',
 6: '!!!',
 7: '!!!!',
 8: '!!!!!',
 9: '!?',
 10: '!?!',
 11: '"',
 12: '#',
 13: '##',
 14: '###',
 15: '#a',
 16: '#aabccc',
 17: '#b',
 18: '#c',
 19: '#cc',
 20: '#ccc',
 21: '#cccccc',
 22: '#ccccff',
 23: '#d',
 24: '#daa',
 25: '#dcdcdc',
 26: '#e',
 27: '#f',
 28: '#faf',
 29: '#ff',
 30: '#ffffff',
 31: '#m',
 32: '#p',
 33: '#s',
 34: '#the',
 35: '#ukqa',
 36: '#ukqaqfqs',
 37: '#ukqaqs',
 38: '#ukqaqsqf',
 39: '#ukqaqsqm',
 40: '#ukqaqtqszbzszr',
 41: '#ukqec',
 42: '#ukqeqtqszb',
 43: '$',
 44: '%',
 45: '&',
 46: '&#8211;',
 47: '&#8217;',
 48: '&#8220;',
 49: '&#8221;',
 50: '&amp;',
 51: "'",
 52: "''",
 53: "'00",
 54: "'01",
 55: "'02",
 56: "'03",
 57: "'04",
 58: "'05",
 59: "'06",
 60: "'07",
 61: "'08",
 62: "'09",
 63: "'10",
 64: "'11",
 65: "'12",
 66: "'13",
 67: "'14",
 68: "'15",
 69: "'20",
 70: "'20s",
 71: "'25",
 72: "'27",
 73: "'28",
 74: "'29",
 75: "'30",
 76: "'30s",
 77: "'32",
 78:

In [20]:
_=0
for x in word_to_vec_dict.keys():
    print(f'{x} :{word_to_vec_dict[x]}')
    _+=1
    if _==5:
        break

the :[ 4.6560e-02  2.1318e-01 -7.4364e-03 -4.5854e-01 -3.5639e-02  2.3643e-01
 -2.8836e-01  2.1521e-01 -1.3486e-01 -1.6413e+00 -2.6091e-01  3.2434e-02
  5.6621e-02 -4.3296e-02 -2.1672e-02  2.2476e-01 -7.5129e-02 -6.7018e-02
 -1.4247e-01  3.8825e-02 -1.8951e-01  2.9977e-01  3.9305e-01  1.7887e-01
 -1.7343e-01 -2.1178e-01  2.3617e-01 -6.3681e-02 -4.2318e-01 -1.1661e-01
  9.3754e-02  1.7296e-01 -3.3073e-01  4.9112e-01 -6.8995e-01 -9.2462e-02
  2.4742e-01 -1.7991e-01  9.7908e-02  8.3118e-02  1.5299e-01 -2.7276e-01
 -3.8934e-02  5.4453e-01  5.3737e-01  2.9105e-01 -7.3514e-03  4.7880e-02
 -4.0760e-01 -2.6759e-02  1.7919e-01  1.0977e-02 -1.0963e-01 -2.6395e-01
  7.3990e-02  2.6236e-01 -1.5080e-01  3.4623e-01  2.5758e-01  1.1971e-01
 -3.7135e-02 -7.1593e-02  4.3898e-01 -4.0764e-02  1.6425e-02 -4.4640e-01
  1.7197e-01  4.6246e-02  5.8639e-02  4.1499e-02  5.3948e-01  5.2495e-01
  1.1361e-01 -4.8315e-02 -3.6385e-01  1.8704e-01  9.2761e-02 -1.1129e-01
 -4.2085e-01  1.3992e-01 -3.9338e-01 -6.7945e-

In [21]:
def save_to_disk(obj,name):
    import pickle

    # Save to disk
    with open(f"./Vocabulary/{name}.pkl", "wb") as f:
        pickle.dump(obj, f)
        print(f'{name}.pkl created at ./Vocabulary/{name}.pkl')

In [23]:
save_to_disk(word_to_vec_dict,'word_to_vec_dict')
save_to_disk(vocab,'vocab')
save_to_disk(words_to_index_dict,'words_to_index_dict')
save_to_disk(index_to_words_dict,'index_to_words_dict')

word_to_vec_dict.pkl created at ./Vocabulary/word_to_vec_dict.pkl
vocab.pkl created at ./Vocabulary/vocab.pkl
words_to_index_dict.pkl created at ./Vocabulary/words_to_index_dict.pkl
index_to_words_dict.pkl created at ./Vocabulary/index_to_words_dict.pkl


In [24]:
save_to_disk(vector_vocab,'vector_vocab')

vector_vocab.pkl created at ./Vocabulary/vector_vocab.pkl
