# First imports

In [None]:
# Install the condacolab package quietly (-q) with pip, run as a shell command.
!pip install -q condacolab
import condacolab
# Run condacolab's installer; per the inline note, the kernel is expected to restart
# after this call.
condacolab.install() # expect a kernel restart

✨🍰✨ Everything looks OK!


In [None]:

from tqdm import tqdm
import numpy as np
from openai import OpenAI
from google.colab import userdata

# Resource check

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to tell a high-RAM runtime apart.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


# Install faiss

In [None]:
!conda install -c pytorch -c nvidia faiss-gpu=1.9.0

Channels:
 - pytorch
 - nvidia
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - faiss-gpu=1.9.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _openmp_mutex-4.5          |       3_kmp_llvm           7 KB  conda-forge
    ca-certificates-2025.1.31  |       hbcca054_0         154 KB  conda-forge
    certifi-2025.1.31          |     pyhd8ed1ab_0         159 KB  conda-forge
    conda-24.11.3              |  py311h38be061_0         1.1 MB  conda-forge
    cuda-cudart-11.8.89        |                0         197 KB  nvidia
    faiss-gpu-1.9.0            |py3.11_h1468078_0_cuda11.4.4         5.4 MB  pytorch
    libblas-3.9

In [None]:
import faiss

In [None]:
# generate a sample of 800 words:

# test_words = []
# test_indices = np.random.randint(0, 10000, 800)
# with open("/content/drive/MyDrive/wordlist.10000.txt", 'r') as f:
#   all_words = f.readlines()
#   for idx in test_indices:
#     test_words.append(all_words[idx].strip())

# # save the test words to a file
# with open("/content/drive/MyDrive/test_words", 'w') as f:
#   f.write(",".join(test_words))

# print(test_words)


# Load Test Words

In [None]:
# Load the test words: a single line of comma-separated words.
with open("/content/drive/MyDrive/test_words", 'r', encoding="utf-8") as f:
  # readline() returns '' for an empty file instead of raising IndexError.
  all_words = f.readline().split(",")

# The file is written with ",".join(...), so there is normally no trailing empty
# entry. Slicing off the last element would drop a real word. Strip whitespace
# and discard only entries that are actually empty; this also copes with a
# file that does end in a comma.
test_words = [w.strip() for w in all_words if w.strip()]

print(test_words)

TypeError: a bytes-like object is required, not 'str'

# Test OpenAI and definitions - results were bad

In [None]:

# Embedding model and the width of the vectors stored for it in this notebook.
model = "text-embedding-3-large"
dimensions = 3072

# API client; the key is read from the Colab secrets store.
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

# One row per test word, filled in by the embedding loop.
words = test_words
embeddings = np.zeros(shape=(len(words), dimensions))


def get_embedding(text, model=model):
   """Request an embedding for a single piece of text.

   Args:
     text: The string to embed; it is sent as a one-element input list.
     model: Name of the embedding model (defaults to the module-level ``model``
       as it was when this function was defined).

   Returns:
     The ``embedding`` field of the first item in the API response.
   """
   response = client.embeddings.create(input=[text], model=model)
   first_item = response.data[0]
   return first_item.embedding


In [None]:
# Embed every word, one API request per word, writing each vector into its row.
# Iterating over tqdm(words) lets tqdm count items itself (no fractional manual
# updates), and the `with` block closes the bar even if a request raises midway.
with tqdm(words, desc="embedding words") as pbar:
  for idx, word in enumerate(pbar):
    embeddings[idx] = get_embedding(word)

In [None]:
# save the test embeddings
# Writes the `embeddings` array to a .npy file on the Drive path below.
# NOTE(review): assumes Google Drive is mounted at /content/drive and that the
# Data/ folder already exists - confirm before running.

np.save("/content/drive/MyDrive/Data/test_embeddings.npy", embeddings)


# Utils etc

In [None]:


# Build HNSW index
def build_hnsw_index(embeddings, dimensions, M=32, efConstruction=100):
  """Build a FAISS HNSW (flat storage) index over ``embeddings``.

  Args:
    embeddings: Vectors to add to the index; passed straight to ``index.add``.
    dimensions: Dimensionality the index is created with.
    M: HNSW connectivity parameter passed to ``faiss.IndexHNSWFlat``.
    efConstruction: Value assigned to ``index.hnsw.efConstruction``; higher
      values lead to better accuracy.

  Returns:
    The populated ``faiss.IndexHNSWFlat`` index.
  """
  # Honour the caller's tuning parameters. They used to be ignored in favour of
  # hard-coded 32 / 100, which are still the defaults.
  index = faiss.IndexHNSWFlat(dimensions, M)
  index.hnsw.efConstruction = efConstruction
  index.add(embeddings)
  return index



ModuleNotFoundError: No module named 'faiss'

In [None]:
# Map each word to its row position in `words`; for a repeated word the last
# position wins.
test_word_to_id = dict(zip(words, range(len(words))))

In [None]:
def get_neighbour_words(query_word, k):
  """Look up the ``k`` nearest neighbours of ``query_word`` in the global ``index``.

  Args:
    query_word: A word present in ``test_word_to_id``.
    k: Number of neighbours requested from ``index.search``.

  Returns:
    The neighbouring words in the order returned by the index. The list can be
    shorter than ``k`` when the index finds fewer neighbours.

  Raises:
    KeyError: If ``query_word`` is not in ``test_word_to_id``.
  """
  # [None, :] turns the single vector into a batch of one query row.
  query_vector = embeddings[test_word_to_id[query_word]][None,:]
  distances, indices = index.search(query_vector, k)
  print(f"distances : {distances}")
  print(f"indices : {indices}")

  # FAISS pads missing results with label -1. Without this filter, words[-1]
  # would silently return the last word of the list as a bogus neighbour.
  return [words[i] for i in indices[0] if i >= 0]

def get_distance_between_words(word_1, word_2 ,similarity="cosine"):
  """Compute the similarity between the stored embeddings of two words.

  Args:
    word_1: First word; must be a key of ``test_word_to_id``.
    word_2: Second word; must be a key of ``test_word_to_id``.
    similarity: Name of the measure. Only ``"cosine"`` is implemented.

  Returns:
    The cosine similarity of the two embedding vectors (dot product divided by
    the product of their norms).

  Raises:
    ValueError: If ``similarity`` is not ``"cosine"``.
    KeyError: If either word is not in ``test_word_to_id``.
  """
  # Fail loudly on an unsupported measure instead of silently returning None.
  if similarity != "cosine":
    raise ValueError(f"Unsupported similarity {similarity!r}; only 'cosine' is implemented")

  vector_1 = embeddings[test_word_to_id[word_1]]
  vector_2 = embeddings[test_word_to_id[word_2]]
  return np.dot(vector_1, vector_2) / (np.linalg.norm(vector_1) * np.linalg.norm(vector_2))







In [None]:
import zipfile
import pandas as pd
from io import TextIOWrapper


def extract_and_load(file_path, number_of_rows):
  """Read the leading rows of a CSV file into a DataFrame.

  Args:
    file_path: Path of the CSV file; it is opened in binary mode.
    number_of_rows: Maximum number of data rows to read (``nrows`` of
      ``pd.read_csv``).

  Returns:
    A ``pandas.DataFrame`` with at most ``number_of_rows`` rows.
  """
  handle = open(file_path, 'rb')
  try:
    return pd.read_csv(handle, nrows=number_of_rows)
  finally:
    # Always release the file handle, whether or not parsing succeeded.
    handle.close()




def extract_and_load_zipped(zip_path, number_of_rows):
    """Read the leading rows of the first file inside a ZIP archive as CSV.

    Args:
        zip_path: Path of the ZIP archive.
        number_of_rows: Maximum number of data rows to read (``nrows`` of
            ``pd.read_csv``).

    Returns:
        A ``pandas.DataFrame`` with at most ``number_of_rows`` rows.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # The archive is assumed to hold a single CSV; take the first entry listed.
        first_member = archive.namelist()[0]

        with archive.open(first_member) as raw_stream:
            # Wrap the binary member stream so pandas receives text.
            text_stream = TextIOWrapper(raw_stream)
            return pd.read_csv(text_stream, nrows=number_of_rows)


In [None]:
print(test_words)

In [None]:
result_words = get_neighbour_words("payday", 10)
print(result_words)

In [None]:
print(get_distance_between_words("payday", "craps"))

In [None]:
# Exact (brute-force) cosine-similarity neighbours of one query word.
k = 10
query_word = "policy"

query_idx = test_word_to_id[query_word]
query_vector = embeddings[query_idx]

# Cosine similarity of every stored embedding against the query vector.
similarities = np.dot(embeddings, query_vector) / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vector))

# argsort is ascending, so reverse it to get the most similar rows first.
sorted_indices = np.argsort(similarities)[::-1]

# Exclude the query's own row explicitly rather than assuming it ranks first,
# then keep the top k.
neighbour_indices = [int(i) for i in sorted_indices if i != query_idx][:k]

# test_words is a plain Python list, so it cannot be indexed with a NumPy index
# array (that raises TypeError); look the words up one index at a time.
neighbour_words = [test_words[i] for i in neighbour_indices]

print(neighbour_words)


# Test GloVe embeddings

### Load some example words

In [None]:
from sys import set_coroutine_origin_tracking_depth
import numpy as np
# Number of leading GloVe entries to load.
number_of_rows = 10000

# Maps each word to its 0-based line position in the GloVe file.
word_to_idx = {}

with open("/content/drive/MyDrive/Data/glove.6B.300d.txt", 'r', encoding="utf-8") as f:
  # zip(range(...), f) stops at whichever ends first, so a file shorter than
  # number_of_rows no longer raises IndexError from splitting an empty readline().
  for i, line in zip(range(number_of_rows), f):
    # Fields are separated by single spaces and the word is the first field.
    # Splitting on a literal space avoids discarding tokens that are themselves
    # exotic whitespace characters.
    word_to_idx[line.split(" ", 1)[0]] = i








In [None]:
# Load the GloVe vectors for the same leading rows that word_to_idx covers, so
# that row i of `eb` is the vector of the word with index i.
with open("/content/drive/MyDrive/Data/glove.6B.300d.txt", 'r', encoding="utf-8") as f:
  eb = np.loadtxt(
    f,
    delimiter=' ',
    comments=None,             # '#' can be a vocabulary token, not a comment marker
    skiprows=0,                # GloVe text files have no header line to skip
    usecols=range(1, 301),     # column 0 is the word; columns 1..300 are the 300 dims
    max_rows=number_of_rows,   # match the size of the vocabulary mapping
    dtype=np.float32,          # FAISS works on float32 vectors
  )

### Build an index

In [None]:
# Build HNSW index over the GloVe matrix `eb`.
# The index width must come from the data being added. The global `dimensions`
# describes the OpenAI embeddings (3072) and does not match the GloVe vectors.
# NOTE(review): this rebinds the global `index` that get_neighbour_words reads;
# confirm that helper is not expected to keep searching the OpenAI embeddings.
index = faiss.IndexHNSWFlat(eb.shape[1], 32)  # M is a tunable parameter
index.hnsw.efConstruction = 100  # Higher efConstruction leads to better accuracy
# FAISS expects contiguous float32 input.
index.add(np.ascontiguousarray(eb, dtype=np.float32))



NameError: name 'faiss' is not defined

# Test

# Test conceptnet-numberbatch