# Building the Lakehouse

Sentence Transformers

Reference (Sentence Transformers pretrained MS MARCO models): https://www.sbert.net/docs/pretrained-models/msmarco-v3.html


In [23]:
import faiss  # Make sure to install FAISS
import numpy as np
import pickle

import openai
import torch

import os
from dotenv import load_dotenv

# sentence transformers
from sentence_transformers import SentenceTransformer, util
# Local sentence-transformers encoder; this is the `model` that `embed` calls.
model = SentenceTransformer('msmarco-distilbert-base-tas-b')

# OpenAI credentials: load variables from the local .env file, then hand the
# key (None when the variable is not set) to the openai client.
load_dotenv('.env')
openai.api_key = os.getenv('OPENAI_API_KEY')

In [24]:
# Dimensionality the FAISS index is created with. It has to equal the length
# of the vectors returned by get_embedding, since those are what get added to
# (and searched against) the index.
VEC_DIM = 1536

In [25]:
def get_embedding(text):
    """Return the OpenAI ``text-embedding-ada-002`` embedding of `text` as a torch tensor.

    Calls ``openai.Embedding.create`` with `text` as input and wraps the
    first embedding of the response in a tensor.
    """
    response = openai.Embedding.create(input=text, engine="text-embedding-ada-002")
    first_embedding = response['data'][0]['embedding']
    return torch.as_tensor(first_embedding)

def embed(text):
    """Encode `text` with the module-level SentenceTransformer `model` and return the result as a torch tensor."""
    encoded = model.encode(text)
    return torch.as_tensor(encoded)

In [27]:
# Build the FAISS index only when no cached copy exists on disk, so re-running
# the notebook does not repeat the embedding calls.
if not os.path.isfile("faiss_index.pkl"):
    # NOTE(review): the backslash continuations sit *inside* the string
    # literals, so every description also contains the next line's leading
    # indentation as a run of spaces. The text is left unchanged here so the
    # embeddings stay identical to any index that was already cached.
    string_list = ["This course serves as an introductory overview of the rapidly evolving field of machine learning. \
                    Students will acquire a foundational understanding of the various algorithms, techniques, and applications \
                    that make up this discipline. Topics covered include supervised learning, unsupervised learning, \
                    neural networks, decision trees, and support vector machines. Hands-on labs and assignments will provide \
                    practical experience with Python and machine learning libraries such as scikit-learn and TensorFlow.",

                "This course delves into the life, times, and literary contributions of William Shakespeare. Students will \
                    study a selection of his plays, sonnets, and other works to understand their thematic depth, historical \
                    context, and linguistic richness. Literary theories and critical analysis techniques will be used to dissect \
                    the texts.",

                "This course explores the economic aspects of environmental issues such as pollution, conservation, and \
                    sustainable development. Utilizing economic theories and models, students will analyze the market's impact on \
                    the environment and evaluate policy solutions. Topics include externalities, public goods, cost-benefit \
                    analysis, and environmental ethics.",

                "This course traces the development of modern art from the late 19th century to the contemporary era. \
                    Students will explore the various art movements, such as Impressionism, Cubism, and Abstract Expressionism, \
                    and examine their influence on culture and society. Field trips to art galleries and museums will \
                    complement classroom learning."]

    # One get_embedding call (and therefore one tensor) per description.
    vector_list = [get_embedding(s) for s in string_list]

    # FAISS works on a C-contiguous float32 matrix of shape (n, d). Stack the
    # tensors and convert explicitly instead of relying on np.array() to
    # coerce a list of torch tensors.
    vectors = np.ascontiguousarray(
        torch.stack(vector_list).to(dtype=torch.float32).cpu().numpy()
    )

    # Fail with a readable message if the embeddings do not match the index
    # dimensionality, rather than with an assertion deep inside FAISS.
    if vectors.ndim != 2 or vectors.shape[1] != VEC_DIM:
        raise ValueError(
            f"Expected embeddings of shape (n, {VEC_DIM}), got {vectors.shape}"
        )

    # Exact (brute-force) L2 index over the description embeddings.
    index = faiss.IndexFlatL2(VEC_DIM)

    index.add(vectors)

    # Persist with pickle so the loading cell, which reads this same file
    # with pickle.load, keeps working.
    with open("faiss_index.pkl", "wb") as f:
        pickle.dump(index, f)

    print("FAISS index created and saved.")


FAISS index created and saved.


In [28]:
# pickle and numpy are already imported in the notebook's first cell, so they
# are not re-imported here.

# Load FAISS index from a pickle file.
# NOTE: pickle.load can execute arbitrary code; only load an index file that
# this notebook wrote itself.
with open("faiss_index.pkl", "rb") as f:
    loaded_index = pickle.load(f)

print("FAISS index loaded.")

# Create a query vector (same dimensionality as original vectors), shaped as a
# single row. The query text is kept verbatim so the recorded output below
# stays reproducible.
query_vector = get_embedding("How can I improve the gross domestc product of my own country?").reshape(1, -1)

# FAISS searches a C-contiguous float32 NumPy matrix; convert the tensor
# explicitly instead of depending on the installed FAISS version to coerce it.
query_matrix = np.ascontiguousarray(
    query_vector.detach().cpu().numpy(), dtype=np.float32
)

# Perform a search on the loaded index.
# `search` returns two arrays: distances and indices, one row per query.
# 'k' is the number of nearest neighbours to return; it is capped at the
# number of stored vectors so the result holds no padding entries.
k = min(4, loaded_index.ntotal)
distances, indices = loaded_index.search(query_matrix, k)

print("Distances:", distances)
print("Indices:", indices)


FAISS index loaded.
Distances: [[0.53524816 0.59832144 0.6324303  0.6413039 ]]
Indices: [[2 3 0 1]]
