In [1]:
import json
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

  from tqdm.autonotebook import tqdm, trange
2024-08-28 18:36:29.303097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-28 18:36:29.440555: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-28 18:36:29.441182: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-28 18:36:29.642326: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
with open('enriched.json') as f:
    data = json.load(f)


In [4]:
for rec in data:
    rec['search_field'] = rec['Title'].split('-')[-1] + rec['enriched_content']

In [5]:
for rec in data:
    rec['search_vector'] = model.encode(rec['search_field'])

In [7]:
len(data[0]['search_vector'])

384

In [8]:
data[0].keys()

dict_keys(['Title', 'Text', 'Start_TimeStamp', 'End_TimeStamp', 'link', 'enriched_content', 'search_field', 'search_vector'])

In [9]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Title": {"type": "text"},
            "Text": {"type": "text"},
            "Start_TimeStamp": {"type": "text"},
            "End_TimeStamp": {"type": "keyword"} ,
            "link": {"type": "keyword"} ,
            "enriched_content": {"type": "keyword"} ,
            "search_field": {"type": "keyword"} ,
            "search_vector": {"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"},
        }
    }
}

In [10]:
index_name = "video-content"

es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'video-content'})

In [11]:
for doc in data:
    try:
        es.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [12]:

search_term = "When did we first start talking about orchestration?"
vector_search_term = model.encode(search_term)

In [15]:
query = {
    "field": "search_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}
res = es.search(index=index_name, knn=query, source=["link"])
res["hits"]["hits"]

[{'_index': 'video-content',
  '_id': 'u_Bcm5EByv1_kUiazi-K',
  '_score': 0.66213006,
  '_source': {'link': 'https://www.youtube.com/watch?v=nuk7_soKMUA&t=126s'}},
 {'_index': 'video-content',
  '_id': 'oPBcm5EByv1_kUiazi8R',
  '_score': 0.6594509,
  '_source': {'link': 'https://www.youtube.com/watch?v=gP2ZOsG9Umg&t=1s'}},
 {'_index': 'video-content',
  '_id': 'uvBcm5EByv1_kUiazi-E',
  '_score': 0.6547475,
  '_source': {'link': 'https://www.youtube.com/watch?v=nuk7_soKMUA&t=94s'}},
 {'_index': 'video-content',
  '_id': 'r_Bcm5EByv1_kUiazi9S',
  '_score': 0.64250517,
  '_source': {'link': 'https://www.youtube.com/watch?v=8wrArv0DEKc&t=1s'}},
 {'_index': 'video-content',
  '_id': 'ofBcm5EByv1_kUiazi8X',
  '_score': 0.6405059,
  '_source': {'link': 'https://www.youtube.com/watch?v=gP2ZOsG9Umg&t=33s'}}]

In [None]:
{'Title': 'llm zoomcamp 1.1 - introduction to llm and rag',
  'Text': "hi everyone welcome to our course this is our first module for first unit so in this course the course is called llm zoom camp in this course we will learn about practical applications of llm and in particular we will focus our attention on rack retrieval a generation i'll shortly talk about these variations what they mean um and what we exactly will do and i want to start first with explaining the problem we are going to",
  'Start_TimeStamp': 0.719,
  'End_TimeStamp': 30.039,
  'link': 'https://www.youtube.com/watch?v=Q75JgLEXMsM&t=0s',
  'enriched_content': "hi everyone welcome to our course this is our first module for first unit so in this course the course is called llm zoom camp in this course we will learn about practical applications of llm and in particular we will focus our attention on rack retrieval a generation i'll shortly talk about these variations what they mean um and what we exactly will do and i want to start first with explaining the problem we are going to use uh to solve throughout the course um so this will be our running problem and in our community in data do club we have multiple courses so this llm zoom camp is our fifth course and usually in our courses we have frequently asked questions so there are questions that uh there are no answers in the videos or answers are not uh easy to find and we have these documents"},
 