In [1]:
import pickle
# Load the pre-extracted transcript fragments. Later cells index this object
# by position and read 'Title', 'Text' and 'TimeStamp' from each entry.
# SECURITY: pickle.load can execute arbitrary code -- only load files you trust.
with open('transcripts_timestamps.pkl', 'rb') as infile:
    transcripts = pickle.load(infile)

In [2]:
def combine_text_fragments(documents, chunk_size=3):
    """Merge consecutive transcript fragments into larger chunks.

    Fragments are grouped purely by position: items 0..chunk_size-1 form the
    first chunk, the next ``chunk_size`` items the second, and so on. Each
    chunk takes its 'Title' and 'TimeStamp' from its first fragment, even if
    later fragments in the same chunk carry a different 'Title'.

    Args:
        documents: Iterable of dicts with 'Title', 'Text' and 'TimeStamp' keys.
            May be empty.
        chunk_size: Number of consecutive fragments merged into one chunk.

    Returns:
        A list of new dicts with 'Title', 'Text' and 'TimeStamp' keys. 'Text'
        is the fragments' texts, each prefixed with a single space (so the
        combined text starts with a space). Returns [] for empty input.
    """
    combined_docs = []
    # No chunk is open until the first fragment is seen; this avoids indexing
    # documents[0], which raised IndexError on empty input.
    current_chunk = None

    for i, doc in enumerate(documents):
        if i % chunk_size == 0:
            # Chunk boundary: flush the previous chunk (if any) and open a new
            # one seeded with this fragment's title and timestamp.
            if current_chunk is not None:
                combined_docs.append(current_chunk)
            current_chunk = {"Title": doc['Title'], "Text": "", "TimeStamp": doc['TimeStamp']}

        current_chunk['Text'] += f" {doc['Text']}"

    # Flush the trailing (possibly shorter) chunk.
    if current_chunk is not None:
        combined_docs.append(current_chunk)
    return combined_docs

# Example usage
# `documents` is just another name for the loaded transcripts (same object);
# the default chunk_size=3 merges every three consecutive fragments.
documents = transcripts
combined_documents = combine_text_fragments(documents)

In [3]:
# Number of combined chunks; with the default chunk_size=3 this should equal
# ceil(len(transcripts) / 3).
len(combined_documents)

4762

In [4]:
# Number of raw transcript fragments before combining.
len(transcripts)

14284

In [5]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the documents and the search queries.
# NOTE: the Elasticsearch mapping in this notebook hard-codes dims=384, so
# switching models means updating that mapping to the new embedding size.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Or any suitable model

def generate_embeddings(docs, encoder=None):
    """Attach an embedding of 'Text' to every document, encoding in one batch.

    Args:
        docs: Sequence of dicts, each with a 'Text' key. The dicts are mutated
            in place: an 'embedding' key is added (or overwritten).
        encoder: Optional object exposing ``encode(list_of_texts)`` that
            returns one vector per text, in input order. Defaults to the
            notebook-level ``model``.

    Returns:
        The same ``docs`` object that was passed in.
    """
    if encoder is None:
        encoder = model

    texts = [doc['Text'] for doc in docs]
    if not texts:
        # Nothing to encode; skip calling the encoder on an empty batch.
        return docs

    # One batched call instead of one encode() per document, so the encoder
    # can batch the work internally rather than being invoked thousands of times.
    vectors = encoder.encode(texts)
    for doc, vector in zip(docs, vectors):
        doc['embedding'] = vector
    return docs

# generate_embeddings adds an 'embedding' key to each dict in place and returns
# the same list, so this rebinding keeps pointing at the same object.
combined_documents = generate_embeddings(combined_documents)


  from tqdm.autonotebook import tqdm, trange
2024-08-25 12:16:31.801941: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-25 12:16:31.927623: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-25 12:16:31.928261: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-25 12:16:32.116506: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  retur

In [17]:
# Sanity check: the embedding length must match the `dims` value declared for
# the 'embedding' field in the Elasticsearch mapping (384).
len(combined_documents[0]['embedding'])

384

In [18]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

index_name = "transcripts"

# Drop any existing index so the mapping below is applied from scratch
# (caution: this deletes every document in the index). A missing index (404)
# is tolerated. Per-request `.options(ignore_status=...)` replaces the
# deprecated `ignore=` keyword, which triggers a warning on this client.
es.options(ignore_status=[404]).indices.delete(index=index_name)

# Explicit mapping: full-text fields with a keyword sub-field, a numeric
# timestamp, and a dense vector holding the sentence embedding.
mapping = {
    "mappings": {
        "properties": {
            "Text": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "TimeStamp": {"type": "float"},
            "Title": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "embedding": {
                "type": "dense_vector",
                "dims": 384  # Must equal the embedding length produced by the model (checked above: 384)
            }
        }
    }
}

# Create the index with the mapping above; as the cell's last expression the
# API response is displayed.
es.indices.create(index=index_name, body=mapping)


  es.indices.delete(index=index_name, ignore=[404])


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'transcripts'})

In [20]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def index_documents(docs, index_name="transcripts"):
    """Bulk-index every document in ``docs`` into an Elasticsearch index.

    Opens a client against http://localhost:9200, wraps each document as the
    ``_source`` of one bulk action targeting ``index_name``, and submits all
    actions through ``elasticsearch.helpers.bulk``.

    Args:
        docs: Iterable of documents; each is used unchanged as ``_source``.
        index_name: Name of the target index.

    Returns:
        None; the value returned by ``bulk`` is discarded.
    """
    client = Elasticsearch("http://localhost:9200")

    # Build the full action list up front, one action per document.
    bulk_actions = []
    for document in docs:
        bulk_actions.append({"_index": index_name, "_source": document})

    bulk(client, bulk_actions)

# Index all combined chunks (including their embeddings) into "transcripts".
# NOTE(review): the bulk actions carry no explicit `_id`, so re-running this
# cell presumably indexes duplicates unless the index is recreated first -- confirm.
index_documents(combined_documents)

In [19]:
# Check index: fetch and print the live mapping to confirm the field types
# (notably that 'embedding' is a dense_vector with the expected dims).
es = Elasticsearch("http://localhost:9200")
index_mapping = es.indices.get_mapping(index="transcripts")
print(index_mapping)

{'transcripts': {'mappings': {'properties': {'Text': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'TimeStamp': {'type': 'float'}, 'Title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'embedding': {'type': 'dense_vector', 'dims': 384}}}}}


In [37]:
import numpy as np

def retrieve_documents_question(query, index_name="transcripts", max_results=5):
    """Hybrid (keyword + embedding) search over the transcript index.

    The score of a hit is the sum of two ``should`` clauses:

    * a ``multi_match`` of ``query`` against ``Title`` and ``Text``
      (``Text`` boosted by 3), and
    * a ``script_score`` equal to the cosine similarity between the query
      embedding and the stored ``embedding`` field, plus 1.0 (the shift keeps
      the script score non-negative, which Elasticsearch requires).

    Because the ``script_score`` wraps ``match_all``, every document matches;
    the keyword clause only raises the score of documents that also match
    lexically.

    NOTE: this notebook defines a second function with the same name
    (keyword-only search). Whichever definition was executed last is the one
    that gets called.

    Args:
        query: Free-text query string; it is embedded with the notebook-level
            ``model``.
        index_name: Index to search.
        max_results: Maximum number of hits to return.

    Returns:
        A list of ``_source`` dicts restricted to 'Title', 'Text' and
        'TimeStamp'. Returns [] (after printing the error) if the search fails.
    """
    es = Elasticsearch("http://localhost:9200")

    # Generate the query embedding. encode() returns an array-like (a NumPy
    # array by default in sentence-transformers); convert it to a plain list
    # so the request body is JSON-serializable on its own, without relying on
    # the client's implicit NumPy handling.
    query_embedding = model.encode(query)
    if hasattr(query_embedding, "tolist"):
        query_embedding = query_embedding.tolist()

    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "should": [
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["Title", "Text^3"],
                            "type": "most_fields"
                        }
                    },
                    {
                        "script_score": {
                            "query": {"match_all": {}},
                            "script": {
                                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                                "params": {"query_vector": query_embedding}
                            }
                        }
                    }
                ]
            }
        },
        # Leave the (large) embedding vector out of the returned documents.
        "_source": ["Title", "Text", "TimeStamp"],
    }

    try:
        response = es.search(index=index_name, body=search_query)
        result_docs = [hit['_source'] for hit in response['hits']['hits']]
        return result_docs
    except Exception as e:
        # Deliberate best-effort: report the failure and return no results so
        # the notebook keeps running.
        print(f"Error retrieving documents: {e}")
        return []

# Example usage
query = "introduction to LLM"
# No separate embedding step is needed: the hybrid retrieve_documents_question
# defined above encodes the raw query string itself.
results = retrieve_documents_question(query)
for result in results:
    print(result)


{'Title': 'llm zoomcamp 4.1 - introduction to monitoring answer quality', 'Text': " but for today we're just going to do an extended introduction like what are the topics that we are going to cover and", 'TimeStamp': 82.84}
{'Title': 'llm zoomcamp 3.2 - semantic search with elasticsearc', 'Text': " unit one introduction section there was a docker command that was provided right here i'm going to", 'TimeStamp': 52.44}
{'Title': 'llm zoomcamp 4.1 - introduction to monitoring answer quality', 'Text': " ask the llm like we're going to write a prompt where we ask the llm like look this is the llm answer that i received", 'TimeStamp': 485.12}
{'Title': 'llm zoomcamp 1.1 - introduction to llm and rag', 'Text': ' too late to join the course the llm has no idea but now these documents they provide the context for llm to figure', 'TimeStamp': 847.959}
{'Title': 'llm zoomcamp 1.1 - introduction to llm and rag', 'Text': ' prompt to the llm so llm receives the prompt it has the question it has the 

In [38]:
# Example usage
query = "Streamlit"
# The raw query string is passed straight in. This assumes the hybrid
# retrieve_documents_question (which embeds the query itself) is the
# currently active definition -- two functions in this notebook share that name.
results = retrieve_documents_question(query)
for result in results:
    print(result)

{'Title': 'llm zoomcamp 2.9 - creating a streamlit ui', 'Text': " so right now let's take all the code we have here and put it to our stream l application so we will start", 'TimeStamp': 240.439}
{'Title': 'llm zoomcamp 2.9 - creating a streamlit ui', 'Text': ' are many many things you can do stream lead is the simplest one but in your project you can explore any anything you', 'TimeStamp': 412.84}
{'Title': 'llm zoomcamp 2.9 - creating a streamlit ui', 'Text': ' uh i want you to create a stream lead application', 'TimeStamp': 60.44}
{'Title': 'llm zoomcamp 2.9 - creating a streamlit ui', 'Text': " um yeah let's um let's run it how do we run it stream l run app okay", 'TimeStamp': 170.36}
{'Title': 'llm zoomcamp 2.9 - creating a streamlit ui', 'Text': ' creating um the output so i will do peep install stream l stream l is a simp', 'TimeStamp': 139.28}


In [33]:
from elasticsearch import Elasticsearch, exceptions

def retrieve_documents_question(query, index_name="transcripts", max_results=28):
    """Keyword-only search over the transcript index.

    Runs a ``multi_match`` (type ``most_fields``) of ``query`` against
    ``Title`` and ``Text``, with ``Text`` boosted by 4.

    Args:
        query: Free-text query string.
        index_name: Index to search.
        max_results: Maximum number of hits to return.

    Returns:
        A list with the full ``_source`` of every hit (no field filtering).
        Returns [] after printing a message if the request is rejected, the
        index does not exist, or any other exception is raised.
    """
    es = Elasticsearch("http://localhost:9200")

    text_clause = {
        "query": query,
        "fields": ["Title", "Text^4"],
        "type": "most_fields"
    }
    search_query = {"size": max_results, "query": {"multi_match": text_clause}}

    # Stays empty unless the search and the hit extraction both succeed.
    result_docs = []
    try:
        response = es.search(index=index_name, body=search_query)
        result_docs = [hit['_source'] for hit in response['hits']['hits']]
    except exceptions.BadRequestError as e:
        print(f"Bad request error: {e.info}")
    except exceptions.NotFoundError:
        print(f"Index {index_name} not found.")
    except Exception as e:
        print(f"Error retrieving documents: {e}")
    return result_docs

# Test the function
# Smoke test of the keyword-only search defined above.
# NOTE: that function applies no `_source` filter, so every printed hit
# includes its full embedding vector.
query = "introduction to LLM"
results = retrieve_documents_question(query)
for result in results:
    print(result)


{'Title': 'llm zoomcamp 4.1 - introduction to monitoring answer quality', 'Text': " but for today we're just going to do an extended introduction like what are the topics that we are going to cover and", 'TimeStamp': 82.84, 'embedding': [-0.021802669391036034, -0.007218490354716778, 0.07080980390310287, -0.0048026349395513535, 0.07323630154132843, 0.058828916400671005, -0.03014850616455078, 0.0054107247851789, -0.03981545567512512, -0.015639424324035645, -0.0652032196521759, 0.11812260746955872, -0.11990747600793839, -0.02471958100795746, 0.0773131251335144, -0.08549810945987701, 0.02012459561228752, -0.08097733557224274, -0.0004959927755407989, 0.03733063116669655, 0.036773040890693665, 0.04368416965007782, 0.03807557746767998, -0.016855977475643158, -0.08437167853116989, 0.014866476878523827, 0.016889361664652824, -0.009174386039376259, 0.06397150456905365, 0.052782390266656876, -0.028715765103697777, 0.07357577234506607, 0.0350617989897728, 0.011864016763865948, -0.03749020025134086