In [1]:
import io
import requests
import docx

In [44]:
def clean_line(line):
    """Return ``line`` without surrounding whitespace or U+FEFF characters.

    ``str.strip()`` does not consider U+FEFF (byte-order mark) to be
    whitespace, so one whitespace pass followed by one BOM pass can leave
    whitespace behind (``'\\ufeff hello'`` would become ``' hello'``).
    Both kinds of character are therefore stripped alternately until the
    string stops changing.

    Args:
        line: Raw text of one paragraph.

    Returns:
        The cleaned string; empty if ``line`` held nothing but whitespace
        and/or U+FEFF characters. Characters inside the text are untouched.
    """
    previous = None
    while line != previous:
        previous = line
        line = line.strip().strip('\uFEFF')
    return line

def _append_question(questions, section_title, question_title, answer_text):
    """Append one FAQ record to ``questions`` unless any part of it is empty."""
    answer_text = answer_text.strip()
    if answer_text != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text,
            'section': section_title,
            'question': question_title,
        })


def read_faq(file_id, timeout=30):
    """Download a Google Doc as .docx and split it into FAQ records.

    Paragraphs styled "Heading 1" start a new section, paragraphs styled
    "Heading 2" start a new question, and every other non-empty paragraph is
    appended to the answer of the current question.

    Args:
        file_id: ID of the Google Doc, interpolated into the export URL.
        timeout: Seconds passed to ``requests.get`` so a stalled download
            cannot hang the notebook indefinitely.

    Returns:
        A list of dicts with the keys ``'text'`` (answer), ``'section'`` and
        ``'question'``, in document order. Questions without an answer, or
        appearing before the first section heading, are skipped.

    Raises:
        requests.HTTPError: If the export request returns an error status.
    """
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'

    section_title = ''
    question_title = ''
    answer_text_so_far = ''

    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        if len(p_text) == 0:
            continue

        if style == section_heading_style:
            # Close the pending question *before* switching sections, so it is
            # filed under the section it appeared in rather than the next one.
            _append_question(questions, section_title, question_title, answer_text_so_far)
            section_title = p_text
            # Text between a section heading and its first question belongs to
            # no question; clearing the title makes the next flush discard it.
            question_title = ''
            answer_text_so_far = ''
            continue

        if style == question_heading_style:
            _append_question(questions, section_title, question_title, answer_text_so_far)
            question_title = p_text
            # Reset unconditionally: text that could not be stored (e.g. a
            # preamble before the first question) must not leak into this answer.
            answer_text_so_far = ''
            continue

        answer_text_so_far += '\n' + p_text

    # The last question has no following heading to trigger a flush.
    _append_question(questions, section_title, question_title, answer_text_so_far)

    return questions

In [45]:
# Maps a course name to the ID of the Google Doc that holds its FAQ; the ID is
# what read_faq() interpolates into the export URL.
faq_documents = {
    # NOTE(review): commented-out alternative document ID, presumably an earlier
    # or different FAQ document -- confirm whether it can be deleted.
    # 'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E',
    'llm-zoomcamp': '1T3MdwUvqCL3jrh3d3VCXQ8xE0UqRzI3bfgpfBq3ZWG0'
}


In [46]:
# Download and parse every configured FAQ; results stay grouped per course as
# {'course': <name>, 'documents': [<FAQ records>]}.
documents = []

for course, file_id in faq_documents.items():
    print(course)  # progress marker: one line per course being fetched
    course_documents = read_faq(file_id)
    documents += [dict(course=course, documents=course_documents)]

llm-zoomcamp


In [47]:
# Sanity check: one entry per course listed in faq_documents.
len(documents)

1

In [48]:
# Alias (not a copy) of the per-course list. The name `documents` is rebound to
# a flat list of FAQ records in a later cell, so `data` keeps the grouped view.
data = documents

In [49]:
import hashlib

def generate_document_id(doc):
    """Build a short, deterministic identifier for one FAQ record.

    The identifier is the first 8 hex characters of the MD5 digest of
    ``"<course>-<question>-<first 10 characters of text>"``, so two records
    that agree on those three parts receive the same ID.

    Args:
        doc: Mapping with ``'course'``, ``'question'`` and ``'text'`` keys.

    Returns:
        An 8-character hexadecimal string.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

In [50]:
# Flatten the per-course FAQ records into one list, tagging each record with a
# course label and a deterministic document ID.
#
# The record dicts are updated in place on purpose: `data` holds the very same
# dict objects, and the indexing cell sends the records reachable from `data`.
documents = []
for course_entry in data:
    # Derive the label from the course key rather than hard-coding
    # 'course llm-zoomcamp' and data[0]; every course in faq_documents is then
    # handled, and the label for 'llm-zoomcamp' is unchanged.
    course_label = f"course {course_entry['course']}"
    for course_dict in course_entry['documents']:
        course_dict['course'] = course_label
        # previously we used just "id" for document ID
        course_dict['document_id'] = generate_document_id(course_dict)
        documents.append(course_dict)

print(len(documents))



86


In [51]:

from elasticsearch import Elasticsearch

In [52]:

# Client for the Elasticsearch node at localhost:9200 (plain HTTP, no credentials passed).
es_client = Elasticsearch('http://localhost:9200') 

In [53]:
# Index definition: one shard, one replica. The free-text fields use the "text"
# type; `course` and `document_id` use the exact-value "keyword" type.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 1},
    "mappings": {
        "properties": {
            **{name: {"type": "text"} for name in ("text", "section", "question")},
            **{name: {"type": "keyword"} for name in ("course", "document_id")},
        }
    },
}


In [54]:
from datetime import datetime
# Every run writes to its own index, named "<prefix>_<YYYYmmdd_HHMMSS>" from the
# local wall-clock time at which this cell is executed.
index_name_prefix = 'documents'

current_time = f"{datetime.now():%Y%m%d_%H%M%S}"
index_name = '_'.join((index_name_prefix, current_time))
print("index name:", index_name)



index name: documents_20240819_114705


In [55]:
# Create the index explicitly so the mappings in `index_settings` are applied.
# Without this, Elasticsearch creates the index on the first write using
# dynamically inferred mappings and `index_settings` is never used.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

# Index every FAQ record of every course; Elasticsearch assigns the `_id`.
for doc in data:
    for document in doc['documents']:
        es_client.index(index=index_name, body=document)

# Spot check: show the last record that was indexed.
print(document)

{'text': 'Answer', 'section': 'Workshops: X', 'question': 'Question', 'course': 'course llm-zoomcamp', 'document_id': '424a2ae1'}


In [56]:
# Display the last record indexed above (the loop variable outlives the loop).
document

{'text': 'Answer',
 'section': 'Workshops: X',
 'question': 'Question',
 'course': 'course llm-zoomcamp',
 'document_id': '424a2ae1'}

In [58]:
# Full-text match on the "question" field, then print each returned hit.
query = {"query": {"match": {"question": "When is the next cohort?"}}}

response = es_client.search(index=index_name, body=query)

for hit in response['hits']['hits']:
    print(f"Document ID: {hit['_id']}")
    source = hit['_source']  # the stored FAQ record
    print(f"Question: {source['question']}")
    print(f"Answer: {source['text']}\n")

Document ID: -CyJa5EB61VPaVMdBx4V
Question: When is the next cohort?
Answer: Summer 2026.

Document ID: JiyJa5EB61VPaVMdBx_v
Question: What is the cosine similarity?
Answer: Cosine similarity is a measure used to calculate the similarity between two non-zero vectors, often used in text analysis to determine how similar two documents are based on their content. This metric computes the cosine of the angle between two vectors, which are typically word counts or TF-IDF values of the documents. The cosine similarity value ranges from -1 to 1, where 1 indicates that the vectors are identical, 0 indicates that the vectors are orthogonal (no similarity), and -1 represents completely opposite vectors.

Document ID: OyyJa5EB61VPaVMdCB9R
Question: There is an error when opening the table using dbtable = db.open_table("notion_pages___homework"): FileNotFoundError: Table notion_pages___homework does not exist.Please first call db.create_table(notion_pages___homework, data)
Answer: The error indica