# Prepare Open AI client object

In [1]:
from openai import OpenAI

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
client=OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [4]:
client

<openai.OpenAI at 0x7f410b0542e0>

# Prepare documents for information retrieval

In [5]:
import minsearch

In [6]:
import json

In [7]:
with open('documents.json', 'rt') as f_in:
    docs_raw=json.load(f_in)

In [8]:
documents=[]

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

# RAG with Elastic Search

In [44]:
from elasticsearch import Elasticsearch

In [46]:
es_client=Elasticsearch('http://localhost:9200')

In [47]:
#es_client.info()

ObjectApiResponse({'name': 'b359dd4497f3', 'cluster_name': 'docker-cluster', 'cluster_uuid': '_7h4zc2MQCKCk6sS0pWVbw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [48]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

#Create an index in elastic search (equivelant to a table in sql database)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [27]:
for doc in documents:
    
    es_client.index(index=index_name, document=doc)

In [28]:
def elastic_search(query):
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        #a question is 3 times more important than the text or section for the search results
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": { #limit questions about only one part of the documents available
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    search_results = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in search_results['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [38]:
def build_prompt(query, search_results):
    
    prompt_template = """
    QUESTION: {question}

    CONTEXT:
    {context}
    
    ANSWER:
    """.strip()
    
    context_template = """
        section:{section}
        question: {question}
        answer: {text}
        """.strip()

    context_list=[]
    
    for doc in search_results:
        context_list.append(context_template.format(**doc))
    
    context = '/n/n'.join(context_list)
    
    prompt = prompt_template.format(question=query, context=context).strip()
        
    return prompt

# Load open source model from Hugging Face AI

Explanation of Parameters:

- max_length: Set this to a higher value if you want longer responses. For example, max_length=300.
num_beams: Increasing this can lead to more thorough exploration of possible sequences. Typical values are between 5 and 10.
- do_sample: Set this to True to use sampling methods. This can produce more diverse responses.
temperature: Lowering this value makes the model more confident and deterministic, while higher values increase diversity. Typical values range from 0.7 to 1.5.
- top_k and top_p: These parameters control nucleus sampling. top_k limits the sampling pool to the top k tokens, while top_p uses cumulative probability to cut off the sampling pool. Adjust these based on the desired level of randomness.

## google/flan-t5-xl

In [9]:
# pip install accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto")


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



<pad> Wie alt sind Sie?</s>


In [16]:
input_text = "translate English to Portuguese: How old are you, Maria?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
input_ids

tensor([[13959,  1566,    12, 21076,    10,   571,   625,    33,    25,     6,
          6538,    58,     1]], device='cuda:0')

In [17]:
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> Quanto idade têm, Maria?</s>


In [45]:
def llm(prompt):
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
    outputs = model.generate(input_ids, max_length=300)

    return tokenizer.decode(outputs[0])

In [46]:
def elastic_rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return search_results, prompt, answer

In [47]:
query= 'i just found out about this course, can i still join?'

In [48]:
search_results, prompt, answer = elastic_rag(query)

In [50]:
answer

"<pad> Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute./n/n</s>"