# Creating Knowledge base for the Aurora Skies Airways ChatBot

In [None]:
from google import genai
import os
from dotenv import load_dotenv
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
google_client = genai.Client(api_key=google_api_key)
MODEL_ID = "gemini-embedding-001"

### Setting up Opensearch for Vector DataBase

In [221]:
from opensearchpy import OpenSearch
PORT = 9200
cluster_url = f'https://localhost:{PORT}' 
client = OpenSearch(
        hosts=[cluster_url],
        http_auth=("admin", "admin"),
        use_ssl=True,
        verify_certs=False
    )



### Defining index to store question, answer and question's embedding 


In [None]:
index_name = "questions_embeddings"
index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100
        }
    },
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "embedding": {
                "type": "knn_vector",
                "dimension": 384,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 128,
                        "m": 24
                    }
                }
            }
        }
    }
}

In [224]:
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)



In [225]:
if not client.indices.exists(index = index_name):
    client.indices.create(index = index_name, body=index_body)
    print(f"Created index: {index_name}")

Created index: questions_embeddings




## Creating Embedding of Questions

In [None]:
import pandas as pd
from google.genai.types import EmbedContentConfig 
df = pd.read_excel('airline_faq.xlsx')
df.head()

embeddings = []
for question in df['Question']:
    result = google_client.models.embed_content(
        model="gemini-embedding-001",
        contents=question,
        config=EmbedContentConfig(output_dimensionality=384)
    )
    embeddings.append(result.embeddings[0].values)

In [227]:
df['embedding'] = embeddings

In [228]:
df.head()

Unnamed: 0,Question,Answer,embedding
0,Can I get a refund if I cancel my Aurora Skies...,"Yes, Aurora Skies Airways allows refunds withi...","[-0.009947613, -0.002256354, 0.014590447, -0.0..."
1,What happens if Aurora Skies Airways changes m...,If Aurora Skies Airways changes your flight sc...,"[-0.002632272, 0.012297158, -1.3379228e-05, -0..."
2,Are change or cancellation fees applicable to ...,Change or cancellation fees may apply based on...,"[-0.017145563, 0.007999106, -0.0033050058, -0...."
3,How can I modify my Aurora Skies Airways booking?,You can access your booking online to modify y...,"[-0.0042747227, -0.011651363, 0.0166115, -0.04..."
4,What are my options if my Aurora Skies Airways...,"In such cases, Aurora Skies Airways offers the...","[0.004858732, -0.00054524926, 0.0020609286, -0..."


## Bulk Inserting the data into index

In [229]:
bulk_data = []
for _, row in df.iterrows():
    data = {
        "question": row['Question'],
        "answer": row['Answer'],
        "embedding": row['embedding']
    }
    bulk_data.append(data)

In [230]:
bulk_data

[{'question': 'Can I get a refund if I cancel my Aurora Skies Airways flight within 24 hours of booking?',
  'answer': 'Yes, Aurora Skies Airways allows refunds within 24 hours of purchase for all fare types, including published and net fares, as well as tickets with codeshare and interline flights. This policy does not apply to group fares or fares purchased for same-day travel.',
  'embedding': [-0.009947613,
   -0.002256354,
   0.014590447,
   -0.06011162,
   0.00956482,
   0.0058790767,
   0.012156177,
   0.0060026795,
   -0.03343107,
   -0.017105155,
   0.0028463397,
   0.010573974,
   0.0022571618,
   -0.008406812,
   0.12732443,
   0.013471248,
   0.0035227707,
   0.0017970164,
   -0.035715748,
   0.0008409959,
   0.011781135,
   0.011681708,
   0.0030710695,
   0.006254763,
   -0.008492647,
   0.014696902,
   0.015441564,
   0.008218639,
   0.023114523,
   0.026143998,
   -0.011143011,
   -0.019310327,
   -0.02406912,
   -0.004672045,
   0.020498086,
   0.034125693,
   0.001277

In [231]:
from opensearchpy import helpers
helpers.bulk(client,bulk_data, index=index_name)



(10, [])