# Downloads

In [10]:
#!pip install langchain langchain_community langchain_huggingface faiss-cpu
#!pip install sentence-transformers
#!pip install hf_xet

# Imports

In [26]:
import pandas as pd
import os
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Loading Data

In [12]:
# Directory holding the cleaned CSV exports, relative to this notebook.
DATA_DIR = '../Cleaned Data/'

# os.listdir() returns entries in arbitrary order, so sort the file names to
# keep the load order (and everything derived from it) reproducible.
csv_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.csv'))

# Maps each CSV's base name (file name without extension) to its DataFrame.
dataframes = {}

for csv_file in csv_files:
    file_path = os.path.join(DATA_DIR, csv_file)
    df_name = os.path.splitext(csv_file)[0]
    try:
        dataframes[df_name] = pd.read_csv(file_path)
        print(f"Loaded {csv_file} into dataframe '{df_name}'")
    except Exception as e:
        # Best effort: report the file that failed and keep loading the rest.
        print(f"Error loading {csv_file}: {e}")

Loaded Chapter and chapter section main objectives.csv into dataframe 'Chapter and chapter section main objectives'
Loaded Chapter information.csv into dataframe 'Chapter information'
Loaded Chapter section walkthrough with mission tips and strategy and chapter section images.csv into dataframe 'Chapter section walkthrough with mission tips and strategy and chapter section images'
Loaded Character Information.csv into dataframe 'Character Information'
Loaded Enemy information.csv into dataframe 'Enemy information'
Loaded Safecodes Information.csv into dataframe 'Safecodes Information'
Loaded Trophy Information.csv into dataframe 'Trophy Information'


# Creating Documents

In [13]:
def _row_to_document(source_name, row):
    """Render one DataFrame row as a Document of "column: value" lines.

    The text starts with a header naming the dataframe the row came from,
    followed by one line per column.
    """
    header = f"Information from {source_name}:\n"
    fields = "".join(f"{col}: {value}\n" for col, value in row.items())
    return Document(page_content=header + fields)


# One Document per row of every loaded dataframe.
documents = []
for df_name, df in dataframes.items():
    for _, row in df.iterrows():
        documents.append(_row_to_document(df_name, row))

print(f"Created {len(documents)} documents.")

Created 175 documents.


# Creating Chunks

If answers are missing context, increase `chunk_overlap`.
If the chatbot is retrieving too much irrelevant information, decrease `chunk_size`.

Test one: `chunk_size` = 500, `chunk_overlap` = 50

`chunk_overlap` repeats the end of one chunk at the start of the next, so continuity and context are retained across chunk boundaries.

In [14]:
def create_chunks(extracted_data, chunk_size: int = 500, chunk_overlap: int = 50):
    """Split documents into smaller chunks with RecursiveCharacterTextSplitter.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``split_documents``.
        chunk_size: Chunk size handed to the splitter. Defaults to 500,
            the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 50, the value previously hard-coded here.

    Returns:
        Whatever ``split_documents`` returns for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [15]:
# Split every row-level document into chunks using the splitter defaults.
text_chunks = create_chunks(documents)
chunk_total = len(text_chunks)
print(f"Length of text chunks {chunk_total}")

Length of text chunks 423


# Creating Vector Embeddings

In [19]:
def get_ve_model(model_name: str = "sentence-transformers/all-MiniLM-L12-v2"):
    """Build the HuggingFace embedding model used to vectorize text chunks.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model previously hard-coded here.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [20]:
# Instantiate the embedding model once; it is passed to FAISS when the index is built.
embedding_model = get_ve_model()

# Storing Vector Embeddings in FAISS

In [23]:
# Folder (relative to this notebook) where the FAISS index is saved.
FAISS_DB_PATH = 'FAISS Database/'

# Embed every chunk and build the vector store from the results.
db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)

# Persist the store to disk under FAISS_DB_PATH.
db.save_local(folder_path=FAISS_DB_PATH)