# **IMPORT**

In [1]:
# https://medium.com/@zilliz_learn/vectorizing-json-data-with-milvus-for-similarity-search-1f546173162c

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import GPT4AllEmbeddings, OpenAIEmbeddings
from langchain.llms import CTransformers
from langchain.chains.llm import LLMChain
from langchain_community.graphs import Neo4jGraph
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.chains import RetrievalQA, GraphCypherQAChain
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)

from neo4j import GraphDatabase

import numpy as np
import pandas as pd
import getpass
import re
import gdown

import json
import os
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Run the notebook from the project root so the relative paths used further down
# ('csv/...' inputs and the 'faiss_index' output) resolve correctly.
# Set the LANGCHAIN_REC_DIR environment variable to point at your own checkout;
# when it is unset, the original author's location is used as before.
PROJECT_DIR = os.environ.get(
    'LANGCHAIN_REC_DIR',
    'F:\\UNIVERSITY\\UNIVERSITY_DOCUMENTS\\DS307\\langchain_rec',
)
os.chdir(PROJECT_DIR)

# **FUNCTION**

In [9]:
def load_pickle(path):
    """Read the pickle file at ``path`` and return the object it contains.

    Args:
        path: Location of the pickle file; it is opened in binary read mode.

    Returns:
        The object reconstructed by ``pickle.load``.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# **LOAD DATA**

In [4]:
# Load the game description table and preview its first rows.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index -- consider index_col=0 if it is unwanted.
db_csv = pd.read_csv('csv/game_description_df.csv')
db_csv.head()

Unnamed: 0,steam_appid,detailed_description,about_the_game,short_description
0,723320,this dlc contains bonus voices for all standar...,this dlc contains bonus voices for all standar...,this dlc contains bonus voices for all standar...
1,723380,fight the wraith as you struggle to survive in...,fight the wraith as you struggle to survive in...,wraith is an action-packed horror survival gam...
2,723420,this is a vr game about traditional chinese my...,this is a vr game about traditional chinese my...,《创世-修真录》is a vr game about traditional chinese...
3,723430,执此剑者，必仁义当先，动必有道，语必有理，求必有义，行必有正，是为君子。君子于乱世，当有执剑...,执此剑者，必仁义当先，动必有道，语必有理，求必有义，行必有正，是为君子。君子于乱世，当有执剑...,《隐龙传：影踪》众筹玩家独家武器dlc，君子之剑——斩玉剑。
4,723460,the night partythey met at a midnight rave par...,the night partythey met at a midnight rave par...,the night party they met at a midnight rave pa...


In [6]:
# Table consumed by the indexing cells below: they read its 'steam_appid' and
# 'clean_description' columns to build the documents.
data = pd.read_csv('csv/only_description.csv')

# **EMBEDDING MODEL**

In [None]:
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("intfloat/multilingual-e5-base")

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the rest of the notebook: it is used below both to
# measure the vector dimensionality and as the vector store's embedding function.
# NOTE(review): the E5 model family is documented to expect "query: " / "passage: "
# prefixes on its inputs -- confirm whether the indexed text and queries need them.
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-base",
    model_kwargs={"trust_remote_code": True},
)

# **Parent Document Retriever**

In [11]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import faiss

In [13]:
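# Disabled: unpickling 'retriever.pkl' raised the RuntimeError shown in this cell's
# output (the pickle references CUDA storages and no CUDA device is available here).
# retriever = load_pickle('retriever.pkl')
# TODO: rebuild `vectorstore` instead of unpickling it.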

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [25]:
# One-document smoke test: build `docs` from the first row only.
# Each entry is a Document (not a dict wrapping one) so the list can be handed
# straight to code that expects Document objects, e.g. retriever.add_documents.
# The Steam app id is kept as metadata so a search hit can be traced to its game.
docs = []
list_steam_appids, list_descriptions = data['steam_appid'], data['clean_description']
for appid, description in zip(list_steam_appids[:1], list_descriptions[:1]):
    docs.append(
        Document(
            # str() guarantees page_content is a string even for non-string cells.
            page_content=str(description),
            metadata={"id": appid},
        )
    )

In [None]:
# Example using a corrected store implementation

from langchain.vectorstores import FAISS
# from langchain.vectorstores import InMemoryStore  # Ensure InMemoryStore is imported from the correct module
# from langchain_community.docstore.base import AddableMixin
from langchain.storage import InMemoryByteStore
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss

# Build one Document per game: the cleaned description is the text to index and
# the Steam app id is stored as metadata so hits can be mapped back to a game.
docs = []
list_steam_appids, list_descriptions = data['steam_appid'], data['clean_description']

# Every row is indexed here; slice both series (e.g. `[:100]`) for a quick test run.
for appid, description in zip(list_steam_appids, list_descriptions):
    # str() keeps page_content a string for non-string cells. Presumably these are
    # NaN for missing descriptions (TODO confirm); they would be indexed as 'nan'.
    docs.append(Document(page_content=str(description), metadata={"id": appid}))

# Child chunks are what gets embedded and searched: 64-character pieces that
# overlap by 32 characters.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=64, chunk_overlap=32)

# Embed a throwaway string once to discover the model's vector dimensionality,
# which the FAISS index needs up front.
sample_text = 'Random World Iu Hai Quynh Nhat'
embedding_dim = len(embeddings.embed_query(sample_text))

# Flat L2 index sized to the embedding dimensionality.
index = faiss.IndexFlatL2(embedding_dim)

# Vector store for the child chunks; it starts empty and is filled by add_documents.
vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# In-memory byte store handed to the retriever alongside the vector store.
store = InMemoryByteStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    child_splitter=child_splitter,
)

# Split, embed and index every document.
retriever.add_documents(docs)


In [45]:
# Persist the FAISS vector store under "faiss_index" (relative to the working directory).
# NOTE(review): only `vectorstore` is saved here; `store`, the in-memory byte store
# given to the retriever, is not written by this call -- confirm whether it must be
# persisted as well before relying on a reload.
vectorstore.save_local("faiss_index")
# To reload (recent LangChain versions may additionally require
# allow_dangerous_deserialization=True -- confirm for the installed version):
# vectorstore = FAISS.load_local("faiss_index", embeddings)

In [50]:
# Sanity check: query the vector store directly (not `retriever`). The output below
# shows short chunk texts, each carrying the source game's 'id' and a 'doc_id'.
sub_docs = vectorstore.similarity_search("Horror film")
sub_docs

[Document(metadata={'id': 723420, 'doc_id': 'e84efa3c-ed17-456f-84a5-c4f50b1a6bb5'}, page_content='mythology. game act theurgy practicer train fighting evil force'),
 Document(metadata={'id': 723380, 'doc_id': '83e5d221-99cd-4f55-851e-c821c89e3a0d'}, page_content='watch crawl ground kill overall increase health game'),
 Document(metadata={'id': 723420, 'doc_id': 'e84efa3c-ed17-456f-84a5-c4f50b1a6bb5'}, page_content='story mode play. go ahead pick sword fight evil'),
 Document(metadata={'id': 723320, 'doc_id': 'c6934031-e9d1-47de-b293-ea333bd6ea74'}, page_content='f levinasshinobu sanjophilomena pasquinitheo tveitgustav')]

In [None]:
# # Text Splitter (splitting into chunks of 400 characters)
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# # Example embedding model
# d = len(model.encode('Random World Iu Hai Quynh Nhat'))  # Get the dimensionality of the embeddings

# # Initialize the FAISS index
# nlist = 100  # Number of clusters
# quantizer = faiss.IndexFlatL2(d)  # Use flat index as quantizer
# index = faiss.IndexIVFFlat(quantizer, d, nlist)  # Inverted file index

# # Define the FAISS vector store
# vectorstore = FAISS(
#     embedding_function=model,  # Your embedding function
#     index=index,
#     docstore=InMemoryDocstore(),  # Store the documents in memory
#     index_to_docstore_id={},
# )



# # Assuming `docs` is a list of strings (documents)
# # docs = ["Document 1 content", "Document 2 content", "Document 3 content"]  # Example list of documents

# # Initialize the ParentDocumentRetriever
# retriever = ParentDocumentRetriever(
#     vectorstore=vectorstore,
#     docstore=store,
#     child_splitter=child_splitter,
# )

# # Add documents to the retriever
# retriever.add_documents(docs, ids=None)
