# Try out Embeddings
- FAISS
- ChromaDB
- LangChain
- https://huggingface.co/BAAI/bge-small-en-v1.5


In [None]:
import pandas as pd
import chromadb

In [None]:
# Load the government metadata. With orient="index" each top-level JSON key
# becomes a row label, so `governments.index` holds the keys that later cells
# use to build record ids.
governments = pd.read_json("../government_metadata.json", orient="index")

In [None]:
# Chroma client whose data is persisted under ./data/vectorstore.db.
# NOTE(review): the langchain cells further down persist to
# ../data/vectorstore.db, i.e. a different directory - confirm this is intended.
chroma_client = chromadb.PersistentClient(path="./data/vectorstore.db")

In [None]:
# The client is persistent, so "civ_game" survives between notebook runs.
# create_collection() fails when the collection already exists;
# get_or_create_collection() makes this cell safe to re-run.
collection = chroma_client.get_or_create_collection(name="civ_game")

In [None]:
# Each government contributes two records, distinguished by an id suffix:
# "<name>_summary" for the short text and "<name>_fulltext" for the whole page.
ids_summaries = [name + "_summary" for name in governments.index]
summaries = governments["summary"].to_list()
ids_ft = [name + "_fulltext" for name in governments.index]
fulltexts = governments["full_page_content"].to_list()

In [None]:
## Adding data to a collection
# upsert() instead of add(): the store is persistent, so on a re-run these ids
# already exist. upsert() inserts new ids and refreshes existing ones, which
# keeps the collection in sync with the JSON file and makes the cell idempotent.
collection.upsert(documents=summaries, ids=ids_summaries)
collection.upsert(documents=fulltexts, ids=ids_ft)

In [None]:
# Fetch a single stored record by id; ids follow the "<name>_summary" /
# "<name>_fulltext" pattern used when the records were added.
collection.get("Anarchy_summary")

In [None]:
# Ask for the single closest document (n_results=1) to a free-text question.
# The response is only stored in `results`; nothing is displayed here.
results = collection.query(query_texts=["What is capitalism?"], n_results=1)

In [None]:
# Same kind of query, this time requesting the two closest documents.
collection.query(query_texts=["Which form of government cares most about resources?"], n_results=2)

In [None]:
## Updating data in a collection
# Reference template only: the call is wrapped in a string literal so it is
# never executed. The ids, embeddings, metadatas and documents are placeholders.
"""
collection.update(
    ids=["id1", "id2", "id3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
    documents=["doc1", "doc2", "doc3", ...],
)
"""

In [None]:
## Langchain + Chroma


In [None]:
## Sentence Transformers

In [None]:
## Huggingface

## Try loading documents into langchain
Will use OpenAI embeddings.

In [None]:
from langchain.document_loaders import WikipediaLoader
from langchain.document_loaders import JSONLoader

In [None]:
# Load Wikipedia documents for the query "Capitalism", capped at two
# documents via load_max_docs.
docs = WikipediaLoader(query="Capitalism", load_max_docs=2).load()

In [None]:
# Inspect the documents returned by WikipediaLoader.
docs

In [None]:
import json
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class JSONLoader(BaseLoader):
    """Load government articles from a JSON file as langchain Documents.

    The file must contain one top-level JSON object that maps a government
    name to an object with a ``summary`` entry and, when ``fulltext`` is
    enabled, a ``full_page_content`` entry.

    Note: defining this class replaces any ``JSONLoader`` name that was
    imported earlier in the same namespace.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        content_key: Optional[str] = None,
        fulltext: bool = False,
    ):
        """Store the loader configuration.

        Args:
            file_path: Location of the JSON file; resolved to an absolute path.
            content_key: Kept on the instance for interface compatibility;
                ``load`` does not currently read it.
            fulltext: When True, ``load`` also emits one Document per
                government holding the full page content.
        """
        self.file_path = Path(file_path).resolve()
        self._content_key = content_key
        self.fulltext = fulltext

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file.

        Returns:
            One summary Document per government, each followed by a full-text
            Document when ``self.fulltext`` is set. Every Document carries
            ``gov_type`` (the government name) and ``type`` (``'summary'`` or
            ``'fulltext'``) in its metadata.

        Raises:
            KeyError: If an entry lacks ``summary`` (or ``full_page_content``
                when full text is requested).
        """
        # JSON is UTF-8 by specification and the article text is not pure
        # ASCII, so do not rely on the platform's default encoding.
        with open(self.file_path, encoding="utf-8") as file:
            data = json.load(file)

        docs = []
        # One entry per government: name -> {'summary': ..., 'full_page_content': ...}
        for government_name, government_text in data.items():
            docs.append(
                Document(
                    page_content=government_text['summary'],
                    metadata={'gov_type': government_name, 'type': 'summary'},
                )
            )
            if self.fulltext:
                docs.append(
                    Document(
                        page_content=government_text['full_page_content'],
                        metadata={'gov_type': government_name, 'type': 'fulltext'},
                    )
                )

        return docs

In [None]:
# Build Documents from the local metadata file. `fulltext` is left at its
# default (False), so only the summaries are loaded.
loader = JSONLoader(
    file_path='../government_metadata.json'
    )
data = loader.load()

In [None]:
# Inspect the Documents produced by the custom JSONLoader.
data

In [None]:
# Try vectorstore
from datetime import datetime
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import VectorStoreRetrieverMemory
from langchain.chains import ConversationChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma

In [None]:
# Create persistent client with langchain and openaiembeddings
db = Chroma.from_documents(data, OpenAIEmbeddings(), persist_directory="../data/vectorstore.db", collection_name="civ_game")

## Example load

In [None]:
# Load
db2 = Chroma(persist_directory="../data/vectorstore.db", embedding_function=OpenAIEmbeddings(), collection_name="civ_game")
query =" What is capitalism"
docs = db2.similarity_search(query)
print(docs[0].page_content)