In [1]:
import os
import time 
import wikipedia
from decouple import config, AutoConfig
from langchain_mistralai.chat_models import ChatMistralAI
from langchain.schema import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document
# Build the python-decouple reader used by the credential lookups below.
# The search directory can be overridden with the CHATBOT_CONFIG_DIR environment
# variable so the notebook is not tied to one machine's home directory; the
# original location is kept as the fallback, so existing setups behave the same.
# NOTE: this rebinds the name `config` imported from `decouple` above.
config = AutoConfig(
    search_path=os.environ.get("CHATBOT_CONFIG_DIR", "/home/harry/Chatbot")
)

In [2]:
from mistralai import Mistral
from typing import List
from langchain_core.embeddings import Embeddings

In [3]:
# --- Mistral -----------------------------------------------------------------
MISTRAL_API_KEY = config("MISTRAL_API_KEY")

# --- Hugging Face --------------------------------------------------------------
# The token is kept as a module-level constant and also mirrored into the
# process environment under the same name.
HF_TOKEN = config("HF_TOKEN")
os.environ.update(HF_TOKEN=HF_TOKEN)

# --- Upstash Vector (REST endpoint + token) --------------------------------
UPSTASH_VECTOR_REST_URL = config("UPSTASH_VECTOR_REST_URL")
UPSTASH_VECTOR_REST_TOKEN = config("UPSTASH_VECTOR_REST_TOKEN")

In [4]:
from langchain_core.documents import Document

# Wikipedia page titles to fetch; each one becomes a single Document.
cities = ["Tehran, Tehran", "shiraz, fars"]

documents = []
for city in cities:
    wikipedia_page_result = wikipedia.page(title=city)

    # Read the page text first, then assemble the metadata: the page URL as
    # the source, and the title string that was requested (not the resolved
    # page title).
    page_text = wikipedia_page_result.content
    page_metadata = {
        "source": f"{wikipedia_page_result.url}",
        "title": city,
    }

    doc = Document(page_content=page_text, metadata=page_metadata)
    documents.append(doc)

In [5]:
# Show the metadata dict (source URL and requested title) of the second document.
documents[1].metadata

{'source': 'https://en.wikipedia.org/wiki/Shiraz', 'title': 'shiraz, fars'}

In [6]:
# Number of entries in `documents` (one per requested city).
len(documents)

2

In [7]:
class MistralEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that calls the Mistral embeddings endpoint.

    The API key is taken from the module-level ``MISTRAL_API_KEY``.

    Args:
        model: Name of the embedding model passed to the Mistral client.
        batch_size: Maximum number of texts sent in one request; must be >= 1.
        sleep_seconds: Pause applied after every batch in ``embed_documents``
            (presumably to stay under the API rate limit — confirm against
            your plan's limits). Defaults to the 5 seconds that used to be
            hard-coded; pass 0 to disable the pause.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``sleep_seconds``
            is negative.
    """

    def __init__(
        self,
        model: str = "mistral-embed",
        batch_size: int = 32,
        sleep_seconds: float = 5,
    ):
        # A negative batch size would make range() below empty, so
        # embed_documents would silently return no vectors; zero would fail
        # with an unrelated-looking range() error. Reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
        if sleep_seconds < 0:
            raise ValueError(f"sleep_seconds must be >= 0, got {sleep_seconds!r}")

        self.model = model
        self.client = Mistral(api_key=MISTRAL_API_KEY)
        self.batch_size = batch_size
        self.sleep_seconds = sleep_seconds

    def _embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Send one request for ``texts`` and return one vector per response item."""
        # Newlines are replaced with spaces before the text is sent.
        cleaned_batch = [t.replace("\n", " ") for t in texts]
        response = self.client.embeddings.create(
            model=self.model,
            inputs=cleaned_batch,
        )
        return [e.embedding for e in response.data]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in chunks of ``batch_size``, pausing after each chunk.

        Returns:
            The embeddings collected from every batch, concatenated in batch
            order. An empty ``texts`` list yields an empty list without
            calling the API.
        """
        embeddings: List[List[float]] = []
        for i in range(0, len(texts), self.batch_size):
            embeddings.extend(self._embed_batch(texts[i:i + self.batch_size]))
            # The pause runs after every batch, including the last one, so
            # back-to-back calls to this method stay spaced out as before.
            time.sleep(self.sleep_seconds)
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string (no pause is applied)."""
        return self._embed_batch([text])[0]

In [8]:
# Create the embedding adapter; batch_size=32 is passed explicitly and equals
# the class default.
embeddings = MistralEmbeddings(batch_size=32)

In [9]:
from langchain_community.vectorstores import UpstashVectorStore

# Vector store handle: receives the custom embedding object plus the Upstash
# REST URL and token loaded from config earlier in the notebook.
store = UpstashVectorStore(
    embedding=embeddings,
    index_url=UPSTASH_VECTOR_REST_URL,
    index_token=UPSTASH_VECTOR_REST_TOKEN
)

In [10]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter

In [11]:
# Chat model client authenticated with the Mistral API key. No model name is
# passed, so whatever default ChatMistralAI applies is used.
model = ChatMistralAI(api_key=MISTRAL_API_KEY)

In [12]:
# Splitter configured with chunk_size=100 and no overlap between chunks.
# NOTE(review): assuming the splitter's default length function counts
# characters, 100 is a very small chunk (the two pages yield 935 pieces);
# confirm this granularity is intended for retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)

In [13]:
# Split the loaded Wikipedia documents into chunks; `docs` is what gets added
# to the vector store.
docs = text_splitter.split_documents(documents)

In [14]:
# Number of chunks produced by the splitter.
len(docs)

935

In [15]:
# Add every chunk to the Upstash store and keep the return value of
# add_documents (presumably the IDs of the inserted vectors — confirm).
# NOTE(review): no explicit IDs are passed, so re-running this cell presumably
# inserts the same chunks again rather than overwriting them — verify before
# re-executing against a populated index.
inserted_vectors = store.add_documents(docs)

In [16]:
# Retrieve the two best-matching chunks together with their similarity scores.
# NOTE(review): the query text repeats "the"; it is left untouched so the
# recorded scores below remain valid for this exact query.
result = store.similarity_search_with_score(
    "what is the the the tallest tower in Tehran?",
    k=2,
)

# Print one line per hit: the chunk's metadata followed by its score.
for hit in result:
    doc, score = hit
    print(f"{doc.metadata} - {score}")

{'source': 'https://en.wikipedia.org/wiki/Tehran', 'title': 'Tehran, Tehran'} - 0.93282855
{'source': 'https://en.wikipedia.org/wiki/Tehran', 'title': 'Tehran, Tehran'} - 0.9215398
