In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import os
from langchain.document_loaders import DirectoryLoader
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import openai
import chromadb

In [21]:
# Chunking configuration: 500-character chunks that overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=500)

# Read every .txt file directly under ./new_articles/ as a plain-text document.
loader = DirectoryLoader('./new_articles/', loader_cls=TextLoader, glob="./*.txt")
documents = loader.load()

# Cut the loaded documents into chunks and show the second one as a sanity check.
texts = text_splitter.split_documents(documents)
print(texts[1])

page_content='Iron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.' metadata={'source': 'new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt'}


In [41]:
# print(len(texts))

In [23]:
# Build three parallel lists (one entry per chunk, same order) for the vector store.
metadata_list = [doc.metadata for doc in texts]
text_content_list = [doc.page_content for doc in texts]
# Ids are 1-based and follow chunk order: "doc1", "doc2", ...
id_list = [f"doc{position}" for position in range(1, len(texts) + 1)]

In [42]:
from chromadb import Documents, EmbeddingFunction, Embeddings
class E5EmbeddingFunction(EmbeddingFunction):
    """Embedding function that delegates to a hosted Gradio endpoint.

    Each input text is sent to the ``/generate_and_format_embeddings`` endpoint
    and the ``embedding`` entry of the first item in the response's ``data``
    list is collected, in input order.

    Failure handling is best-effort: on any exception, or when the response
    lacks the expected fields, a message is printed and ``None`` is returned
    instead of raising.
    """

    # Endpoint of the hosted Space. NOTE(review): this looks like a replica-specific
    # URL and may stop resolving over time; confirm before re-running.
    API_URL = "https://tonic-e5.hf.space/--replicas/lg8l1/"
    # First positional argument sent to the endpoint; presumably a task/dataset
    # name understood by the Space (TODO confirm against the Space's API page).
    TASK_NAME = "ArguAna"
    # Named Gradio endpoint invoked for every text.
    API_NAME = "/generate_and_format_embeddings"

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every text in ``input``.

        Args:
            input: The texts to embed.

        Returns:
            The collected embeddings in input order (an empty list for empty
            input), or ``None`` if any request fails or returns an unexpected
            payload.
        """
        embeddings = []
        if not input:
            # Nothing to embed, so do not open a connection at all.
            return embeddings

        try:
            # Imported lazily so a missing gradio_client is reported through the
            # same print-and-return-None path as any other failure.
            from gradio_client import Client

            # Connect once per call instead of once per text: building a client
            # is loop-invariant and comparatively expensive.
            client = Client(self.API_URL)

            for text in input:
                response = client.predict(
                    self.TASK_NAME,
                    text,
                    api_name=self.API_NAME,
                )

                # Extracting the embeddings from the response
                embeddings_data = response.get('data', [])
                if not embeddings_data:
                    print("Warning: 'data' not found in response.")
                    return None

                embedding_list = embeddings_data[0].get('embedding', None)
                if embedding_list is None:
                    print("Warning: Gradio client returned None.")
                    return None

                # extend (not append): assumes the endpoint returns the embedding
                # as a nested list of vectors; TODO confirm response shape.
                embeddings.extend(embedding_list)
        except Exception as e:
            print(f"Error in E5EmbeddingFunction: {e}")
            return None

        return embeddings


# class E5EmbeddingFunction(EmbeddingFunction):
#     # def __init__(self, api_link: str, model_name: str):
#     #     if not api_link:
#     #         raise ValueError("Please provide an API endpoint.")

#     #     if not model_name:
#     #         raise ValueError("Please provide the model name.")
#     #     self._api_link = api_link
#     #     self._model_name = model_name

    
    

In [129]:
# embedding_function= embedding_functions.OpenAIEmbeddingFunction(
#                 api_key="None",
#                 api_base="https://tonic-e5.hf.space/--replicas/ax7dg/generate_and_format_embeddings",
#                 api_type="None",
#                 api_version="123",
#                 model_name="ClimateFEVER"
#             )

In [43]:
# Create the Chroma client that the following cells use to manage collections.
chroma_client = chromadb.Client()

In [44]:
import chromadb.utils.embedding_functions as embedding_functions
# Instantiate the custom E5 embedding function that is handed to the collection.
embedding_function = E5EmbeddingFunction()

In [45]:
# Start from a clean slate: drop the collection if a previous run left it behind.
collection_name = "texts"
existing_collections = [
    known.name for known in chroma_client.list_collections()
]

if collection_name not in existing_collections:
    print("No collection found")
else:
    chroma_client.delete_collection(collection_name)
    print(f"Info: Existing collection '{collection_name}' deleted.")

Info: Existing collection 'texts' deleted.


In [46]:
# Create the collection under the same name that was checked/deleted above
# (collection_name) so the two steps cannot drift apart, then store every chunk.
# The collection calls embedding_function on the documents while adding them.
vector_store = chroma_client.get_or_create_collection(
    name=collection_name,
    embedding_function=embedding_function,
)
vector_store.add(
    ids=id_list,
    documents=text_content_list,
    metadatas=metadata_list,
)

Loaded as API: https://tonic-e5.hf.space/--replicas/lg8l1/ ✔
Loaded as API: https://tonic-e5.hf.space/--replicas/lg8l1/ ✔
Loaded as API: https://tonic-e5.hf.space/--replicas/lg8l1/ ✔
Loaded as API: https://tonic-e5.hf.space/--replicas/lg8l1/ ✔
Loaded as API: https://tonic-e5.hf.space/--replicas/lg8l1/ ✔


In [47]:
# Retrieve the two stored chunks closest to the question and dump the raw result.
results = vector_store.query(n_results=2, query_texts="Who is Nitin Jayakrishnan")
print(results)


Loaded as API: https://tonic-e5.hf.space/--replicas/lg8l1/ ✔
{'ids': [['doc2', 'doc4']], 'distances': [[0.6361709833145142, 0.6401547193527222]], 'metadatas': [[{'source': 'new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt'}, {'source': 'new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt'}]], 'embeddings': None, 'documents': [['Iron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.', 'Pando was co-launched by Jayakrishnan and Abhijeet Manohar, who previously worked together at iDelivery, an India-based freight tech marketplace — and their first startup. The two saw firsthand manufacturers, distributors and retailers were struggling with legacy tech and point solutions to understand, o

In [48]:
# Load environment variables via python-dotenv, then read the OpenAI key
# (None when the variable is not set).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [49]:
# Ask the chat model to answer the question, passing the raw retrieval result
# as context inside a single user message, and print the model's reply text.
query = "Who is Nitin Jayakrishnan"
openai.api_key = OPENAI_API_KEY
completion = (
    openai.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": (
                    "We have provided context information below. \n"
                    "---------------------\n"
                    f"{results}"
                    "\n---------------------\n"
                    f"Given this information, please answer the question: {query}"
                ),
            }
        ],
    )
    .choices[0]
    .message.content
)
print(completion)


Nitin Jayakrishnan is the CEO and founder of Pando, an AI-powered supply chain startup.
