# **GENERATIVE AI PROJECT - Using OpenAI API & LLMs**

## Importing necessary libraries

In [94]:
import os
import streamlit as st
import pickle
import time
import langchain
import faiss
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [None]:
# Install the pinned numpy release. `%pip` (instead of `!pip`) makes sure the
# package lands in the environment of the kernel that is running this notebook.
%pip install numpy==1.23.5


In [104]:
# Empty pip's local cache (shell escape; the number of removed files is printed below).
!pip cache purge


Files removed: 0




In [60]:
# Install the CPU build of FAISS into the running kernel's environment
# (`%pip` targets the kernel, `!pip` may hit a different interpreter).
# NOTE(review): the version is unpinned - consider pinning it for reproducibility.
%pip install faiss-cpu




In [None]:
# Install/upgrade langchain-openai in the running kernel's environment
# (`%pip` targets the kernel, `!pip` may hit a different interpreter).
%pip install -U langchain-openai

In [124]:
# Resolve the OpenAI API key without committing a real secret to the notebook.
# A key already exported as OPENAI_API_KEY wins; the placeholder is used only
# when the variable is unset or empty, so a later write of `secret_key` back to
# the environment can no longer clobber a valid key with the placeholder.
secret_key = os.environ.get("OPENAI_API_KEY") or 'your_api_key'

In [126]:
# Publish the key as OPENAI_API_KEY in this process's environment; the
# OpenAI-backed objects in later cells are created without an explicit key argument.
os.environ["OPENAI_API_KEY"] = secret_key

In [102]:
# Chat model that produces the final answers: gpt-3.5-turbo with a sampling
# temperature of 0.7 and replies capped at 500 tokens.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=500,
)


## (1) Load data
**(Web Scraping Using LangChain - Data Extraction)**

In [106]:
# Business-news landing pages to scrape.
news_urls = [
    "https://www.businessdailyafrica.com",
    "https://nation.africa/kenya/business",
    "https://www.standardmedia.co.ke/business",
]

# Fetch the pages through LangChain's URL loader; `data` holds what the loader returns.
loaders = UnstructuredURLLoader(urls=news_urls)
data = loaders.load()

# Number of loaded documents (shown as the cell output).
len(data)

3

## (2) Split data to create chunks

In [50]:

# Cut the loaded pages into overlapping chunks: text is split on spaces only,
# with chunk_size=1000 and an overlap of 200 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=[" "],
)
docs = text_splitter.split_documents(data)

# Number of chunks produced (shown as the cell output).
len(docs)

13

In [54]:
from langchain.docstore.document import Document  # class used to rebuild each chunk

# Build a cleaned copy of every chunk in `docs`: blank-line breaks ("\n\n") in
# the text become single spaces and surrounding whitespace is stripped, while
# the chunk's metadata is carried over unchanged.
cleaned_docs = [
    Document(
        page_content=chunk.page_content.replace("\n\n", " ").strip(),
        metadata=chunk.metadata,
    )
    for chunk in docs
]

# Report how many chunks were cleaned.
print(f"Cleaned {len(cleaned_docs)} documents.")


Cleaned 13 documents.


In [56]:
# Spot-check one cleaned chunk (index 10): its metadata and text are shown as the cell output.
cleaned_docs[10]

Document(metadata={'source': 'https://www.standardmedia.co.ke/business'}, page_content="20 days ago Banks must not have their way with lending rates Enterprise Premium Top careers with highest risk of job loss revealed By Esther Dianah 2 days ago Trends small business owners need to watch in 2025 By June Yuan 2 days ago Premium Where are alumni as universities struggle? By Mike Kihaki 2 days ago State to publish amended ICT Bill as cybercrime threats rise The government is getting ready to publish the amended draft of the ICT Authority Bill, 2024 for another round of public hearings. Enterprise By Nanjinia Wamuswa 2 days ago Inaugural Africa Summit tipped to attract over Sh1b in investments By Esther Dianah 2 days ago Grants changing the lives of women traders in border town By James Wanzala 9 days ago Premium Shadow of uncertainty stalks poor varsity students By Mike Kihaki 9 days ago Work Life By AFP 19 days ago 'Sick leave detective': How Germany's Marcus Lentz is making billions th

## (3) Create embeddings for these chunks and save them to FAISS index

In [76]:

# Embedding model used to vectorise the chunks (OpenAIEmbeddings with its default settings).
embeddings = OpenAIEmbeddings()

# Embed every cleaned chunk and collect the vectors in a FAISS-backed LangChain vector store.
vectorindex_openai = FAISS.from_documents(
    documents=cleaned_docs,
    embedding=embeddings,
)

In [78]:
# Print the vector store's repr as a quick check that it was created.
print(vectorindex_openai) 

<langchain_community.vectorstores.faiss.FAISS object at 0x00000170FF8430D0>


## **Storing the created vector index locally**
**(4) Retrieve similar embeddings for a given question and call the LLM to retrieve the final answer**

In [80]:
# Confirm the type of vectorindex_openai: the printed class shows which
# vector-store implementation the following cells operate on.
print(type(vectorindex_openai))

<class 'langchain_community.vectorstores.faiss.FAISS'>


In [82]:
# `vectorindex_openai` is the LangChain FAISS vector store.
#
# This cell persists the store to disk, restores it from the saved files only,
# and builds the question-answering chain on top of the restored store.
#
# The raw FAISS index file alone is not enough to rebuild the store: the FAISS
# wrapper also needs the docstore and the index-to-docstore-id mapping. Those
# are therefore written next to the index and read back from disk, instead of
# being borrowed from the in-memory store during "reload".
# (`faiss` and `pickle` come from the notebook's import cell.)

# 1. Saving the FAISS index

faiss_index = vectorindex_openai.index  # raw FAISS index held by the LangChain wrapper
file_path = "vector1_index.index"
faiss.write_index(faiss_index, file_path)
print(f"FAISS index saved to {file_path}")

# Save the docstore and the index-position -> docstore-id mapping alongside the index.
docstore_path = file_path + ".docstore.pkl"
with open(docstore_path, "wb") as docstore_file:
    pickle.dump(
        (vectorindex_openai.docstore, vectorindex_openai.index_to_docstore_id),
        docstore_file,
    )

# 2. Loading the FAISS index

loaded_faiss_index = faiss.read_index(file_path)
print(f"FAISS index loaded from {file_path}")

# SECURITY: pickle.load can execute arbitrary code. Only load a file that this
# notebook wrote itself; never point `docstore_path` at an untrusted file.
with open(docstore_path, "rb") as docstore_file:
    docstore, index_to_docstore_id = pickle.load(docstore_file)

# 3. Recreate the LangChain FAISS vector store from the artifacts loaded above

loaded_vectorstore = FAISS(
    embedding_function=embeddings,
    index=loaded_faiss_index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)
print("Recreated LangChain FAISS vector store with loaded FAISS index")

# 4. Create the chain using the loaded vector store
#    (`llm` is the chat model configured earlier in the notebook.)

chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=loaded_vectorstore.as_retriever(),
)


FAISS index saved to vector1_index.index
FAISS index loaded from vector1_index.index
Recreated LangChain FAISS vector store with loaded FAISS index


In [84]:
# Display the assembled chain object as the cell output.
chain



In [43]:
# Question to put to the retrieval chain.
query = "how is business in Kenya from recent news?"

# Switch on LangChain's global debug flag; the step-by-step trace printed
# below this cell comes from it.
langchain.debug = True

# Run the chain on the question. With return_only_outputs=True the result
# shown below contains just the answer and its sources.
question_payload = {"question": query}
chain.invoke(question_payload, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "how is business in Kenya from recent news?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Brian Ngugi 2 hrs ago Kenyans give views on tax laws, demand State accountability Business By Juliet Omelo 2 hrs ago Looming shutdown in counties over cash crisis Business By Emmanuel Kipchumba 2 hrs ago Coast sugarcane output rises on good rains Coast-based Kwale International Sugar Company Ltd (Kiscol) has seen increased cane output from its 1,500 contracted out-grower farmers. Business By Philip Mwakio and Patrick Beja 1 hr ago Looming shutdown in counties over cash crisis Busi

{'answer': 'Recent news indicates that Kenyan business is facing challenges, including a looming shutdown in counties over a cash crisis, discussions about falling inflation, and potential implications for the economy. However, there are also positive developments such as the rise in Coast sugarcane output due to good rains.\n',
 'sources': 'https://www.standardmedia.co.ke/business'}