# **KAUSTUBH GUPTA**
(kaustubh.r.gupta@gmail.com)

In [183]:
!pip install -qU langchain langchain_community langchain_openai langchain_chroma

In [184]:
import os
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers.string import StrOutputParser

## **Document Processing**

PDF Miner: Processes texts, tables and images.

In [185]:
!pip install -qU langchain_community pdfminer.six

In [186]:
from langchain_community.document_loaders import PDFMinerLoader

file_path = "/content/OverviewofRenewableEnergyPowerGenerationandConversion2015-2023.pdf"
loader = PDFMinerLoader(file_path)

In [187]:
docs = loader.load()
print(docs[0])



page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/372441898

Overview of Renewable Energy Power Generation and Conversion (2015-2023)

Article · July 2023

CITATIONS
24

7 authors, including:

READS
401

Val Hyginus U. Eze

Kampala International University (KIU)

141 PUBLICATIONS   1,114 CITATIONS   

Enerst Edozie

Kampala International University (KIU)

36 PUBLICATIONS   240 CITATIONS   

SEE PROFILE

SEE PROFILE

Kalyankolo Umaru

MUNI University

30 PUBLICATIONS   120 CITATIONS   

SEE PROFILE

Wisdom Onyema Okafor

Continental Transfer Technique

35 PUBLICATIONS   194 CITATIONS   

SEE PROFILE

All content following this page was uploaded by Kiu Publication Extension on 18 July 2023.

The user has requested enhancement of the downloaded file.
EEJOURNALS                                                                                                                                          OPEN ACCESS 
EURASI

Cleaning data

In [188]:
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure required NLTK resources are downloaded (each one exactly once).
# 'punkt_tab' is included because newer NLTK releases look up the Punkt
# tokenizer data under that package name rather than 'punkt'.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Splitting data inside the file

In [189]:
text_splitter = CharacterTextSplitter(
    chunk_size = 2100,
    chunk_overlap = 200,
    length_function = len
)

In [190]:
texts = text_splitter.create_documents([doc.page_content for doc in docs])

In [192]:
len(texts)

16

In [193]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [194]:
def remove_punctuation(word):
    """Return `word` with every non-word, non-whitespace character stripped."""
    cleaned = re.sub(r'[^\w\s]', '', word)
    return cleaned if cleaned else ''

Normalization Pipeline

In [None]:
def clean_text(text: str, lemmatize: bool = False) -> str:
    """Clean and normalize a chunk of extracted PDF text.

    The text goes through two stages:

    1. Boilerplate removal: copyright lines, "SEE PROFILE" fragments, form
       feeds, journal/licence banners, page numbers and bracketed numeric
       citations are deleted (case-insensitively), then runs of whitespace
       are collapsed to single spaces.
    2. Token normalization: the text is split into sentences and words with
       NLTK, lowercased, stripped of punctuation, and filtered against the
       English stopword list.

    Args:
        text: Raw text to clean.
        lemmatize: When True, each surviving token is additionally passed
            through NLTK's WordNetLemmatizer. Defaults to False.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    unwanted_patterns = [
        r"© .*?, \d{4}",  # Matches copyright statements like "© Eze et al., 2023"
        r"SEE PROFILE",  # Matches unwanted text fragments
        r"\x0c",  # Removes special characters like form feed
        r"EEJOURNALS",  # Example of journal metadata that may be unnecessary
        r"This is an open access article .*? license",  # Matches open access license statements
        r"Page \| \d+",     # Matches Page Number
        r"\[\d+\]",     # Matches citations - [10]
        r"\[\d+(,\s*\d+)*\]"    # Matches citations - [10, 12]
    ]

    for pattern in unwanted_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    stop_words = set(stopwords.words('english'))
    # Only build the lemmatizer when it is actually going to be used.
    lemmatizer = WordNetLemmatizer() if lemmatize else None

    cleaned_sentences = []
    for sentence in sent_tokenize(text):
        # Lowercase each token and strip its punctuation in a single pass.
        stripped = (remove_punctuation(token.lower()) for token in word_tokenize(sentence))
        # Drop tokens that were pure punctuation, then drop stopwords.
        words = [token for token in stripped if token and token not in stop_words]
        if lemmatizer is not None:
            words = [lemmatizer.lemmatize(token) for token in words]
        # Skip sentences with no surviving tokens so the joined output
        # does not contain doubled spaces.
        if words:
            cleaned_sentences.append(" ".join(words))

    return " ".join(cleaned_sentences)

In [196]:
# Replace each chunk's content with its cleaned form, in place.
for chunk in texts:
  chunk.page_content = clean_text(chunk.page_content)

In [197]:
texts

[Document(metadata={}, page_content='see discussions stats author profiles publication https wwwresearchgatenetpublication372441898 overview renewable energy power generation conversion 20152023 article july 2023 citations 24 7 authors including reads 401 val hyginus u eze kampala international university kiu 141 publications 1114 citations enerst edozie kampala international university kiu 36 publications 240 citations kalyankolo umaru muni university 30 publications 120 citations wisdom onyema okafor continental transfer technique 35 publications 194 citations content following page uploaded kiu publication extension 18 july 2023 user requested enhancement downloaded file open access eurasian experiment journal engineering eeje issn 2992409x eeje publications volume 4 issue 1 2023 overview renewable energy power generation conversion 20152023 val hyginus udoka eze1 enerst edozie2 kalyankolo umaru3 okafor wisdom4 ugwu chinyere n5 ogenyi fabian chukwudi6 12department electrical telecom

## **Embedding Formation**

In [198]:
from google.colab import userdata
os.environ['OPENAI_API_KEY'] = userdata.get('OpenAI_Key')

Embedding model - `OpenAI`

In [199]:
openai_embedding = OpenAIEmbeddings(
    model = 'text-embedding-3-small'
)

Embedding model - `HuggingFace` <br>
Here, HuggingFace's embedding model is used because it resulted in improved accuracy.

In [200]:
!pip install -qU langchain_huggingface

In [201]:
!pip install -qU sentence-transformers

In [202]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [203]:
from langchain_huggingface import HuggingFaceEmbeddings
# Name the embedding model explicitly (the same one loaded with
# SentenceTransformer in the previous cell) instead of relying on the
# library's implicit default, so the vector store stays reproducible.
hf_embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

Database Formation (Vector DB)

In [231]:
# Initialize Vector Storage
vector_storage = Chroma(
    collection_name = 'Renewable_Energy_Overview_Research_Gate', # If openai_embedding, change to "Renewable_Energy_Overview_Research_Gate_v2"
    embedding_function = hf_embedding
)

In [232]:
# Load the data in DB
storage_id = vector_storage.add_documents(texts)

In [233]:
results = vector_storage.similarity_search(
    'What is the temperature of CSP in between?',
    k = 2
)

In [234]:
for result in results:
  print(f'ID: {result.id} \n CONTENT: {result.page_content} \n---------------------------------------')

ID: 9fee274e-465a-4f9a-8344-88937fddad2a 
 CONTENT: figure 4 renewable source energy generated power 20202023 5 figure 4 showed energy generated difference renewable energy source clearly observed solar leading trend leading renewable energy 2023 solar wind nuclear gas whereas hydro coal diminishing respect year solar photovoltaic brightest prospect among renewable source energy rapid increase make great history energy sector 2050 second wind still performance level solar year 2020 may still among trending one decreased 2021 picked 2022 2023 5 overview solar energy major component solar photovoltaic pv system pv module made solar cell solar cell convert energy photon sunlight electricity mean pv phenomenon found certain type semiconductor material selenium se germanium ge silicon si isolated operation photovoltaic cell produce negligible amount power produce substantial electrical output power solar cell connected series parallel form pv module pv cell connected series increase voltage

## **Setting up Retrieval**

Set up a function that joins the retrieved `results` documents into a single context string

In [235]:
def format_docs(docs):
  """Concatenate the page_content of each document into one space-separated string."""
  contents = [doc.page_content for doc in docs]
  return ' '.join(contents)
format_docs(results)

'figure 4 renewable source energy generated power 20202023 5 figure 4 showed energy generated difference renewable energy source clearly observed solar leading trend leading renewable energy 2023 solar wind nuclear gas whereas hydro coal diminishing respect year solar photovoltaic brightest prospect among renewable source energy rapid increase make great history energy sector 2050 second wind still performance level solar year 2020 may still among trending one decreased 2021 picked 2022 2023 5 overview solar energy major component solar photovoltaic pv system pv module made solar cell solar cell convert energy photon sunlight electricity mean pv phenomenon found certain type semiconductor material selenium se germanium ge silicon si isolated operation photovoltaic cell produce negligible amount power produce substantial electrical output power solar cell connected series parallel form pv module pv cell connected series increase voltage output connected parallel increase current output 

Set up retriever

In [236]:
retriever = vector_storage.as_retriever()

LLM instance - `OpenAI`

In [237]:
llm_openai = ChatOpenAI(model = 'gpt-4o-mini')

LLM instance - `HuggingFace`

In [214]:
# from langchain_community.llms.huggingface_hub import HuggingFaceHub

# os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HF_TOKEN')

# llmHF = HuggingFaceHub(
#     repo_id="memevis/BGG27",
#     task="text-generation"
# )

Prompt instance: Defining the answering criteria

In [238]:
template = """
Use the context provided to answer the question at the end. If you don't know the answer, take a guess if you're confident else say I don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer concise.

context: {context}

question: {question}

answer:
"""

In [239]:
custom_template = PromptTemplate.from_template(template)

Here, `OpenAI` is chosen for **LLM**.

In [240]:
# Pipe the retrieved documents through format_docs so the prompt receives
# their plain text rather than the repr of a list of Document objects.
rag_chain = (
    {'context': retriever | format_docs, 'question': RunnablePassthrough()}
    | custom_template
    | llm_openai
    | StrOutputParser()
)

## **Test**

In [241]:
rag_chain.invoke("What is the temperature of CSP in between?")

'The temperature of Concentrating Solar Power (CSP) typically ranges between 400 to 1000 degrees Celsius.'

In [219]:
rag_chain.invoke("who is prime minister Modi?")

"I don't know."

In [220]:
rag_chain.invoke("Kalyankolo Umaru")

'Kalyankolo Umaru is affiliated with Muni University and has published 30 publications with 120 citations. He is one of the authors of a paper on renewable energy power generation and conversion.'

In [221]:
rag_chain.invoke("Who are the associated authors with Kampala International University?")

'The associated authors with Kampala International University are Val Hyginus Udoka Eze, Enerst Edozie, and Kalyankolo Umaru.'

In [222]:
rag_chain.invoke("is the document by researchgate?")

'Yes, the document mentions ResearchGate in the context of publication.'

In [223]:
rag_chain.invoke("Why is Pakistan mentioned? Is India mentioned too?")

'Pakistan is mentioned in the context of solar energy development and the barriers to its implementation, along with policy recommendations. India is also mentioned in relation to the importance of solar energy technologies for rural area development.'

In [224]:
rag_chain.invoke("authors name?")

'The authors mentioned are Val Hyginus U. Eze, Enerst Edozie, Kalyankolo Umaru, Wisdom Onyema Okafor, and others associated with various publications.'