## Loading HTML from Wikipedia and using an HTML parser (BeautifulSoup) to extract only the text, discarding all unwanted tags

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the raw HTML of the GPT-4 Wikipedia article.
url = "https://en.wikipedia.org/wiki/GPT-4"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down.
response.raise_for_status()

In [None]:
# Parse the downloaded HTML bytes using the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Extract the text of the entire page (not just the article body).
text = soup.get_text()

In [None]:
# Display the extracted text; as the output shows, it still contains
# navigation and menu boilerplate ("Main menu", "Jump to content", ...).
text

'\n\n\n\nGPT-4 - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\t\tContribute\n\t\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\nLanguages\n\nLanguage links are at the top of the page across from the title.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\nCreate accountLog in\n\n\n\n\n\nPersonal tools\n\n\n\n\n Create account Log in\n\n\n\nPages for logged out editors learn more\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1Background\n\n\n\n\n\n\n\n2Capabilities\n\n\n\nToggle Capabilities subsection\n\n\n\n\n\n2.1Aptitude on standardized tests\n\n\n\n\n\n\n\n2.2Medical applications\n\n\n\n\n\n\n\n\n\

In [None]:
# Find the div with class 'mw-parser-output', which holds the article content.
content_div = soup.find('div', {'class': 'mw-parser-output'})
if content_div is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than an AttributeError inside the loop below.
    raise ValueError("Could not find the 'mw-parser-output' content div in the page")

# Remove every element of these tag types (and everything nested in them)
# from the content div before extracting its text.
unwanted_tags = ['sup', 'span', 'table', 'ul', 'ol']
for tag in unwanted_tags:
    # find_all is the current method name; findAll is a deprecated alias.
    for match in content_div.find_all(tag):
        match.extract()

print(content_div.get_text())

2023 text-generating language model
"ChatGPT-4" redirects here. For other uses, see GPT.




Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by OpenAI, and the fourth in its numbered "GPT-n" series of GPT foundation models. It was released on March 14, 2023, and has been made publicly available in a limited form via the chatbot product ChatGPT Plus (a premium version of ChatGPT), and with access to the GPT-4 based version of OpenAI's API being provided via a waitlist. As a transformer based model, GPT-4 was pretrained to predict the next token (using both public data and "data licensed from third-party providers"), and was then fine-tuned with reinforcement learning from human and AI feedback for human alignment and policy compliance.
Observers reported the GPT-4 based version of ChatGPT to be an improvement on the previous (GPT-3.5 based) ChatGPT, with the caveat that GPT-4 retains some of the same problems. Unlike the predecessors, GPT-4 can 

## Converting the text into chunks (and later embeddings) in order to run the similarity check
### LangChain has text splitters that convert text into chunks



In [None]:
!pip install langchain



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Plain text of the cleaned article content div.
article_text = content_div.get_text()

# Deliberately tiny chunks (100 characters, 20 of overlap), just to make the
# effect of splitting easy to see.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, length_function=len)

# Split the article into a list of Document chunks.
texts = text_splitter.create_documents([article_text])

In [None]:
# Inspect the first chunk.
texts[0]

Document(page_content='2023 text-generating language model\n"ChatGPT-4" redirects here. For other uses, see GPT.', metadata={})

In [None]:
# Inspect the second chunk.
texts[1]

Document(page_content='Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model created by', metadata={})

In [None]:
# Inspect the third chunk; it begins with the tail of the previous chunk
# ("model created by"), which is the chunk overlap at work.
texts[2]

Document(page_content='model created by OpenAI, and the fourth in its numbered "GPT-n" series of GPT foundation models. It', metadata={})

In [None]:
# Inspect the fourth chunk; again it starts with text repeated from the end of the previous one.
texts[3]

Document(page_content='models. It was released on March 14, 2023, and has been made publicly available in a limited form', metadata={})

### Convert text chunks to embeddings
The text chunks are converted into numeric vectors (embeddings) so that algorithms can work with them. Ideally the embedding captures the meaning of the text, so that texts with similar meanings end up close to each other in the vector space and dissimilar texts end up far apart. This is achieved by embedding models such as Word2Vec.

## Using OpenAI's embeddings, which have 1536 dimensions


In [None]:
!pip install openai



In [None]:
import openai


# NOTE(review): this API call runs before the cell further down that assigns
# openai.api_key; on a fresh kernel it presumably only works if OPENAI_API_KEY
# is already set in the environment -- confirm.
print(texts[0])

# Embed the first chunk, then pull the vector out of the response payload.
embedding_response = openai.Embedding.create(
    model="text-embedding-ada-002",
    input=texts[0].page_content,
)
embedding = embedding_response["data"][0]["embedding"]

# Number of dimensions in the returned vector.
len(embedding)

page_content='2023 text-generating language model\n"ChatGPT-4" redirects here. For other uses, see GPT.' metadata={}


1536

In [None]:
import os
import openai

In [None]:
from getpass import getpass

# Never hardcode the API key in the notebook: it ends up in version control and
# in shared copies. The key that was previously committed here should be
# revoked and replaced.
# Use OPENAI_API_KEY from the environment when present; otherwise prompt for it
# without echoing the value.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

## Extracting and embedding Prime Minister information from Wikipedia using OpenAI embeddings, and finally checking the cosine similarities between an input prompt and the data

In [None]:
import numpy as np
from numpy.linalg import norm
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai

####################################################################
# load documents
####################################################################
# URL of the Wikipedia page to scrape
url = 'https://en.wikipedia.org/wiki/Prime_Minister_of_the_United_Kingdom'

# Send a GET request to the URL; the timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status stops on 4xx/5xx responses instead
# of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the text on the page
text = soup.get_text()

####################################################################
# split text
####################################################################
# Really small chunks (100 characters, 20 of overlap), just to show the idea.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, length_function=len)

# Split the page text into a list of Document chunks.
texts = text_splitter.create_documents([text])

####################################################################
# calculate embeddings
####################################################################
# Collect the raw string of every chunk. A comprehension keeps the loop
# variable local, so the page text stored in `text` is no longer overwritten
# with the last Document (the previous `for text in texts` loop did that).
text_chunks = [doc.page_content for doc in texts]

# One row per chunk.
df = pd.DataFrame({'text_chunks': text_chunks})

####################################################################
# get embeddings from text-embedding-ada model
####################################################################
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    return response['data'][0]['embedding']

# Embed every chunk (one API request per row) using the default model.
df['ada_embedding'] = df.text_chunks.apply(get_embedding)

####################################################################
# calculate the embeddings for the user's question
####################################################################
users_question = "What is GPT-4?"

question_embedding = get_embedding(text=users_question, model="text-embedding-ada-002")

# Cosine similarity between each chunk embedding and the question embedding:
# dot product divided by the product of the two vector norms.
cos_sim = [
    np.dot(chunk_embedding, question_embedding) / (norm(chunk_embedding) * norm(question_embedding))
    for chunk_embedding in df.ada_embedding
]

# Store the scores and display the chunks ordered from most to least similar.
df["cos_sim"] = cos_sim
df.sort_values(by=["cos_sim"], ascending=False)

Unnamed: 0,text_chunks,ada_embedding,cos_sim
877,Text is available under the Creative Commons A...,"[-0.010654776357114315, 0.003480737330392003, ...",0.783416
9,4Modern premiership\n\n\n\nToggle Modern premi...,"[-0.006249638739973307, 0.00884940754622221, -...",0.773545
508,Parliament of the United Kingdom. p. 4.,"[0.0025355294346809387, -0.015191224403679371,...",0.759674
10,4.2Prime Minister's Office\n\n\n\n\n\n\n\n4.3P...,"[0.014527319930493832, 0.0015217209002003074, ...",0.756310
842,Press Briefing Room\nPartygate,"[-0.04061790555715561, -0.0009513611439615488,...",0.753287
...,...,...,...
333,Prime Minister's Resignation Honours. No incum...,"[-0.007866651751101017, -0.002907899674028158,...",0.662242
543,Cameron's Dean home destroyed in suspected ars...,"[-0.012982831336557865, -0.0035353857092559338...",0.658959
326,other reasons such as ill-health.[39] If the p...,"[0.0023618992418050766, -0.029972640797495842,...",0.655942
468,hold office unless and until they resign. If t...,"[-0.004581723362207413, -0.021399030461907387,...",0.652318


# Testing out the default openai llm that is the Davinci model! (Quite interesting)

In [None]:
from langchain.llms import OpenAI

# Create the LangChain OpenAI LLM wrapper with a sampling temperature of 0.7;
# every other setting is left at its default.
llm = OpenAI(temperature=0.7)

In [None]:
# Inspect the LLM's configuration, leaving out the API key so the secret is
# never rendered into the saved notebook output.
{name: value for name, value in llm.__dict__.items() if name != 'openai_api_key'}

{'cache': None,
 'verbose': False,
 'callbacks': None,
 'callback_manager': None,
 'tags': None,
 'metadata': None,
 'client': openai.api_resources.completion.Completion,
 'model_name': 'text-davinci-003',
 'temperature': 0.7,
 'max_tokens': 256,
 'top_p': 1,
 'frequency_penalty': 0,
 'presence_penalty': 0,
 'n': 1,
 'best_of': 1,
 'model_kwargs': {},
 'openai_api_key': '<REDACTED>',
 'openai_api_base': '',
 'openai_organization': '',
 'openai_proxy': '',
 'batch_size': 20,
 'request_timeout': None,
 'logit_bias': {},
 'max_retries': 6,
 'streaming': False,
 'allowed_special': set(),
 'disallowed_special': 'all',
 'tiktoken_model_name': None}

In [None]:
# Send a simple factual question straight to the LLM, with no extra context.
print(llm('What is the capital of India?'))



The capital of India is New Delhi.


In [None]:
# Ask a date question with no extra context.
# NOTE: the recorded answer below ("Sunday") is wrong -- 10 June 2010 was a
# Thursday. A fluent answer from the model is not necessarily a correct one.
print(llm('What day is 10 of June 2010?'))



The tenth of June 2010 was a Sunday.


## Once the model is loaded, now try to calculate the cosine similarities with respect to a relevant question

In [None]:
####################################################################
# calculate similarities to the user's question
####################################################################
# calculate the embeddings for the user's question
users_question = "Who is the current Prime Minister of the UK?"
question_embedding = get_embedding(text=users_question, model="text-embedding-ada-002")

In [None]:
# Cosine similarity between each chunk embedding and the question embedding:
# dot product divided by the product of the two vector norms.
cos_sim = [
    np.dot(chunk_embedding, question_embedding) / (norm(chunk_embedding) * norm(question_embedding))
    for chunk_embedding in df.ada_embedding
]

# Store the scores and display the chunks ordered from most to least similar.
df["cos_sim"] = cos_sim
df.sort_values(by=["cos_sim"], ascending=False)

Unnamed: 0,text_chunks,ada_embedding,cos_sim
425,Deputy Prime Minister of the United Kingdom,"[-0.009941416792571545, -0.025741055607795715,...",0.902032
36,Head of government in the United Kingdom,"[-0.0029565871227532625, -0.004480236675590277...",0.889619
413,List of prime ministers of the United Kingdom,"[-0.009766453877091408, -0.010059129446744919,...",0.886662
0,Prime Minister of the United Kingdom - Wikipedia,"[0.004313100595027208, -0.01079694926738739, -...",0.880024
479,"^ ""Prime Minister"". Gov.UK. Archived from the ...","[-0.019809918478131294, -0.012691930867731571,...",0.871407
...,...,...,...
486,"exists in all the books, the goodness of our c...","[0.02261430025100708, 0.01637815497815609, 0.0...",0.679033
404,incurred in fulfilling public duties in that r...,"[-0.013899651356041431, -0.013168789446353912,...",0.674997
207,"custom, convention, often of slow growth in th...","[0.012066050432622433, -0.0016789701767265797,...",0.673786
30,Tools\n\n\n\n\n\nTools\nmove to sidebar\nhide\...,"[-0.010227522812783718, -0.003667627228423953,...",0.670850


## Generating a prompt template through langchain that specifies the context, user question and asks for an answer! (Getting really interesting now!)


In [None]:
from langchain import PromptTemplate
####################################################################
# build a suitable prompt and send it
####################################################################
# define the LLM you want to use
llm = OpenAI(temperature=1)

# Define the context for the prompt by joining the most relevant text chunks.
# df itself is still in page order (sort_values returns a new frame that was
# only displayed, never assigned), so df[0:50] would pick the first 50 chunks
# of the page rather than the 50 most similar ones. Rank explicitly here.
top_chunks = df.sort_values(by="cos_sim", ascending=False).head(50)
context = " ".join(top_chunks.text_chunks)

# define the prompt template
template = """
Can you answer the question based on the specified context?"

Context sections:
{context}

Question:
{users_question}

Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["context", "users_question"])

# fill the prompt template and send it to the LLM
prompt_text = prompt.format(context = context, users_question = users_question)
llm(prompt_text)

'The current Prime Minister of the UK is Rishi Sunak.'

## Finally using a vector store database so the entire process is more robust and faster
Storing the embeddings/vectors and computing the cosine similarity against all of them for every query is a slow process. Hence a vector database is used. Once the vectors are stored, each query has to be searched efficiently, which is achieved by indexing. An index provides a better and faster way to find similar entries than calculating every cosine similarity.

In [None]:
!pip install chromadb



In [None]:
!pip install tiktoken



In [None]:
import requests
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

# URL of the Wikipedia page to scrape
url = 'https://en.wikipedia.org/wiki/Prime_Minister_of_the_United_Kingdom'

# Send a GET request to the URL; the timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status stops on 4xx/5xx responses instead
# of saving an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the text on the page
text = soup.get_text()
# Collapse every run of whitespace (including newlines) into a single space.
# Simply deleting the newlines glued together words that were separated only
# by a line break (e.g. "Jump to content" + "Main menu").
text = " ".join(text.split())

# Open a new file called 'output.txt' in write mode and store the file object in a variable
with open('output.txt', 'w', encoding='utf-8') as file:
    # Write the string to the file
    file.write(text)

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the document text from output.txt.
with open('./output.txt', encoding='utf-8') as f:
    text = f.read()

# Text splitter with 500-character chunks and 100 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, length_function=len)

# Split the document into a list of Document chunks.
texts = text_splitter.create_documents([text])

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# define the embeddings model (constructed with its default settings)
embeddings = OpenAIEmbeddings()

# use the text chunks and the embeddings model to fill our vector store
# NOTE(review): presumably this embeds every chunk through the OpenAI API each
# time the cell runs -- confirm before re-running on large documents.
db = Chroma.from_documents(texts, embeddings)

from langchain.llms import OpenAI
from langchain import PromptTemplate

# Create the LLM here so this section does not depend on an `llm` object left
# over from an earlier cell. Temperature 0 suits the "answer only from the
# given context" instruction in the prompt below.
llm = OpenAI(temperature=0)

users_question = "Who is the current Prime Minister of the UK?"

# use our vector store to find similar text chunks
# The number of results is passed as `k`; `n_results` is not a parameter of
# LangChain's similarity_search, so the requested count was not honoured.
results = db.similarity_search(
    query=users_question,
    k=5
)

# Join the text of the retrieved chunks. Passing `results` directly would put
# the repr of a list of Document objects into the prompt.
context = "\n\n".join(doc.page_content for doc in results)

# define the prompt template
template = """
You are a chat bot who loves to help people! Given the following context sections, answer the
question using only the given context. If you are unsure and the answer is not
explicitly writting in the documentation, say "Sorry, I don't know how to help with that."

Context sections:
{context}

Question:
{users_question}

Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["context", "users_question"])

# fill the prompt template
prompt_text = prompt.format(context = context, users_question = users_question)

# ask the defined LLM
llm(prompt_text)

'Rishi Sunak has been the prime minister since 25 October 2022.'