# Recreating Bing Chatbot

## Workflow

In [None]:
"""The user query is used to extract relevant articles using a search engine (e.g. Bing or Google Search), which are then split into chunks. We then compute the embeddings of each chunk, rank them by cosine similarity with respect to the embedding of the query, and put the most relevant chunks into a prompt to generate the final answer, while also keeping track of the sources."""

### Getting the search results

In [3]:
from dotenv import load_dotenv
import os

# Load the API credentials into the process environment.
# NOTE(review): this absolute path only exists on the original author's machine.
# When nothing is loaded from it, fall back to python-dotenv's default discovery
# of a `.env` file so the notebook can run elsewhere.
if not load_dotenv('C:/Users/dell/OneDrive/Documents/LangChain and Vector database/.env'):
    load_dotenv()

google_api_key = os.getenv("GOOGLE_API_KEY_1")
search_engine_id = os.getenv("GOOGLE_CSE_ID")

# Fail here with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that `os.environ[...] = None` produces.
_missing_vars = [
    var_name
    for var_name, var_value in (
        ("GOOGLE_API_KEY_1", google_api_key),
        ("GOOGLE_CSE_ID", search_engine_id),
    )
    if not var_value
]
if _missing_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# The Google integrations used below look the key up under this name.
os.environ["GOOGLE_API_KEY"] = google_api_key

In [5]:
from langchain.tools import Tool
from langchain.utilities import GoogleSearchAPIWrapper

# How many hits to request from Google for each query.
TOP_N_RESULTS = 10

# Google Custom Search client, built with no explicit arguments
# (presumably configured from GOOGLE_API_KEY / GOOGLE_CSE_ID in the environment).
search = GoogleSearchAPIWrapper()

def get_top_n_results(query):
    """Run ``query`` through the Google search wrapper.

    Args:
        query: The search string to send to Google.

    Returns:
        Whatever ``search.results`` returns when asked for ``TOP_N_RESULTS``
        hits for ``query``.
    """
    hits = search.results(query, TOP_N_RESULTS)
    return hits

# Wrap the search helper as a LangChain tool so it can be invoked via `tool.run`.
tool = Tool(
    name='Google Search',
    description="Search google based on the given query",
    func=get_top_n_results,
)

query = "Who is the current world heavyweight champion of WWE?"

results = tool.run(query)

# Print each hit. The search wrapper does not guarantee every key:
# 'snippet' can be absent on a hit, and when nothing is found it returns a
# single placeholder dict without 'title'/'link'. Plain indexing would raise
# KeyError in both cases.
for result in results:
    if 'link' not in result:
        # Placeholder / malformed entry: show its message instead of crashing.
        print(result.get('Result', result))
        print('*' * 45)
        continue
    print(result.get('title', ''))
    print(result['link'])
    print(result.get('snippet', ''))
    print('*' * 45)

World Heavyweight Championship | WWE
https://www.wwe.com/classics/titlehistory/world-heavyweight-championship
The new World Heavyweight Championship was announced by Triple H heading into the 2023 WWE Draft. A tournament was held to crown the new champion culminating ...
*********************************************
World Heavyweight Championship (WWE) - Wikipedia
https://en.wikipedia.org/wiki/World_Heavyweight_Championship_(WWE)
The current champion is Seth "Freakin" Rollins, who defeated AJ Styles in a tournament final at Night of Champions on May 27, 2023, to become the inaugural ...
*********************************************
World Heavyweight Title | WWE
https://www.wwe.com/classics/titlehistory/world-heavyweight-title
The World Heavyweight Championship was brought to Raw by former General Manager Eric Bischoff after WWE Champion Brock Lesnar became exclusive to SmackDown.
*********************************************
WWE Championship - Wikipedia
https://en.wikipedia.org/wiki/WW

### Scraping the content from the links

In [17]:
import newspaper

# Download and parse every search hit, keeping only pages that yield text.
pages_content = []

for result in results:
    # Entries without a link (e.g. a "no results" placeholder) cannot be scraped.
    url = result.get('link')
    if not url:
        continue

    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()
        if article.text:
            pages_content.append({"url": url, "text": article.text})
    except Exception as exc:
        # Best-effort scraping: skip pages that fail, but say why. Catching
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit still stop the cell.
        print(f"Skipping {url}: {exc}")
        continue

In [18]:
# Inspect the scraped pages: a list of {"url": ..., "text": ...} dicts,
# one per search hit whose article text was non-empty.
pages_content

[{'url': 'https://www.wwe.com/classics/titlehistory/world-heavyweight-championship',
  'text': 'The new World Heavyweight Championship was announced by Triple H heading into the 2023 WWE Draft. A tournament was held to crown the new champion culminating with Seth" Freakin" Rollins defeating AJ Styles at WWE Night of Champions 2023.\n\nThe title was designed with unique features that include Three Lions to symbolize the McMahon Family crest, a Crown to pay homage to Bruno Samartino, an Eagle as a callback to the original Winged Eagle design of the WWE Championship, diamonds, filagree and rope trim.'},
 {'url': 'https://en.wikipedia.org/wiki/World_Heavyweight_Championship_(WWE)',
  'text': 'Men\'s professional wrestling championship\n\nWorld Heavyweight Championship The World Heavyweight Championship belt with default side plates Details Promotion WWE Brand Raw Date established April 24, 2023 Current champion(s) Seth "Freakin" Rollins Date won May 27, 2023 Statistics First champion(s) Se

### Processing the search results

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Split each scraped page into chunks of up to 3000 characters with a
# 100-character overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)

# One Document per chunk, tagged with the URL of the page it came from so the
# answer can cite its sources later.
docs = [
    Document(page_content=piece, metadata={'source': page['url']})
    for page in pages_content
    for piece in text_splitter.split_text(page['text'])
]

### Creating the embeddings and checking similarity

In [22]:
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings

# Embedding client shared by the document chunks and the query.
embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

# Embed the text of every chunk (same order as `docs`), then the user query.
chunk_texts = [doc.page_content for doc in docs]
docs_embeddings = embeddings.embed_documents(chunk_texts)
query_embeddings = embeddings.embed_query(query)

In [32]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_indices(list_of_docs_vectors, query_vector, top_k):
    """Rank document vectors by cosine similarity to a query vector.

    Args:
        list_of_docs_vectors: Sequence of document embedding vectors
            (anything ``np.array`` can turn into a 2-D array).
        query_vector: A single embedding vector for the query.
        top_k: Maximum number of indices to return.

    Returns:
        A 1-D integer numpy array of positions into ``list_of_docs_vectors``,
        ordered from most to least similar and truncated to ``top_k`` entries.
        Empty when there are no document vectors.
    """
    list_of_docs_vectors = np.array(list_of_docs_vectors)
    query_vector = np.array(query_vector)

    # Nothing to rank (e.g. every page failed to scrape). An empty array is not
    # 2-D, so `cosine_similarity` would raise; return no indices instead.
    if list_of_docs_vectors.size == 0:
        return np.array([], dtype=np.intp)

    # The query is reshaped to a single row; the result is flattened to one
    # similarity score per document.
    similarities = cosine_similarity(query_vector.reshape(1, -1), list_of_docs_vectors).flatten()

    # argsort is ascending, so reverse it to put the best match first.
    sorted_indices = np.argsort(similarities)[::-1]

    top_k_indices = sorted_indices[:top_k]
    return top_k_indices


# Number of most-relevant chunks to pass on to the answering chain.
top_k = 2
best_indexes = get_top_k_indices(docs_embeddings, query_embeddings, top_k)

# Index `docs` in the order returned by the ranking so the most similar chunk
# comes first. Filtering `enumerate(docs)` by membership would return the
# chunks in their original order and lose the ranking.
best_k_documents = [docs[i] for i in best_indexes]

In [33]:
# Inspect the chunks selected for the prompt: Document objects whose
# metadata['source'] holds the URL they were scraped from.
best_k_documents

[Document(page_content='The World Heavyweight Championship was brought to Raw by former General Manager Eric Bischoff after WWE Champion Brock Lesnar became exclusive to SmackDown.\n\nBischoff awarded the championship to Triple H, but The Game had to defend it against his best friend, Ric Flair. Triple H was successful and has held the championship on five different occasions. Other titleholders of this prestigious championship include Shawn Michaels, The Undertaker and Rey Mysterio.', metadata={'source': 'https://www.wwe.com/classics/titlehistory/world-heavyweight-title'}),
 Document(page_content='Main roster\n\nFor the men\'s division, Raw features a primary and secondary championship, while SmackDown features two jointly held and defended primary titles (promoted as an undisputed championship) and a secondary title. There are also two jointly held and defended tag team championships—one representing each brand—promoted as an undisputed tag team championship open to both Raw and Smac

### Chain with Source

In [35]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain_google_genai import GoogleGenerativeAI

# "Stuff" chain: all selected chunks are placed into one prompt, and the model
# is asked to answer and list the sources it used.
chain = load_qa_with_sources_chain(llm=GoogleGenerativeAI(model='gemini-pro', temperature=0), chain_type='stuff')

response = chain({'input_documents': best_k_documents, "question": query}, return_only_outputs=True)

# Split the answer from its sources at the first "SOURCES:" marker.
# `partition` never raises: if the model omits the marker the sources are
# simply empty, and a repeated marker no longer breaks tuple unpacking
# (both cases made `split(...)` unpacking raise ValueError).
response_text, _, response_sources = response["output_text"].partition("SOURCES:")
response_text = response_text.strip()
response_sources = response_sources.strip()

print(f"Answer: {response_text}")
print(f"Sources: {response_sources}")

Answer: Seth "Freakin" Rollins is the current world heavyweight champion of WWE.
Sources: https://en.wikipedia.org/wiki/List_of_current_champions_in_WWE
