### Step 0: Installation and imports

In [10]:
!pip install datasets faiss-cpu langchain langchain_community sentence_transformers bitsandbytes accelerate langchainhub peft

In [4]:
import sys
from googleapiclient.discovery import build
from peft import AutoPeftModelForCausalLM
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma, FAISS
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

### Step 1: Enter your Google search API key and Search Engine ID

In [5]:
def get_google_search_results(search_query):

    def google_search(search_term, api_key, cse_id, **kwargs):
        service = build("customsearch", "v1", developerKey=api_key)
        res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
        return res['items']

    GOOGLE_API_KEY = ''
    GOOGLE_SE_ID = ''
    results = google_search(search_query, GOOGLE_API_KEY, GOOGLE_SE_ID, num=3)
    return results

### Step 2: Parsing webpage to get all textual content

Finding the right content for a given query is challenging. Following is a very simple implementation and has the most potential for improvement.

In [8]:
def get_html_content(urls):

    def clean_string(input_string):
        import re
        cleaned_string = re.sub(r'\s+', ' ', input_string).strip()
        unicode_pattern = re.compile('[^\x00-\x7F]+')
        cleaned_string = unicode_pattern.sub('', cleaned_string)
        return cleaned_string

    content = {}
    for url in urls:
      response = requests.get(url)
      html_content = response.text
      soup = BeautifulSoup(html_content, 'html.parser')

      for script in soup(["script", "style"]):
          script.extract()

      text = soup.get_text()

      content[url] = clean_string(text)

    return content

### Step 3: Storing Embeddings

Here we use FAISS to store embeddings

In [9]:
def create_faiss_db(html_content):

  with open('tmp_txt_file.txt', 'w') as f:
    f.write(html_content)
  raw_documents = TextLoader('tmp_txt_file.txt').load()
  text_splitter = RecursiveCharacterTextSplitter()
  documents = text_splitter.split_documents(raw_documents)
  db = FAISS.from_documents(documents, HuggingFaceEmbeddings())

  return db

### Step 4: Loading LLM

Loading Mistral LLM, you can instead use your own huggingface LLM or use OpenAI completions API

In [None]:
def llm():
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )

  model_id = "mistralai/Mistral-7B-Instruct-v0.2"
  model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  return model, tokenizer

### Step 6: Querying LLM

We use the simplest possible prompt for querying the LLM in a zero shot way

In [None]:
def llm_output(model, tokenizer, search_query, retrieved_docs):
  PROMPT= """ [INST] You are an assistant for search tasks.
  Use the following pieces of retrieved context to answer the search query.
  If you don't know the answer, or the answer is not in the context, just say that you don't know.
  Use less than 3 sentences and keep the answer concise.
  [INST]
  Search Query: {search_query}
  Context: {context}
  Answer:
  """

  PROMPT = PROMPT.replace('{context}', retrieved_docs)
  PROMPT = PROMPT.replace('{search_query}', search_query)
  encodeds = tokenizer(PROMPT, return_tensors="pt", add_special_tokens=True)
  model_inputs = encodeds.to('cuda')
  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, temperature = 0.1, top_p = 0.95, top_k=40, repetition_penalty=1.2, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.batch_decode(generated_ids)
  return(decoded[0])

### Step 7: Combining them all

In [None]:
def search(search_query, llm, tokenizer):
  results = get_google_search_results(search_query)
  links = [result['link'] for result in results]
  html_content = '\n'.join(get_html_content(links).values())
  faiss_db = create_faiss_db(html_content)
  retrieved_docs = "\n".join([doc.page_content for doc in faiss_db.similarity_search(search_query)])[:5000]
  llm_out = llm_output(model, tokenizer, search_query, retrieved_docs)
  print(llm_out[llm_out.find("Answer:")+len("Answer:"):-4].strip())
  return results, links, html_content, faiss_db, retrieved_docs, llm_out

### Main Code:

In [None]:
torch.cuda.empty_cache()
model, tokenizer = llm()

### Some search examples:

In [None]:
search_query = "2024 oscar nomination best movie"
results, links, html_content, faiss_db, retrieved_docs, llm_out = search(search_query, model, tokenizer)

5000
<s>  <<SYS>> You are an assistant for search tasks. 
  Use the following pieces of retrieved context to answer the search query. 
  If you don't know the answer, or the answer is not in the context, just say that you don't know. 
  Use three sentences maximum and keep the answer concise.
  <</SYS>> 
  Search Query: 2024 oscar nomination best movie 
  Context: 2024 Oscars Best Picture Predictions 2024 Oscars Best Picture Predictions × Plus Icon Click to expand the Mega Menu Plus Icon Click to Expand Search Input Have a News Tip? Newsletters Switch edition between U.S. Edition Asia Edition Global Edition U.S. Asia Global Variety Log in Account Variety Digital Variety Archives VIP+ Welcome My Account View Variety Archives View VIP+ Variety Digital Login Variety Digital Subscribe Variety Archives Subscribe VIP+ Subscribe Subscribe Log Out Plus Icon Account Plus Icon Variety Digital Variety Archives VIP+ Welcome My Account View Variety Archives View VIP+ Variety Digital Login Variety D

In [None]:
search_query = "stock price of NVIDIA today"
results, links, html_content, faiss_db, retrieved_docs, llm_out = search(search_query, model, tokenizer)

5000
<s>  <<SYS>> You are an assistant for search tasks. 
  Use the following pieces of retrieved context to answer the search query. 
  If you don't know the answer, or the answer is not in the context, just say that you don't know. 
  Use three sentences maximum and keep the answer concise.
  <</SYS>> 
  Search Query: stock price of NVIDIA today 
  Context: NVIDIA Corporation (NVDA) Stock Price, News, Quote & History - Yahoo FinanceHomeMailNewsFinanceSportsEntertainmentSearchMobileMore...Yahoo FinanceSearchSkip to NavigationSkip to Main ContentSkip to Related ContentSign inMailSign in to view your mailFinance HomeWatchlistsMy PortfolioMarketsNewsVideosYahoo Finance PlusScreenersPersonal FinanceCryptoSectorsContact UsWe are experiencing some temporary issues. The market data on this page is currently delayed. Please bear with us as we address this and restore your personalized lists.U.S. markets closedS&P 5005,026.61+28.70(+0.57%)Dow 3038,671.69-54.64(-0.14%)Nasdaq15,990.66+196.95(+1.

In [None]:
search_query = "top news today"
results, links, html_content, faiss_db, retrieved_docs, llm_out = search(search_query, model, tokenizer)

5000
<s>  <<SYS>> You are an assistant for search tasks. 
  Use the following pieces of retrieved context to answer the search query. 
  If you don't know the answer, or the answer is not in the context, just say that you don't know. 
  Use three sentences maximum and keep the answer concise.
  <</SYS>> 
  Search Query: top news today 
  Context: convicted in a child sexual abuse caseFayaz Aziz / ReutersSudan war is ‘world’s largest child displacement crisis’ with millions facing malnourishment, UNICEF saysUkraine frets as leadership shake-up puts a new general in the firing lineHealthClay Hickson for NBC NewsHealth newsHow to get back on track with your New Year’s resolutionsRicardo Rubio / Europa Press via Getty ImagesHealth newsNovo Nordisk settles two lawsuits over copycat versions of Ozempic and WegovyCourtesy IntuitiveHealth newsRobotic device burned a woman's small intestine during surgery, lawsuit allegesHealth newsCDC report finds teens are using drugs — often alone — to ease 