# Environment setup

In [1]:
%pip install --upgrade --quiet  langchain langchain-core langchain-community langchain-google-genai \
                                google-generativeai \
                                tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.8/812.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.6/276.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.4/137.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.5/87.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

## Import necessary libraries

In [2]:
import os

from datetime import datetime

from multiprocessing import Process, Queue

from difflib import unified_diff
from IPython.display import display, HTML

import tiktoken

import google.generativeai as genai

from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.tools import Tool

from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.document_loaders import AsyncHtmlLoader

In [62]:
from google.colab import userdata

# Copy the search credentials from the Colab secret store into environment variables.
# NOTE(review): presumably GoogleSearchAPIWrapper reads GOOGLE_CSE_ID / GOOGLE_API_KEY from the
# environment — confirm against the LangChain docs linked further down in this notebook.
# NOTE(review): os.environ only accepts str values; confirm what userdata.get returns for a missing secret.
os.environ["GOOGLE_CSE_ID"] = userdata.get('GOOGLE_CSE_ID')
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
# The Gemini key is handed to the SDK directly rather than through the environment.
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
genai.configure(api_key=GEMINI_API_KEY)

# Utils

In [60]:
def get_search(query: str = "", k: int = 1):
    """Return the top-k Google search hits for `query`, or None when there are none.

    Args:
        query: Search string sent to Google.
        k: Number of results to request.

    Returns:
        The list of result dicts produced by `GoogleSearchAPIWrapper.results`
        (the test cell in this notebook shows `title`, `link` and `snippet`
        keys), or None when the list is empty or its first entry carries a
        `Result` key.
    """
    search = GoogleSearchAPIWrapper(k=k)

    def search_results(query):
        return search.results(query, k)

    tool = Tool(
        name="Google Search Snippets",
        description="Search Google for recent results.",
        func=search_results,
    )
    ref_text = tool.run(query)

    # Guard the empty case first: indexing [0] on an empty list would raise IndexError.
    if not ref_text:
        return None
    # NOTE(review): a 'Result' key presumably marks the wrapper's "no results" placeholder
    # entry rather than a real hit — confirm against the LangChain wrapper.
    if 'Result' in ref_text[0]:
        return None
    return ref_text

### Test GoogleSearchAPIWrapper

First, you need to set up the proper API keys and environment variables. To set it up, create the GOOGLE_API_KEY in the Google Cloud credential console (https://console.cloud.google.com/apis/credentials) and a GOOGLE_CSE_ID using the Programmable Search Engine (https://programmablesearchengine.google.com/controlpanel/create).

More details:
+ https://python.langchain.com/docs/integrations/tools/google_search

I recommend reading this guide to set up GoogleSearchAPIWrapper:
+ https://stackoverflow.com/questions/37083058/programmatically-searching-google-in-python-using-custom-search

In [66]:
query = "how can i have a girlfriend"
get_search(query=query, k=3)

[{'title': 'What is the best way to find a girlfriend? : r/NoStupidQuestions',
  'link': 'https://www.reddit.com/r/NoStupidQuestions/comments/psbap8/what_is_the_best_way_to_find_a_girlfriend/',
  'snippet': 'Sep 21, 2021 ... Look for some sort of clubs, singles groups, meet ups, any kind of casual gatherings where you can get more comfortable in social situations.'},
 {'title': 'What is the easiest and fastest way to get a girlfriend? How I can get ...',
  'link': 'https://www.quora.com/What-is-the-easiest-and-fastest-way-to-get-a-girlfriend-How-I-can-get-one-I-am-23-years-old-and-never-had-a-girlfriend-I-am-shy-around-women-I-want-to-make-it-happen-Please-give-me-good-advice',
  'snippet': 'May 13, 2020 ... Practice talking to girls. Get comfortable with it. Perhaps even make female friends. That could lead to something. · Practice talking to girls\xa0...'},
 {'title': '[serious] What is it like to have a girlfriend? : r/AskReddit',
  'link': 'https://www.reddit.com/r/AskReddit/commen

In [6]:
def get_page_content(link:str):
    """Download `link` and return the page converted from HTML to text.

    Returns the `page_content` of the first transformed document, or None
    when the transformer yields no documents.
    """
    fetched_docs = AsyncHtmlLoader([link]).load()
    text_docs = Html2TextTransformer().transform_documents(fetched_docs)
    if len(text_docs) == 0:
        return None
    return text_docs[0].page_content

In [7]:
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count how many tokens `string` encodes to under the named tiktoken encoding."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

def chunk_text_by_sentence(text, chunk_size=2048):
    """Return the first sentence-aligned chunk of `text` that fits in `chunk_size` tokens.

    The text is split on '. ' and sentences are accumulated while the token
    count of the chunk so far, plus the next sentence, plus 2 (allowance for
    the '. ' separator) stays within `chunk_size`. Only the first chunk is
    returned, so scanning stops as soon as that chunk is complete.

    Args:
        text: Text to chunk.
        chunk_size: Token budget for the chunk.

    Returns:
        The first chunk as a string. If the very first sentence alone exceeds
        the budget, accumulation restarts from that sentence instead of
        returning an empty string, so the result can then exceed `chunk_size`.
    """
    curr_chunk = []
    for sentence in text.split('. '):
        joined = ". ".join(curr_chunk)
        if num_tokens_from_string(joined) + num_tokens_from_string(sentence) + 2 <= chunk_size:
            curr_chunk.append(sentence)
            continue
        # The budget is exhausted. A non-empty chunk is the answer; an empty one means
        # nothing has been collected yet, so restart from this sentence rather than
        # returning "".
        if joined:
            return joined
        curr_chunk = [sentence]
    # Ran out of sentences before the budget was hit: everything collected is the chunk.
    return ". ".join(curr_chunk)

def chunk_text_front(text, chunk_size = 2048):
    '''
    Return approximately the first `chunk_size` tokens of `text`.

    The cut is made on characters, not tokens: when the text is too long, the
    prefix kept is `len(text) * chunk_size / tokens` characters, i.e. it assumes
    tokens are spread evenly over the characters.

    Returns `text` unchanged when it has fewer than `chunk_size` tokens.
    '''
    tokens = num_tokens_from_string(text)
    if tokens < chunk_size:
        return text
    # Scale the character length by the fraction of tokens we are allowed to keep.
    ratio = float(chunk_size) / tokens
    return text[:int(len(text) * ratio)]

def chunk_texts(text, chunk_size = 2048):
    '''
    Split `text` into consecutive, near-equal character slices.

    When the text has fewer than `chunk_size` tokens it is returned as a
    single-element list. Otherwise it is cut into
    `int(tokens / chunk_size) + 1` parts by character count; the parts differ
    in length by at most one character and concatenate back to `text`.

    Returns a list of strings: [text, text, text]
    '''
    tokens = num_tokens_from_string(text)
    if tokens < chunk_size:
        return [text]

    n = int(tokens/chunk_size) + 1
    # part_length characters per part; the first `extra` parts take one more
    # character each so that nothing is left over.
    part_length, extra = divmod(len(text), n)
    parts = []
    start = 0
    for i in range(n):
        end = start + part_length + (1 if i < extra else 0)
        parts.append(text[start:end])
        start = end
    return parts


# Experiment

In [46]:
# Preamble that get_draft, get_query and get_revise_answer place at the start of the
# text they send to the model.
# The f-string is evaluated once, when this cell runs, so "Current date" is fixed at
# that moment and does not change on later calls.
gemini_system_prompt = f'''
You are Gemini, is a large language model created by Google AI trained on a massive dataset of text and code. As a powerful tool, you can help with various tasks such as writing, research, problem-solving, and translation. As a machine learning model, you are constantly learning and improving.
Knowledge cutoff: 2024-02
Current date: {datetime.now().strftime('%Y-%m-%d')}
'''

def get_draft(question):
  """Ask the 'gemini-pro' model for a first-pass, paragraph-structured answer.

  The request text is the shared system preamble, the question, and the
  drafting instructions below, sent as a single user turn at temperature 1.0.

  Returns the response text, or None (after printing the error) when reading
  the response text raises.
  """
  # Not a raw string: the backslash-n pair inside the backticks below is sent
  # to the model as two real newline characters.
  draft_prompt = '''
  IMPORTANT:
  Try to answer this question/instruction with step-by-step thoughts and make the answer more structural.
  Use `\n\n` to split the answer into several paragraphs.
  Just respond to the instruction directly. DO NOT add additional explanations or introducement in the answer unless you are asked to.
  '''
  user_turn = {
      "role": "user",
      "parts": [f"{gemini_system_prompt}\n{question}" + draft_prompt],
  }
  sampling = genai.types.GenerationConfig(temperature=1.0)
  reply = genai.GenerativeModel('gemini-pro').generate_content(
      [user_turn],
      generation_config=sampling,
  )
  try:
    return reply.text
  except Exception as err:
    print(f'{type(err).__name__}: {err}')
    return None

In [47]:
question = "What is an apple?"
sample_ans = get_draft(question)
sample_ans

'1. **Identify the task:** The task is to define "apple".\n2. **Retrieve relevant knowledge:** As a large language model, I have comprehensive knowledge about apples.\n3. **Organize the knowledge:**\n   - Apples are fruits.\n   - They typically have a round shape and red skin, but can vary in color and shape.\n   - They are crunchy and juicy, with a sweet or tart flavor.\n   - They have a core with seeds.\n   - They are a good source of fiber and vitamins.'

In [9]:
def split_draft(draft, split_char = '\n\n'):
  """Break `draft` into paragraphs on `split_char` (a blank line by default).

  Returns the list of pieces in their original order.
  """
  return draft.split(split_char)

In [51]:
def get_query(question, answer):
  """Ask the 'gemini-pro' model for a short search query that checks `answer`.

  The request combines the shared system preamble, the question, the answer
  written so far and the instructions below in one user turn
  (temperature 1.0, top_k 3, top_p 0.8).

  Returns the response text, or None (after printing the error) when reading
  the response text raises.
  """
  # The instructions mention Bing, while get_content in this notebook retrieves
  # pages through get_search (Google). The wording is left exactly as it was.
  query_prompt = '''
I want to verify the content correctness of the given question, especially the last sentences.
Please summarize the content with the corresponding question.
This summarization will be used as a query to search with Bing search engine.
The query should be short but need to be specific to promise Bing can find related knowledge or pages.
You can also use search syntax to make the query short and clear enough for the search engine to find relevant language data.
Try to make the query as relevant as possible to the last few sentences in the content.
**IMPORTANT**
Just output the query directly. DO NOT add additional explanations or introducement in the answer unless you are asked to.
'''
  user_turn = {
      "role": "user",
      "parts": f"{gemini_system_prompt}\n##Question: {question}\n\n##Content: {answer}\n\n##Instruction: {query_prompt}",
  }
  sampling = genai.types.GenerationConfig(temperature=1.0, top_k=3, top_p=0.8)
  reply = genai.GenerativeModel('gemini-pro').generate_content(
      [user_turn],
      generation_config=sampling,
  )
  try:
    return reply.text
  except Exception as err:
    print(f'{type(err).__name__}: {err}')
    return None

In [50]:
get_query(question, sample_ans)

'Apple nutritional value'

In [11]:
def get_content(query):
  """Fetch the top search hit for `query` and return its text in chunks.

  Takes the first result from get_search, downloads that page with
  get_page_content, splits the text with chunk_texts (1500-token budget) and
  replaces every newline in each chunk with a space.

  Returns the list of chunks, or None (after printing a message) when the
  search gives nothing or the page has no content.
  """
  hits = get_search(query, 1)
  if not hits:
    print(">>> No good Google Search Result was found")
    return None

  # Each hit also carries 'title' and 'snippet'; only the URL is needed here.
  link = hits[0]['link']
  page_text = get_page_content(link)
  if not page_text:
    print(f">>> No content was found in {link}")
    return None

  return [piece.replace('\n', " ") for piece in chunk_texts(page_text, 1500)]


In [52]:
def get_revise_answer(question, answer, content):
  """Ask the 'gemini-pro' model to revise `answer` using retrieved `content`.

  The request combines the shared system preamble, the retrieved text, the
  question, the current answer and the instructions below in one user turn
  (temperature 1.0, top_k 3, top_p 0.8).

  Returns the response text, or None (after printing the error) when reading
  the response text raises.
  """
  # Not a raw string: the backslash-n pair inside the backticks below is sent
  # to the model as two real newline characters.
  revise_prompt = '''
I want to revise the answer according to retrieved related text of the question in WIKI pages.
You need to check whether the answer is correct.
If you find some errors in the answer, revise the answer to make it better.
If you find some necessary details are ignored, add it to make the answer more plausible according to the related text.
If you find the answer is right and do not need to add more details, just output the original answer directly.
**IMPORTANT**
Try to keep the structure (multiple paragraphs with its subtitles) in the revised answer and make it more structual for understanding.
Split the paragraphs with `\n\n` characters.
Just output the revised answer directly. DO NOT add additional explanations or annoucement in the revised answer unless you are asked to.
'''
  user_turn = {
      "role": "user",
      "parts": f"{gemini_system_prompt}\n##Existing Text in Wiki Web: {content}\n\n##Question: {question}\n\n##Answer: {answer}\n\n##Instruction: {revise_prompt}",
  }
  sampling = genai.types.GenerationConfig(temperature=1.0, top_k=3, top_p=0.8)
  reply = genai.GenerativeModel('gemini-pro').generate_content(
      [user_turn],
      generation_config=sampling,
  )
  try:
    return reply.text
  except Exception as err:
    print(f'{type(err).__name__}: {err}')
    return None

In [53]:
def get_query_wrapper(q, question, answer):
    """Call get_query(question, answer) and put its return value on queue `q`."""
    q.put(get_query(question, answer))

def get_content_wrapper(q, query):
    """Call get_content(query) and put its return value on queue `q`."""
    q.put(get_content(query))

def get_revise_answer_wrapper(q, question, answer, content):
    """Call get_revise_answer(question, answer, content) and put its return value on queue `q`."""
    q.put(get_revise_answer(question, answer, content))

from multiprocessing import Process, Queue
def run_with_timeout(func, timeout, *args, **kwargs):
    """Run `func` in a child process and return what it puts on the queue.

    `func` is called as `func(q, *args, **kwargs)` and is expected to put
    exactly one result on `q`.

    Args:
        func: Callable executed in the child process; receives the queue first.
        timeout: Seconds to wait for a result; None waits for as long as the
            child is alive.
        *args, **kwargs: Forwarded to `func` after the queue.

    Returns:
        The object the child put on the queue, or None when the child timed
        out (it is then terminated) or exited without putting anything.
    """
    import time
    from queue import Empty

    q = Queue()  # Channel the child uses to hand its result back
    p = Process(target=func, args=(q, *args), kwargs=kwargs)
    p.start()

    deadline = None if timeout is None else time.monotonic() + timeout
    poll_interval = 0.05
    result = None
    received = False

    # Read from the queue *before* joining: a child that has put a large object cannot
    # exit until the parent consumes it, so join-then-get can turn a finished call into
    # a timeout. Polling also lets us notice a child that died without a result.
    while True:
        try:
            result = q.get(timeout=poll_interval)
            received = True
            break
        except Empty:
            pass
        if not p.is_alive():
            # The child is gone; give an item that is still in flight one last chance.
            try:
                result = q.get(timeout=poll_interval)
                received = True
            except Empty:
                pass
            break
        if deadline is not None and time.monotonic() >= deadline:
            break

    if received:
        p.join()  # The result has been consumed, so the child can finish flushing and exit
        print(f"{datetime.now()} [INFO] Function {str(func)} execution completed successfully")
        return result

    if p.is_alive():
        print(f"{datetime.now()} [INFO] The execution of function {str(func)} has timed out ({timeout}s), terminating the process...")
        p.terminate()  # Terminate process
        p.join()  # Make sure the process has been terminated
    else:
        # Previously this case blocked forever on q.get(): the child ended (for example
        # by raising) without ever putting a result.
        p.join()
        print(f"{datetime.now()} [INFO] Function {str(func)} exited without returning a result (exit code {p.exitcode})")
    return None

In [54]:
def generate_diff_html(text1, text2):
    """Render a unified diff of `text1` versus `text2` as colour-coded HTML.

    Each diff line becomes its own <div>: lines starting with '+' are green,
    '-' red, '@' (hunk headers) blue, and context lines are left uncoloured.
    The '---' / '+++' file header lines fall under the '-' / '+' rules.
    Line text is HTML-escaped, so markup inside the compared texts is shown
    literally instead of being interpreted when the result is displayed.

    Args:
        text1: Original text.
        text2: Revised text.

    Returns:
        An HTML string, or '' when the two texts are identical.
    """
    from html import escape

    diff = unified_diff(text1.splitlines(keepends=True),
                        text2.splitlines(keepends=True),
                        fromfile='text1', tofile='text2')

    colors = {'+': 'green', '-': 'red', '@': 'blue'}
    pieces = []
    for line in diff:
        body = escape(line.rstrip())
        color = colors.get(line[:1])
        if color:
            pieces.append(f"<div style='color:{color};'>{body}</div>")
        else:
            pieces.append(f"<div>{body}</div>")
    return "".join(pieces)

In [55]:
# RAT Function
newline_char = '\n'

def rat(question):
    """Answer `question` by drafting, then revising the draft paragraph by paragraph.

    Steps:
      1. get_draft produces a draft, which split_draft cuts into paragraphs.
      2. Paragraphs are appended to the running answer one at a time. After each
         append, a search query is generated from the answer so far, the top
         page for that query is fetched, and the answer is revised against up
         to three chunks of that page.
      3. Every step runs through run_with_timeout (3s query, 5s retrieval,
         10s revision). A step that yields nothing is skipped and the answer
         is left as it was.

    Args:
        question: The user question.

    Returns:
        (draft, answer): the original draft and the revised answer, or
        (None, None) when no draft could be generated.
    """
    max_chunks = 3  # Revise against at most this many chunks of a retrieved page

    print(f"{datetime.now()} [INFO] Get draft...")
    draft = get_draft(question)
    print(f"{datetime.now()} [INFO] Return to draft")
    print(f"##################### DRAFT #######################")
    print(draft)
    print(f"#####################  END  #######################")

    # get_draft returns None when the response text cannot be read; without this guard
    # split_draft(None) would raise AttributeError.
    if draft is None:
        print(f"{datetime.now()} [INFO] No draft was generated, nothing to revise")
        return None, None

    print(f"{datetime.now()} [INFO] Work on drafts...")
    draft_paragraphs = split_draft(draft)
    print(f"{datetime.now()} [INFO] The draft is divided into {len(draft_paragraphs)} parts")
    answer = ""
    for i, p in enumerate(draft_paragraphs):
        print(str(i)*80)
        print(f"{datetime.now()} [INFO] Modify section {i+1}/{len(draft_paragraphs)}...")
        # The paragraph is appended before any skip below, so it stays in the answer
        # even when this round's revision is skipped.
        answer = answer + '\n\n' + p

        print(f"{datetime.now()} [INFO] Generate corresponding Query...")
        query = run_with_timeout(get_query_wrapper, 3, question, answer)
        if not query:
            print(f"{datetime.now()} [INFO] Skip next steps...")
            continue
        print(f">>> {i}/{len(draft_paragraphs)} Query: {query.replace(newline_char, ' ')}")

        print(f"{datetime.now()} [INFO] Get web content...")
        content = run_with_timeout(get_content_wrapper, 5, query)
        if not content:
            print(f"{datetime.now()} [INFO] Skip next steps...")
            continue

        chunk_total = min(len(content), max_chunks)
        for j, c in enumerate(content[:max_chunks]):
            print(f"{datetime.now()} [INFO] Modify the corresponding answer according to the content of the web page...[{j}/{chunk_total}]")
            res = run_with_timeout(get_revise_answer_wrapper, 10, question, answer, c)
            if not res:
                print(f"{datetime.now()} [INFO] Skip next steps...")
                continue
            # Show what the revision changed before adopting it.
            diff_html = generate_diff_html(answer, res)
            display(HTML(diff_html))
            answer = res
            print(f"{datetime.now()} [INFO] Answer modification completed[{j}/{chunk_total}]")
    return draft, answer



In [67]:
%%time

question = "How can I get a girlfriend?"
draft, answer = rat(question)

2024-04-03 06:40:15.917650 [INFO] Get draft...
2024-04-03 06:40:21.867570 [INFO] Return to draft
##################### DRAFT #######################
1. **Be yourself.** This is the most important tip for any aspect of finding a girlfriend. People can tell when you're being fake, so just be yourself and let your personality shine through.
2. **Put yourself out there.** You can't meet someone if you're always sitting at home. Join a club, volunteer, or take a class. The more people you meet, the more likely you are to find someone you click with.
3. **Be confident.** Confidence is attractive, so work on building your self-esteem. Stand up straight, make eye contact, and smile. People are more likely to be drawn to you if you seem confident and approachable.
4. **Be a good listener.** When you're talking to someone, really listen to what they're saying. Ask questions and show that you're interested in what they have to say. People are more likely to want to spend time with you if they fee

Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]

2024-04-03 06:40:28.921739 [INFO] The execution of function <function get_content_wrapper at 0x79a92c4d7b50> has timed out (5s), terminating the process...
2024-04-03 06:40:28.929401 [INFO] Skip next steps...
CPU times: user 136 ms, sys: 34.7 ms, total: 171 ms
Wall time: 13 s


In [68]:
answer

"\n\n1. **Be yourself.** This is the most important tip for any aspect of finding a girlfriend. People can tell when you're being fake, so just be yourself and let your personality shine through.\n2. **Put yourself out there.** You can't meet someone if you're always sitting at home. Join a club, volunteer, or take a class. The more people you meet, the more likely you are to find someone you click with.\n3. **Be confident.** Confidence is attractive, so work on building your self-esteem. Stand up straight, make eye contact, and smile. People are more likely to be drawn to you if you seem confident and approachable.\n4. **Be a good listener.** When you're talking to someone, really listen to what they're saying. Ask questions and show that you're interested in what they have to say. People are more likely to want to spend time with you if they feel like you're genuinely interested in them.\n5. **Be respectful.** Treat others the way you want to be treated. Be polite and respectful, eve