### 0/ Configuration for Data Preparation Using the Google API and BeautifulSoup

In [2]:
%pip install BeautifulSoup4 google-api-python-client python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from googleapiclient.discovery import build
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

class DocumentationBuilder:
    """Collect the raw HTML of the top Google Custom Search hits for a query.

    Credentials are read from the ``GOOGLE_API_KEY`` and ``SEARCH_ENGINE_ID``
    environment variables (populated by ``load_dotenv()`` above).

    Args:
        query: Search string passed to the Custom Search API.
        num_results: Number of results requested from the API. The Custom
            Search JSON API caps this at 10 per request.
    """

    # Seconds to wait on a single page download before giving up. Without a
    # timeout, one unresponsive server would stall the whole notebook.
    REQUEST_TIMEOUT = 10

    def __init__(self, query, num_results):
        self.google_api_key = os.getenv('GOOGLE_API_KEY')
        self.search_engine_id = os.getenv('SEARCH_ENGINE_ID')
        self.query = query
        self.num_results = num_results

    def google_search(self):
        """Run the search and return the list of result URLs.

        Returns:
            A list of link strings; empty when the search has no hits.
        """
        service = build("customsearch", "v1", developerKey=self.google_api_key)
        res = service.cse().list(q=self.query, cx=self.search_engine_id, num=self.num_results).execute()
        # The response has no 'items' key at all when nothing matches, so
        # indexing it directly would raise KeyError.
        return [item['link'] for item in res.get('items', [])]

    def scrape_html(self, url):
        """Download ``url`` and return its HTML, as re-serialised by BeautifulSoup."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, "html.parser")
        return str(soup)

    def build_documentation(self):
        """Search, then scrape every hit.

        Returns:
            A list of ``{'url': ..., 'text': ...}`` dicts, one per search result,
            where ``text`` is the page's HTML.
        """
        return [
            {'url': url, 'text': self.scrape_html(url)}
            for url in self.google_search()
        ]



In [4]:
# Search terms sent to Google Custom Search; edit this to change the topic.
query = 'Metaverse 2023'
# Number of search results to request (one page is scraped per result).
num_results = 3

# Run the search and download every hit; the result is a list of
# {'url': ..., 'text': ...} dicts where 'text' holds the page's HTML.
doc_builder = DocumentationBuilder(query, num_results)
raw_documentation = doc_builder.build_documentation()

In [5]:
# Build a DataFrame from the scraped records ('url' and 'text' keys become columns).
df = pd.DataFrame(raw_documentation)
# Save a copy to CSV, without the index, so it can be reloaded later.
df.to_csv('raw_documentation.csv', index=False)

In [6]:
df

Unnamed: 0,url,text
0,https://hbr.org/2023/05/yes-the-metaverse-is-s...,<!DOCTYPE html>\n\n<!--[if IE 8]>\n<html class...
1,https://www.forbes.com/sites/bernardmarr/2022/...,"<!DOCTYPE html>\n<html lang=""en""><head><link a..."
2,https://www.pwc.com/us/en/tech-effect/innovati...,"\n<!DOCTYPE HTML>\n\n<html lang=""en"">\n<head>\..."


### 1/ Data preparation for LLM Chatbot RAG

In [7]:
from pyspark.sql import SparkSession

# Get the active SparkSession, or create one named "SearchAI" if none exists.
# `spark` is used below to turn the scraped pandas DataFrame into a Spark one.
spark = SparkSession.builder \
    .appName("SearchAI") \
    .getOrCreate()


your 131072x1 screen size is bogus. expect trouble
23/12/13 10:18:01 WARN Utils: Your hostname, JadenRazer resolves to a loopback address: 127.0.1.1; using 172.19.39.166 instead (on interface eth0)
23/12/13 10:18:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/13 10:18:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
%pip install -q langchain transformers

Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip list

Package                  Version
------------------------ ------------
aiohttp                  3.9.1
aiosignal                1.3.1
annotated-types          0.6.0
anyio                    4.1.0
asttokens                2.4.1
attrs                    23.1.0
beautifulsoup4           4.12.2
Bottleneck               1.3.5
cachetools               5.3.2
certifi                  2023.11.17
charset-normalizer       3.3.2
comm                     0.1.4
dataclasses-json         0.6.3
dbdemos                  0.3.43
dbsqlclone               0.1.24
debugpy                  1.6.7
decorator                5.1.1
exceptiongroup           1.2.0
executing                2.0.1
filelock                 3.13.1
frozenlist               1.4.0
fsspec                   2023.12.2
google-api-core          2.15.0
google-api-python-client 2.110.0
google-auth              2.25.2
google-auth-httplib2     0.1.1
googleapis-common-protos 1.62.0
greenlet                 3.0.2
httplib2                 0.22.0
huggingfac

#### Splitting HTML pages into smaller chunks

In [10]:
from pyspark.sql.functions import pandas_udf
import pandas as pd
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Assumes `df` is the pandas DataFrame produced by the scraping step above.

# Upper bound on chunk length, measured with `tokenizer` (see split_html_on_h2).
max_chunk_size = 500
# Tokenizer used to measure chunk lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
# Fallback splitter for sections that are still too long; sizes are measured with `tokenizer`.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=max_chunk_size, chunk_overlap=50)
# First-pass splitter: cuts the page at <h2> headings and stores each heading under metadata key "header2".
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=[("h2", "header2")])

def split_html_on_h2(html, min_chunk_size = 20, max_chunk_size=500):
  """Split an HTML page into text chunks, using <h2> headings as section breaks.

  Sections produced by the module-level ``html_splitter`` are accumulated in a
  buffer for as long as the buffer plus the next section encodes to at most
  ``max_chunk_size / 2`` tokens. When the next section would exceed that, the
  buffer is run through the module-level ``text_splitter`` and a new buffer is
  started from that section. Finally, chunks whose token count is not strictly
  greater than ``min_chunk_size`` are dropped.

  Token counts come from the module-level ``tokenizer``.

  Returns:
    A list of chunk strings.
  """
  def token_count(text):
    return len(tokenizer.encode(text))

  merge_limit = max_chunk_size / 2
  pieces = []
  buffer = ""
  for section in html_splitter.split_text(html):
    # Prefix the section with its h2 title (empty string when it has none).
    section_text = section.metadata.get('header2', "") + "\n" + section.page_content
    if token_count(buffer + section_text) > merge_limit:
      # Too big to merge: emit what has been buffered and restart from this section.
      pieces.extend(text_splitter.split_text(buffer.strip()))
      buffer = section_text + "\n"
      continue
    buffer += section_text + "\n"
  # Emit whatever is left over after the last section.
  if buffer:
    pieces.extend(text_splitter.split_text(buffer.strip()))
  # Drop chunks that are too small to be useful.
  return [piece for piece in pieces if token_count(piece) > min_chunk_size]

# Register the UDF in Spark
@pandas_udf("array<string>")
def parse_and_split(docs: pd.Series) -> pd.Series:
    """Pandas UDF that chunks each HTML document in ``docs``.

    Applies ``split_html_on_h2`` with its default size limits to every element,
    returning one list of chunk strings per input document, in line with the
    declared ``array<string>`` return type.
    """
    return docs.apply(split_html_on_h2)


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [11]:
# Convert the scraped pandas DataFrame (`df`) to a Spark DataFrame.
spark_df = spark.createDataFrame(df)

# Add a 'chunked_text' column (array<string>) by running the parse_and_split
# UDF over the 'text' column. `spark_df` itself is left unchanged.
chunked_documents = spark_df.withColumn("chunked_text", parse_and_split(spark_df["text"]))

# Now you can work with the chunked documents in your Spark DataFrame

In [12]:
spark_df.show()

                                                                                

+--------------------+--------------------+
|                 url|                text|
+--------------------+--------------------+
|https://hbr.org/2...|<!DOCTYPE html>\n...|
|https://www.forbe...|<!DOCTYPE html>\n...|
|https://www.pwc.c...|\n<!DOCTYPE HTML>...|
+--------------------+--------------------+



In [13]:
chunked_documents

DataFrame[url: string, text: string, chunked_text: array<string>]

In [14]:
chunked_documents.show(2)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax ha

+--------------------+--------------------+--------------------+
|                 url|                text|        chunked_text|
+--------------------+--------------------+--------------------+
|https://hbr.org/2...|<!DOCTYPE html>\n...|[Navigation Menu ...|
|https://www.forbe...|<!DOCTYPE html>\n...|[Subscribe to new...|
+--------------------+--------------------+--------------------+
only showing top 2 rows



                                                                                

In [15]:
%pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [16]:
# Pull a single row's HTML back to the driver so the chunking can be inspected locally.
html_row = spark_df.select("text").limit(1).collect()
html_content = html_row[0]['text']

# Call the chunking function directly (outside the Spark UDF) on that one page.
chunks = split_html_on_h2(html_content)

In [17]:
display(chunks)

['Navigation Menu  \nSubscribe Sign In Search Menu  \nAccount Menu  \nAccount Menu  \nHi,  \n\xa0Guest  \nClose menu  \nSearch CLEAR  \nSUGGESTED TOPICS  \nExplore HBR  \nLatest The Magazine Ascend Podcasts Video Store Webinars Newsletters  \nPopular Topics  \nManaging Yourself Leadership Strategy Managing Teams Gender Innovation Work-life Balance All Topics  \nFor Subscribers  \nThe Big Idea Data & Visuals Reading Lists Case Selections HBR Learning Subscribe  \nMy Account  \nMy Library Topic Feeds Orders Account Settings Email Preferences Log Out Sign In  \nSubscribe Latest Podcasts Video The Magazine Ascend Store Webinars Newsletters All Topics The Big Idea Data & Visuals Reading Lists Case Selections HBR Learning My Library Account Settings Log Out Sign In  \nYour Cart  \nVisit Our Store  \nYour Shopping Cart is empty.  \nGuest User  \nSubscriber  \nMy Library Topic Feeds Orders Account Settings Email Preferences Log Out  \nReading List  \nReading Lists  \nYou have 1 free articles l

#### Creating an embedding endpoint with MLflow