In [None]:
import os

# OpenAI credential taken from the process environment so it is never stored in
# the notebook; the value is None when the variable is not set.
api_key = os.environ.get("openai_api_key")

In [1]:
# Prints a fixed greeting — presumably a quick check that the kernel runs cells.
print("hello")

hello


In [None]:
import aiohttp
from bs4 import BeautifulSoup
import pprint
from playwright.async_api import async_playwright
from langchain.chains import create_extraction_chain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.document_transformers import BeautifulSoupTransformer


In [None]:
# Chat model shared by `extract` below; temperature 0 is requested for every call.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0,
    openai_api_key=api_key,
)

# Minimal menu schema: one record per dish with its price and description.
schema1 = dict(
    properties={
        "food": dict(type="string"),
        "price": dict(type="integer"),
        "description": dict(type="string"),
    },
    required=["food", "price", "description"],
)

# Richer menu schema: every field also carries a title and a description.
schema2 = dict(
    properties={
        "food category": dict(
            title="Category",
            description="The main food category in menu",
            type="string",
        ),
        "food item": dict(
            title="Food name",
            description="The main food item in menu",
            type="string",
        ),
        "food price": dict(
            title="Price",
            description="The price of each food item",
            type="integer",
        ),
        "description": dict(
            title="Description",
            description="The description of food item",
            type="string",
        ),
    },
    required=["food category", "food item", "food price", "description"],
)

# Generic product schema: a product name plus its description.
structured_schema = dict(
    properties={
        "product": dict(type="string"),
        "description": dict(type="string"),
    },
    required=["product", "description"],
)

def extract(content: str, schema: dict):
    """Run an LLM extraction chain over ``content`` and return its result.

    Args:
        content: Text to extract structured records from.
        schema: Schema dict describing the fields to extract.

    Returns:
        Whatever the chain's ``run`` call produces for ``content``.
    """
    extraction_chain = create_extraction_chain(schema=schema, llm=llm)
    return extraction_chain.run(content)

In [None]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter
def scrape_with_playwright(urls, schema):
    """Download pages, convert them to text and extract data from the first chunk.

    Despite the name, the pages are fetched with ``AsyncHtmlLoader``; Playwright
    is not used in this function.

    Args:
        urls: List of page URLs to download.
        schema: Extraction schema forwarded to ``extract``.

    Returns:
        The result of ``extract`` for the first text chunk, or an empty list
        when the downloaded pages yield no text to split.
    """
    loader = AsyncHtmlLoader(urls)
    docs = loader.load()

    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents(docs)

    print("Extracting content with LLM")

    # Split the text into chunks of at most 800 tokens (tiktoken count), no overlap.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=800, chunk_overlap=0
    )
    splits = splitter.split_documents(docs_transformed)

    # A failed or empty download produces no chunks; report it instead of
    # failing with an IndexError on `splits[0]`.
    if not splits:
        print("No text content found in the downloaded pages; nothing to extract")
        return []

    # Only the first chunk is sent to the LLM.
    extracted_content = extract(schema=schema, content=splits[0].page_content)
    pprint.pprint(extracted_content)
    return extracted_content

# Scrape a single menu page and extract it with the detailed `schema2`.
urls = [
    "https://cafebrazil.com/menu/",
]
extracted_content = scrape_with_playwright(urls=urls, schema=schema2)


#######################################################################

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

# Rebinds `llm` to a chat model configured like the one created earlier in the
# notebook (same model name, temperature 0, same API key variable).
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0,
    openai_api_key=api_key,
)

In [None]:
# Candidate URL lists for the loaders in this section.
urls = [
    "https://getglasskin.com/",
    "https://cafebrazil.com/menu/",
]
url = [
    "https://cafebrazil.com/menu",
    "https://thetownedeli.com/food-menu/",
]
url2 = [
    "https://cafebrazil.com",
]

from langchain_community.document_loaders import SeleniumURLLoader
# Load only the pages listed in `url2` through the Selenium-based loader.
loader = SeleniumURLLoader(urls=url2)
data = loader.load()
# data1 = data[1]
# data2 = data
# Pretty-print the loaded documents for inspection.
pprint.pprint(data)

In [None]:
#Alternative loader
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader

# Crawl starting from `url` with a maximum depth of 2, reducing every fetched
# page to the text BeautifulSoup extracts from its HTML.
# Note: `url` and `loader` are rebound here (earlier cells used them differently).
url = "https://cafebrazil.com"
loader = RecursiveUrlLoader(
    url=url,
    max_depth=2,
    extractor=lambda html: Soup(html, "html.parser").text,
)
docs = loader.load()


In [None]:
# Pretty-print every document currently held in `docs`.
pprint.pprint(docs)

In [None]:
# Prompt for finding menu links: the system message receives the page content
# through `data`, the user message carries the request through `input`.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a world class food menu analyzer. "
            "Your job is to extract all menu link from the given {data}",
        ),
        ("user", "{input}"),
    ]
)

In [None]:

# Prompt for extracting menu items. The page content is supplied through the
# `data` variable so that it matches the keys passed to `invoke` ("input" and
# "data"); an indexed placeholder such as `{docs[0]}` is not a usable template
# variable and cannot be filled from those keys.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a world class food menu analyzer. Your job is to extract menu data from the given {data}"),
    ("user", "{input}")
])

In [None]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Output schema with a single field: the URL of a page holding menu information.
class Menu_Links(BaseModel):
    url_link: str = Field(
        description="url link of the menu information",
    )
    
# Output schema for one menu entry: category, item name, description and price.
class Menu(BaseModel):
    category: str = Field(
        description="category of the food menu",
    )
    menu_item: str = Field(
        description="menu item name",
    )
    description: str = Field(
        description="menu item description with ingredients",
    )
    # NOTE(review): annotated as float although the description allows 'NA' —
    # confirm the intended type.
    price: float = Field(
        description="price of the menu in USD. 'NA' if not available",
    )

# JSON output parser tied to the `Menu` schema; the commented line below is the
# alternative parser for the `Menu_Links` schema.
parser = JsonOutputParser(pydantic_object=Menu)
# parser = JsonOutputParser(pydantic_object=Menu_Links)

In [None]:
# Compose the pipeline: prompt -> chat model -> output parser.
chain1 = prompt | llm | parser

In [None]:
# `data` is passed as its own template input. The `input` text is inserted
# verbatim (braces inside a value are not expanded), so it refers to the data in
# words instead of carrying a literal placeholder.
results = chain1.invoke({
    "input": "Please extract all the url links of the information related to the menu from the given data and store it in dictionary",
    "data": docs,
})

In [None]:
# Pretty-print the parsed output of the link-extraction request.
pprint.pprint(results)

In [None]:
# Same composition as `chain1`: prompt -> chat model -> output parser.
# NOTE(review): the following invoke cell still calls `chain1`, not `chain` — confirm which was intended.
chain = prompt | llm | parser

In [None]:
# `data` is passed as its own template input. The `input` text is inserted
# verbatim (braces inside a value are not expanded), so it refers to the data in
# words instead of carrying a literal placeholder.
results = chain1.invoke({
    "input": "Please extract the category, list of all the menu items name within that category along with description in single string format and price of the menu item from the given data in a json format. If price is not available, please leave the field empty",
    "data": docs,
})

In [None]:
# Pretty-print the parsed output of the menu-extraction request.
pprint.pprint(results)

In [None]:
# Pretty-print `results` again (same call as the previous cell).
pprint.pprint(results)

###############################################################

In [None]:
# Candidate URL lists for the Selenium loader in this cell.
urls = [
    "https://getglasskin.com/",
    "https://cafebrazil.com",
]
url1 = [
    "https://cafebrazil.com/menu/",
]
url_2 = [
    "https://getglasskin.com/",
]
from langchain_community.document_loaders import SeleniumURLLoader
# Load only the pages listed in `url1` through the Selenium-based loader.
loader = SeleniumURLLoader(urls=url1)
data = loader.load()
# Pretty-print the loaded documents for inspection.
pprint.pprint(data)

In [None]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate
)
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field


# Build the chat model used for the Pydantic-parsed menu request.
# The key comes from `api_key`, read from the environment in the notebook's first
# cell; `openai_key_2` is not defined anywhere in the notebook and raised a
# NameError on a fresh kernel.
llm_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125", openai_api_key=api_key)



In [None]:
#Defining output datatypes within the class and initialize the parser
# Wrapper schema with a single `menu` field holding the list of extracted entries.
# NOTE: this rebinds the name `Menu`, replacing the model defined earlier in the notebook.
class Menu(BaseModel):
    menu: list = Field(
        description=(
            "Python list of dictionaries containing category of food, food menu, "
            "price of the menu in USD. 'NA' if not available, "
            "menu description with ingredients"
        )
    )
    
# Parser that turns the model's text reply into a `Menu` instance; it also
# supplies the format instructions appended to the request below.
parser = PydanticOutputParser(pydantic_object=Menu)

In [None]:
# Single human message: the request text followed by the parser's format instructions.
human_prompt = HumanMessagePromptTemplate.from_template("{request}\n{format_instructions}")
chat_prompt = ChatPromptTemplate.from_messages([human_prompt])

# Values given to `format_prompt` are inserted verbatim, so a literal "{data}"
# inside the request would reach the model as-is and the scraped pages would
# never be sent. Embed the text of the loaded documents explicitly instead.
page_text = "\n\n".join(doc.page_content for doc in data)
request = chat_prompt.format_prompt(
    request=f"Give me all the details about the menu from the following page content:\n{page_text}",
    format_instructions=parser.get_format_instructions(),
).to_messages()

In [None]:
# Send the prepared messages to the model. `invoke` is the same entry point the
# chains above use; calling the model object directly is the deprecated form.
results = llm_model.invoke(request, temperature=0)
# Parse the reply text into a `Menu` instance.
result_values = parser.parse(results.content)
print(result_values)

In [None]:
import pandas as pd

# Read the CSV and collect the values of its "Website" column as a plain list.
df = pd.read_csv("Google_all_details.csv")
website_column = df["Website"]
res_url = website_column.tolist()
print(res_url)

In [None]:
# Print the DataFrame loaded from the CSV as plain text.
print(df)