# Website Information Extraction
* [Web Scraping](https://python.langchain.com/docs/use_cases/web_scraping/#scraping-with-extraction)
* [Extraction](https://python.langchain.com/docs/use_cases/extraction)
* [KOR](https://www.youtube.com/watch?v=SW1ZdqH0rRQ&list=PL8motc6AQftk1Bs42EW45kwYbyJ4jOdiZ&index=25)

from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain


In [2]:
import sys
sys.path.append("..")  # Add the parent directory to Python's module search path
from dotenv import find_dotenv, load_dotenv

# Load variables from the .env file located by find_dotenv().
# Presumably this provides the OpenAI API key used by the models below — confirm.
load_dotenv(find_dotenv())

In [3]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain



In [34]:
# Schema: every extracted field is a string and every field is required,
# so both sections are derived from a single list of field names.
_SCHEMA_FIELDS = ("university name", "program name", "application deadline")
simple_schema = {
    "properties": {field: {"type": "string"} for field in _SCHEMA_FIELDS},
    "required": list(_SCHEMA_FIELDS),
}


In [4]:
# Pages to extract information from (currently a single TUM degree-program page).
urls_list = ["https://www.cit.tum.de/en/cit/studies/degree-programs/master-informatics/"]

from langchain.document_loaders import UnstructuredURLLoader
# Build a loader over the URL list and load the pages into `data`,
# which is split into chunks in the next cell.
loaders = UnstructuredURLLoader(urls=urls_list)
data = loaders.load()

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded documents into overlapping chunks so each piece can be sent
# to the model separately. Sizes are presumably counted in tiktoken tokens
# (the splitter is built via from_tiktoken_encoder) — confirm.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, 
                                                                chunk_overlap=200)
splits = splitter.split_documents(data)

In [6]:
from langchain.chat_models import ChatOpenAI
# Chat model passed to the pydantic extraction chains below; temperature=0
# requests the least random sampling.
chat = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

In [38]:
def extract(content: str, schema: dict):
    """Run a schema-driven extraction chain over a piece of text.

    Args:
        content: The text to extract information from.
        schema: Extraction schema with "properties" and "required" entries,
            e.g. ``simple_schema``.

    Returns:
        Whatever the chain's ``run`` returns; in the run recorded in this
        notebook that is a list with one dict per extracted record.
    """
    # Use the chat model (`chat`), like the other extraction chains in this
    # notebook. The previous global `llm` is only defined much later, and as a
    # completion-style model, so a top-to-bottom run failed with NameError.
    chain = create_extraction_chain(schema=schema, llm=chat)
    return chain.run(content)

In [39]:
import pprint
# Process only the first split and pretty-print what was extracted from it.
extracted_content = extract(schema=simple_schema, content=splits[0].page_content)
pprint.pprint(extracted_content)


[{'application deadline': '01 February - 31 May (Winter semester), 01 '
                          'September - 30 November (Summer semester)',
  'program name': 'Master Informatics',
  'university name': 'Technical University of Munich'}]


## Pydantic data

In [7]:
from typing import Optional, List
from pydantic import BaseModel, Field
from langchain.chains import create_extraction_chain_pydantic
from langchain.output_parsers import PydanticOutputParser


In [32]:
# Minimal pydantic target for extraction: four required string fields and no
# per-field descriptions. It is also the model handed to PydanticOutputParser
# in the "LLM Chain" section.
# NOTE(review): documented with comments rather than a class docstring because
# a docstring may end up in the schema sent to the model — confirm.
class simpleProgramInfo(BaseModel):
    university_name: str       # name of the university
    program_name: str          # name of the study program
    application_deadline: str  # application deadline, kept as free text
    degree: str                # degree awarded by the program

    



In [12]:
# Build an extraction chain that targets simpleProgramInfo and run it on the
# text of the first split.
chain = create_extraction_chain_pydantic(pydantic_schema=simpleProgramInfo, llm=chat)
output = chain.run(splits[0].page_content)

In [13]:
# Display the chain result as the cell output.
output

[simpleProgramInfo(university_name='Technical University of Munich', program_name='Master Informatics', application_deadline='01 February - 31 May (Winter semester), 01 September - 30 November (Summer semester)', degree='Master of Science (M.Sc.)', country='Germany', semester='Summer or Winter semester', language='English', uni_assist="Preliminary documentation required for applicants with a Bachelor's degree from outside the EU/EEA", toefl='Proof of English language proficiency', ielts='Proof of English language proficiency')]

In [18]:
# Detailed pydantic target for extraction. Each field's description is passed
# along with the schema and acts as an instruction to the model, so the
# wording matters. All fields are required except `country`.
# Documented with comments rather than a class docstring so that no extra
# text is added to the schema derived from this model.
class ProgramInfo(BaseModel):
    university_name: str = Field(description='This is the name of the university')
    program_name: str = Field(description='This is the name of the program')
    degree: str = Field(description='This describes the type of the degree, whether this is a B.A, B.S, M.A, M.Sc, or others')
    # Explicit default so the field is optional regardless of pydantic version.
    country: Optional[str] = Field(default=None, description='This is the country of this university')
    semester: str = Field(description='This describes whether this program has a winter or summer semester, or both. Please output WS/SS/Both')
    language: str = Field(description='This describes the official teaching language for this program')
    application_deadline: str = Field(description="This is the application deadline for applying to this program")
    uni_assist: str = Field(description="This describes whether applying to this program requires the uni-assist process, e.g. VPD")
    toefl: str = Field(description="This is the required TOEFL score for applying to this program")
    ielts: str = Field(description="This is the required IELTS score for applying to this program")

In [19]:
# Extraction: rebuild the chain, this time targeting the detailed ProgramInfo model.
chain = create_extraction_chain_pydantic(pydantic_schema=ProgramInfo, llm=chat)

In [20]:
# Run the ProgramInfo chain on the text of the first split.
output = chain.run(splits[0].page_content)

In [21]:
# Print the extracted ProgramInfo records.
print(output)

[ProgramInfo(university_name='Technical University of Munich', program_name='Master Informatics', degree='Master of Science (M.Sc.)', country='Germany', semester='Summer or Winter semester', language='English', application_deadline='01 February - 31 May (Winter semester), 01 September - 30 November (Summer semester)', uni_assist="Required for applicants with a Bachelor's degree from outside the EU/EEA", toefl='Proof of English language proficiency', ielts='Proof of English language proficiency')]


# LLM Chain

In [22]:
from langchain.prompts import PromptTemplate

In [23]:
from langchain.llms import OpenAI
# LLM from langchain.llms (distinct from the ChatOpenAI model `chat`);
# it is called with a plain prompt string further down.
llm = OpenAI(temperature=0)

In [44]:
# Parser that supplies the format instructions for the prompt and later turns
# the raw model output into a simpleProgramInfo instance.
parser = PydanticOutputParser(pydantic_object=simpleProgramInfo)


In [45]:
# Prompt: the parser's format instructions are pre-filled as a partial
# variable, so only `query` has to be supplied when formatting.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)


In [46]:
# Fill the prompt with the text of the first split as the query.
_input = prompt.format_prompt(query=splits[0].page_content)

In [50]:
# The chat model (`chat`) cannot be used here; the prompt is rendered to a
# plain string and sent to the `llm` object instead.
# NOTE(review): presumably because a chat model expects messages rather than a
# raw string in this LangChain version — confirm.
output = llm(_input.to_string())

In [48]:
# Parse the raw model text into a simpleProgramInfo instance (shown as the cell output).
parser.parse(output)

simpleProgramInfo(university_name='Technical University of Munich', program_name='Master Informatics', application_deadline='Winter semester: 01 February - 31 May; Summer semester: 01 September - 30 November', degree='Master of Science (M.Sc.)')

# KOR
https://www.youtube.com/watch?v=SW1ZdqH0rRQ&list=PL8motc6AQftk1Bs42EW45kwYbyJ4jOdiZ&index=25