In [1]:
# You may not need to run this cell, start by skipping it 
!pip install -U pip
!pip install -U --no-warn-script-location langchain langchain-community langchain-openai
# run once and then restart your kernel



In [2]:
import csv
from typing import List, Optional

import pandas as pd
import requests
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

### This cell may generate a deprecation warning; that is OK.


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Download the Bee Movie script and keep a local copy for the HTML loader.
data = requests.get(
    'https://courses.cs.washington.edu/courses/cse163/20wi/files/lectures/L04/bee-movie.txt',
    timeout=30,  # do not hang the kernel forever if the server is unreachable
)
# Fail here on an HTTP error instead of saving an error page as the script.
data.raise_for_status()

# The context manager closes the file even if the write fails; the explicit
# encoding keeps the write and the loader's read consistent across platforms.
with open('source.html', 'w', encoding='utf-8') as f:
    f.write(data.text)

# `soup` is the loader's result; the extraction cell reads soup[0].page_content.
soup = BSHTMLLoader('source.html', open_encoding='utf-8').load()


In [4]:

class CharacterTraits(BaseModel):
    """Information about the traits of various characters in the Bee Movie"""
    # One extracted (character, trait) observation.
    # NOTE(review): the docstring above and the Field descriptions below are
    # presumably forwarded to the model as part of the structured-output
    # schema, so treat them as prompt text rather than plain comments.

    character: str = Field(description="The name of the character")
    # Explicit None default: the field stays optional regardless of how the
    # installed pydantic version treats an Optional annotation without a default.
    bee: Optional[bool] = Field(default=None, description="Whether the character is a bee")
    trait: str = Field(description="The trait of the character")
    proof: str = Field(description="A verbatim quote from the movie that demonstrates the trait")
    # Spell out which boolean value means "good" so the flag is unambiguous.
    good: bool = Field(description="Whether the trait is good (True) or bad (False)")
    confidence: float = Field(description="How confident you are that the character has this trait, from 0 to 1")


class Data(BaseModel):
    # Top-level extraction result: every CharacterTraits entry found in one
    # piece of text. Documented with comments rather than a docstring because
    # a docstring would presumably alter the schema description sent to the model.
    traits: List[CharacterTraits] = Field(...)  # required; `...` marks no default



In [5]:

# Chat prompt for the extraction chain. The system message asks for what the
# CharacterTraits schema describes (characters and their traits); each string
# fragment ends with spaces so the implicit concatenation does not glue
# sentences together.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert at identifying characters and their traits in text.  "
            "I have a text that I would like you to analyze.  "
            "The text is about the Bee Movie.  "
            "Can you extract the characters and their traits from the text?  "
            "Only extract important characters and their traits."
        ),
        # {text} is the template variable supplied per invocation.
        ("human", "{text}"),
    ]
)

# GPT-4o chat model; temperature=0 to keep the output as repeatable as possible.
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
# Bind the Data schema through function calling, then pipe the prompt into it.
structured_llm = llm.with_structured_output(schema=Data, method="function_calling")
extract = prompt | structured_llm

## Setup complete; let's do the work

from langchain_text_splitters import TokenTextSplitter

# Break the loaded script into overlapping chunks (size 2000, overlap 30).
textsplit = TokenTextSplitter(chunk_overlap=30, chunk_size=2000)
content = textsplit.split_text(soup[0].page_content)

# One chain input per chunk; the config caps concurrency at 2.
batch_inputs = [{"text": chunk} for chunk in content]
events = extract.batch(batch_inputs, config={"max_concurrency": 2})




# Flatten the per-chunk results into a single list of CharacterTraits.
all_events = []
for chunk_result in events:
    # NOTE(review): the structured-output parser may yield None when the model
    # returns no function call -- confirm; such chunks are skipped rather than
    # crashing on `.traits`.
    if chunk_result is None:
        continue
    all_events.extend(chunk_result.traits)

# Print the results to out.csv
# csv.writer quotes fields containing commas, quotes or newlines (the verbatim
# `proof` quotes regularly do), so every row keeps exactly six columns. The
# `with` block guarantees the file is flushed and closed.
with open("out.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    for event in all_events:
        print(event)
        # str() keeps the same textual form the f-string produced before
        # (e.g. "True", "None", "0.9"); only the quoting changes.
        writer.writerow(
            [
                str(value)
                for value in (
                    event.character,
                    event.bee,
                    event.trait,
                    event.proof,
                    event.good,
                    event.confidence,
                )
            ]
        )


character='Barry Benson' bee=True trait='Nonconformist' proof="The bee, of course, flies anyway because bees don't care what humans think is impossible." good=True confidence=0.9
character='Barry Benson' bee=True trait='Adventurous' proof="I'm glad I took a day and hitchhiked around the hive." good=True confidence=0.8
character='Barry Benson' bee=True trait='Curious' proof='You ever think maybe things work a little too well here?' good=True confidence=0.85
character='Adam Flayman' bee=True trait='Conventional' proof="Why would you question anything? We're bees. We're the most perfectly functioning society on Earth." good=False confidence=0.8
character='Janet Benson' bee=True trait='Caring' proof="Very proud. (Rubs Barry's hair)" good=True confidence=0.9
character='Martin Benson' bee=True trait='Proud' proof="Here's the graduate. We're very proud of you, son." good=True confidence=0.85
character='Dean Buzzwell' bee=True trait='Authoritative' proof='Welcome, New Hive City graduating clas