In [1]:
#############################################################
# # Document Loading
#############################################################

In [2]:
import os
import sys

import openai

# Extend the module search path with the directory two levels up
# (the relative entry is resolved against the current working directory).
sys.path.append('../..')

from dotenv import find_dotenv, load_dotenv

# Read the local .env file located by find_dotenv(); the return value
# is intentionally discarded.
_ = load_dotenv(find_dotenv())

# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ['OPENAI_API_KEY']


In [3]:

#############################################################
# 1. PDFs
# 
# Let's load a PDF [transcript](https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf)
# from Andrew Ng's famous CS229 course!
# - These documents are the result of automated transcription,
#   so words and sentences are sometimes split unexpectedly.
# - NOTE: the cell below loads the SFBU 2024-2025 university catalog PDF
#   rather than the CS229 transcript.
#############################################################



In [4]:
from langchain.document_loaders import PyPDFLoader

# Load the PDF; the loader's result is kept in `pages` and indexed below.
loader = PyPDFLoader(
    "docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf")
pages = loader.load()

# Print the count explicitly: a bare `len(pages)` is only echoed by the
# notebook when it is the last expression of a cell, and the output
# recorded for this cell contains no page count.
print(len(pages))

# In[ ]:

page = pages[0]

# In[ ]:

# Preview the first 500 characters of the first page's text.
# NOTE(review): the recorded output for this preview looks whitespace-only;
# a later page may give a more useful sample — confirm.
print(page.page_content[:500])

# In[ ]:

# Metadata attached to the page (recorded output shows 'source' and 'page').
page.metadata


 
 
 
 
 
 
  



{'source': 'docs/cs229_lectures/sfbu-2024-2025-university-catalog.pdf',
 'page': 0}

In [5]:
#############################################################
# 2. YouTube
#############################################################


In [6]:
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser

# **Note**: This can take several minutes to complete.

# In[ ]:

# Video to fetch and the directory handed to the audio loader.
url = "https://www.youtube.com/watch?v=kuZNIvdwnMc"
save_dir = "docs/youtube/"

# Build the two stages separately, in the same order as before:
# the audio blob loader first, then the Whisper parser.
audio_blob_loader = YoutubeAudioLoader([url], save_dir)
whisper_parser = OpenAIWhisperParser()
loader = GenericLoader(audio_blob_loader, whisper_parser)

docs = loader.load()

# In[ ]:

# Show the first 500 characters of the first resulting document.
docs[0].page_content[:500]

[youtube] Extracting URL: https://www.youtube.com/watch?v=kuZNIvdwnMc
[youtube] kuZNIvdwnMc: Downloading webpage
[youtube] kuZNIvdwnMc: Downloading ios player API JSON
[youtube] kuZNIvdwnMc: Downloading mweb player API JSON
[youtube] kuZNIvdwnMc: Downloading m3u8 information
[info] kuZNIvdwnMc: Downloading 1 format(s): 140
[download] Destination: docs/youtube//San Francisco Bay University MBA Student Spotlight： John Odebode.m4a
[download] 100% of   10.20MiB in 00:00:00 at 19.89MiB/s    
[FixupM4a] Correcting container of "docs/youtube//San Francisco Bay University MBA Student Spotlight： John Odebode.m4a"
[ExtractAudio] Not converting audio docs/youtube//San Francisco Bay University MBA Student Spotlight： John Odebode.m4a; file is already in target format m4a
Transcribing part 1!


"My name is John, John Odebode. I am studying for an MBA program here at SFBU. It's my final trimester at SFBU and I will be graduating in two weeks. I am from Nigeria. I studied at the University of Lagos for my first degree in philosophy. I also studied for my first master's degree in philosophy as well at the same university. I have been practicing within the supply chain industry for the past six years. I have spent the most part of my career at ExxonMobil and I recently completed a six-month"

In [7]:
#############################################################
# 3. URLs
#############################################################


In [8]:
from langchain.document_loaders import WebBaseLoader

# NOTE(review): the output recorded for this cell begins with
# "Page not found | San Francisco Bay University", so this URL appears
# to be stale — confirm it or switch to a live page.
# Alternative page previously tried here: https://www.sfbu.edu/contact-us
page_url = "https://www.sfbu.edu/admissions/student-health-insurance"
loader = WebBaseLoader(page_url)

# In[ ]:

docs = loader.load()

# In[ ]:

# Preview the first 500 characters of the fetched page's text.
print(docs[0].page_content[:500])

USER_AGENT environment variable not set, consider setting it to identify your requests.





















Page not found | San Francisco Bay University









        Skip to main content
      





















          San Francisco Bay University
        




Header Action Navigation





Visit


Apply 


Online store



























Search










Header Action Navigation





Visit


Apply 


Online store




Mega Menu


Why We're Here








Our CampusStrategic Plan






Our Leadership






Our Glossary of Terms











Learning & Teaching








U
