In [1]:
#############################################################
# # Document Loading
#############################################################

In [2]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']


In [3]:

#############################################################
# 1. PDFs
# 
# Let's load a PDF [transcript]
# (https://see.stanford.edu/materials/aimlcs229/
# transcripts/MachineLearning-Lecture01.pdf) 
# from Andrew Ng's famous CS229 course! 
# - These documents are the result of automated transcription 
#   so words and sentences are sometimes split unexpectedly.
#############################################################



In [4]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader(
     "docs/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()

len(pages)

# In[ ]:

page = pages[0]

# In[ ]:

print(page.page_content[0:500])

# In[ ]:

page.metadata


MachineLearning-Lecture01  
Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine 
learning class. So what I wanna do today is just spend a little time going over the logistics 
of the class, and then we'll start to talk a bit about machine learning.  
By way of introduction, my name's Andrew Ng and I'll be instructor for this class. And so 
I personally work in machine learning, and I've worked on it for about 15 years now, and 
I actually think that machine learning is the 


{'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0}

In [5]:
#############################################################
# 2. YouTube
#############################################################


In [6]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# **Note**: This can take several minutes to complete.

# In[ ]:

url="https://www.youtube.com/watch?v=kuZNIvdwnMc"

save_dir="docs/youtube/"

loader = GenericLoader(
    YoutubeAudioLoader([url],save_dir),
    OpenAIWhisperParser()
)

docs = loader.load()

# In[ ]:

docs[0].page_content[0:500]

[youtube] Extracting URL: https://www.youtube.com/watch?v=kuZNIvdwnMc
[youtube] kuZNIvdwnMc: Downloading webpage
[youtube] kuZNIvdwnMc: Downloading ios player API JSON
[youtube] kuZNIvdwnMc: Downloading mweb player API JSON
[youtube] kuZNIvdwnMc: Downloading player dad5a960
[youtube] kuZNIvdwnMc: Downloading m3u8 information
[info] kuZNIvdwnMc: Downloading 1 format(s): 140
[download] docs/youtube//San Francisco Bay University MBA Student Spotlight： John Odebode.m4a has already been downloaded
[download] 100% of   10.20MiB
[ExtractAudio] Not converting audio docs/youtube//San Francisco Bay University MBA Student Spotlight： John Odebode.m4a; file is already in target format m4a
Transcribing part 1!
Transcribing part 2!
Transcribing part 3!
Transcribing part 4!
Transcribing part 1!


"Welcome to CS229 Machine Learning. Uh, some of you know that this is a class that's taught at Stanford for a long time. And this is often the class that, um, I most look forward to teaching each year because this is where we've helped, I think, several generations of Stanford students become experts in machine learning, got- built many of their products and services and startups that I'm sure, many of you or probably all of you are using, uh, uh, today. Um, so what I want to do today was spend s"

In [7]:
#############################################################
# 3. URLs
#############################################################


In [10]:
from langchain.document_loaders import WebBaseLoader

#loader = WebBaseLoader("https://www.sfbu.edu/student-health-insurance")

loader = WebBaseLoader("https://www.sfbu.edu/contact-us")

# In[ ]:


docs = loader.load()


# In[ ]:


print(docs[0].page_content[0:500])























Contact Us | San Francisco Bay University









        Skip to main content
      






















          San Francisco Bay University
        




Header Action Navigation





Visit


Apply 


Online store



























Search










Header Action Navigation





Visit


Apply 


Online store




Mega Menu


Why We're Here








Our CampusStrategic Plan






Our Leadership






Our Glossary of Terms











Learning & Teaching








U
