In [None]:
# Setup cell: imports, module search path, environment variables, and the OpenAI client.
import os
from openai import OpenAI
import sys
# Add the directory two levels up (relative to the working directory) to the module search path.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# The API key is read from the environment rather than hard-coded in the notebook.
client = OpenAI(api_key  = os.getenv('OPENAI_API_KEY'))

# ## PDFs
#
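##### Let's load a PDF! The original example is a [transcript](https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf) from Andrew Ng's famous CS229 course; those documents are the result of automated transcription, so words and sentences are sometimes split unexpectedly. Note that the cell below loads the SFBU 2023 Catalog PDF instead, whose extracted text shows similar unexpected splits.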

##### The course will show the `pip install` commands you would need to install the packages on your own machine.
##### These packages are already installed on this platform, so the install commands should not be run again.


In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the PDF to load (the SFBU 2023 catalog).
pdf_url = "https://www.sfbu.edu/sites/default/files/2022-12/2023Catalog.pdf"

# Build the loader, then load the PDF into `pages`.
loader = PyPDFLoader(pdf_url)
pages = loader.load()

#### Each page is a `Document`.
#
#### A `Document` contains text (`page_content`) and `metadata`.

In [None]:
# Number of Document objects returned by the loader.
len(pages)

197

In [None]:
# Keep the first Document for a closer look in the following cells.
page = pages[0]

In [None]:
# Preview the first 500 characters of the page text.
print(page.page_content[:500])

Catalog 202 3 i ver. 202 3.09.24 
161 Mission Falls Lane, Fremont, CA 94539  
Tel: (510) 803-SFBU ( 7328); e -mail: admissions@sfbu.edu  
 
 
2023 CATALOG                           JAN 1 - DEC 31, 2023   
  
  


In [None]:
# Show the metadata attached to this Document.
page.metadata

{'source': 'https://www.sfbu.edu/sites/default/files/2022-12/2023Catalog.pdf',
 'page': 0}

# ## YouTube

In [None]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import OpenAIWhisperParser
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

###### **Note**: This can take several minutes to complete.

In [None]:
# Video to transcribe and the directory where its audio file is stored.
url = "https://youtu.be/AuDodQm7nm8?si=QgtvcsNofH8vqbn0"
save_dir = "docs/youtube/"

# Pair the YouTube audio blob loader with the Whisper parser, then run the pipeline.
audio_blobs = YoutubeAudioLoader([url], save_dir)
loader = GenericLoader(audio_blobs, OpenAIWhisperParser())
docs = loader.load()

# Preview the first 500 characters of the first transcript.
docs[0].page_content[:500]

[youtube] Extracting URL: https://youtu.be/AuDodQm7nm8?si=QgtvcsNofH8vqbn0
[youtube] AuDodQm7nm8: Downloading webpage
[youtube] AuDodQm7nm8: Downloading ios player API JSON
[youtube] AuDodQm7nm8: Downloading android player API JSON
[youtube] AuDodQm7nm8: Downloading m3u8 information
[info] AuDodQm7nm8: Downloading 1 format(s): 140
[download] docs/youtube//SFBU DeepPiCar： Voice Control.m4a has already been downloaded
[download] 100% of  228.93KiB
[ExtractAudio] Not converting audio docs/youtube//SFBU DeepPiCar： Voice Control.m4a; file is already in target format m4a
Transcribing part 1!


'Move forward.'

# ## URLs

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Web page to load.
about_url = "https://www.sfbu.edu/about-us"
loader = WebBaseLoader(about_url)

In [None]:
# Run the web loader; the result is indexed as docs[0] in the next cell.
docs = loader.load()

In [None]:
# Preview the first 500 characters of the page text.
# The recorded output consists largely of blank lines between short pieces of text.
print(docs[0].page_content[:500])





















About SFBU | San Francisco Bay University









































      Skip to main content
    


 















Main navigation


About Us


Overview


University Leadership


Strategic Plan


Accreditation


Policies


Careers




Admissions


Requirements for Degree Programs


Requirements for Intensive English Program


Scholarships


Tuition & Costs


Articulation & Transfer Agreements


Contact Admissions Team


Admitted Students




Academics


Sch
