In [2]:
from langchain_community.document_loaders import UnstructuredURLLoader

loader = UnstructuredURLLoader(
    urls=["https://www.cnbc.com/2024/12/21/how-the-federal-reserves-rate-policy-affects-mortgages.html"]
)
docs = loader.load()
len(docs)

1

In [3]:
docs[0].metadata

{'source': 'https://www.cnbc.com/2024/12/21/how-the-federal-reserves-rate-policy-affects-mortgages.html'}

In [4]:
docs[0].page_content

'Access Denied\n\nYou don\'t have permission to access "http://www.cnbc.com/2024/12/21/how-the-federal-reserves-rate-policy-affects-mortgages.html" on this server.\n\nReference #18.a624c317.1755447240.1c3be6b\n\nhttps://errors.edgesuite.net/18.a624c317.1755447240.1c3be6b'

### CSV Loader

In [5]:
from langchain_community.document_loaders import CSVLoader

loader = CSVLoader(file_path="patient_records.csv")
docs = loader.load()
docs[0].metadata

{'source': 'patient_records.csv', 'row': 0}

In [6]:
print(docs[0].page_content)

patient_id: PT-1000
symptoms: Dizziness, irregular heartbeat, fatigue
diagnosis: Atrial Fibrillation
treatment: Blood thinners, beta-blockers
doctor_notes: Irregular heartbeat detected; cardiology referral made.


### Text Splitter

In [7]:
text = '''Hometown Cha-Cha-Cha is a 2021 South Korean romantic comedy drama television series starring Shin Min-a, Kim Seon-ho and Lee Sang-yi. It is a remake of 2004 South Korean film Mr. Handy, Mr. Hong.[6] It aired on tvN from August 28 to October 17, 2021, every Saturday and Sunday at 21:00 (KST).[7][8] It is also available for streaming on Netflix.[9]

The series was a commercial hit and became one of the highest-rated dramas in Korean cable television history.[10][11] It ranked first place during its entire run for eight weeks, and the last episode achieved 12.665% nationwide rating, with over 3.2 million views.[12] It also became one of Netflix's most-watched non-English television shows, and one of its longest-running hits as it spent 16 weeks in global top ten rankings.
'''

In [8]:
from langchain_text_splitters import CharacterTextSplitter

In [9]:
splitter = CharacterTextSplitter(
     separator = "\n",
    chunk_size=200,
    chunk_overlap=0
)

In [10]:
chunks = splitter.split_text(text)

Created a chunk of size 348, which is longer than the specified 200


In [11]:
len(chunks)

2

In [12]:
len(chunks[0]),len(chunks[1])

(348, 429)

In [13]:
chunks[0]

'Hometown Cha-Cha-Cha is a 2021 South Korean romantic comedy drama television series starring Shin Min-a, Kim Seon-ho and Lee Sang-yi. It is a remake of 2004 South Korean film Mr. Handy, Mr. Hong.[6] It aired on tvN from August 28 to October 17, 2021, every Saturday and Sunday at 21:00 (KST).[7][8] It is also available for streaming on Netflix.[9]'

### RecursiveCharacterTextSplitter

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

r_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " "],  # List of separators based on requirement (defaults to ["\n\n", "\n", " "])
    chunk_size = 200,  # size of each chunk created
    chunk_overlap  = 30,  # size of  overlap between chunks in order to maintain the context
    length_function = len  # Function to calculate size, currently we are using "len" which denotes length of string however you can pass any token counter)
)

In [16]:
chunks = r_splitter.split_text(text)

for chunk in chunks :
    print(len(chunk))

198
178
192
196
91


In [17]:
chunks[0]

'Hometown Cha-Cha-Cha is a 2021 South Korean romantic comedy drama television series starring Shin Min-a, Kim Seon-ho and Lee Sang-yi. It is a remake of 2004 South Korean film Mr. Handy, Mr. Hong.[6]'

In [18]:
chunks[1]

'film Mr. Handy, Mr. Hong.[6] It aired on tvN from August 28 to October 17, 2021, every Saturday and Sunday at 21:00 (KST).[7][8] It is also available for streaming on Netflix.[9]'

In [19]:
chunks[2]

'The series was a commercial hit and became one of the highest-rated dramas in Korean cable television history.[10][11] It ranked first place during its entire run for eight weeks, and the last'