In [1]:
from dotenv import load_dotenv

load_dotenv('../../.env')

True

# Character Text Splitters

In [2]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("Linux.pdf")
pages = loader.load_and_split()

In [3]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(pages)

print(texts[0])

print (f"You have {len(texts)} documents")
print ("Preview:")
print (texts[0].page_content)

page_content='THE ONE     PAGE LINUX MANUALA summary of useful Linux commands\nVersion 3.0 May 1999 squadron@powerup.com.au\nStarting & Stopping\nshutdown -h now Shutdown the system now and do not\nreboot\nhalt Stop all processes - same as above\nshutdown -r 5 Shutdown the system in 5 minutes and\nreboot\nshutdown -r now Shutdown the system now and reboot\nreboot Stop all processes and then reboot - same\nas above\nstartx Start the X system\nAccessing & mounting file systems\nmount -t iso9660 /dev/cdrom\n/mnt/cdromMount the device cdrom\nand call it cdrom under the\n/mnt directory\nmount -t msdos /dev/hdd\n/mnt/ddriveMount hard disk “d” as a\nmsdos file system and call\nit ddrive under the /mnt\ndirectory\nmount -t vfat /dev/hda1\n/mnt/cdriveMount hard disk “a” as a\nVFAT file system and call it\ncdrive under the /mnt\ndirectory\numount /mnt/cdrom Unmount the cdrom\nFinding files and text within files\nfind / -name  fname Starting with the root directory, look\nfor the file called fnam

# Recursive Character Text Splitter

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("Linux.pdf")
pages = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)

docs = text_splitter.split_documents(pages)
for doc in docs:
    print(doc)

page_content='THE ONE     PAGE LINUX MANUALA summary of useful' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='of useful Linux commands' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='Version 3.0 May 1999 squadron@powerup.com.au' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='Starting & Stopping' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='shutdown -h now Shutdown the system now and do' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='and do not' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='reboot\nhalt Stop all processes - same as above' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='shutdown -r 5 Shutdown the system in 5 minutes' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='5 minutes and' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='reboot' metadata={'source': 'Linux.pdf', 'page': 0}
page_content='shutdown -r now Shutdown the system now and' metadata={'source': 'Linux.pdf', 'page

# NLTK text Splitter

In [11]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ghost\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
from langchain.text_splitter import NLTKTextSplitter

# Load a long document
with open('llm.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

text_splitter = NLTKTextSplitter(chunk_size=500)
texts = text_splitter.split_text(sample_text)
print(texts)

["AI-generated content is becoming increasingly sophisticated, making it challenging to distinguish between genuine and computer-generated text.Fraud emails and \nFake news are becoming everyday's story.My project aims to tackle this issue by leveraging the power of BERT (Bidirectional Encoder Representations from Transformers) to identify and \nflag AI-generated text segments.\n\nThis would allow the AI to advance at much faster pace but not at the expense of human security and fundamental integrity.", 'This would allow the AI to advance at much faster pace but not at the expense of human security and fundamental integrity.\n\nThe solution follows a comprehensive approach to detect AI-generated texts ð\x9f\x92¡\n\nData Preprocessing: We clean and preprocess the textual data, removing the noise, stop words, puntuations and non-alphabatical words.', 'This was done using Bert-Preprocess\n\nAdditional Datasets I collected various datasets which were hosted by various competitions like the

# SpacyTextSplitter

In [18]:
from langchain.text_splitter import SpacyTextSplitter

# Load a long document
with open('llm.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)

# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts[0])

# MarkdownTextSplitter

In [19]:
from langchain.text_splitter import MarkdownTextSplitter

markdown_text = """
# 

# Welcome to My Blog!

## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.

Here's a list of my favorite programming languages:

1. Python
2. JavaScript
3. Java

You can check out some of my projects on [GitHub](https://github.com).

## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.

Here's a small piece of Python code to say hello:

\``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
\```

Stay tuned for more updates!

## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.

"""

markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])

print(docs)

[Document(page_content='# \n\n# Welcome to My Blog!', metadata={}), Document(page_content='## Introduction', metadata={}), Document(page_content='Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python,', metadata={}), Document(page_content='Java, and JavaScript.', metadata={}), Document(page_content="Here's a list of my favorite programming languages:\n\n1. Python\n2. JavaScript\n3. Java", metadata={}), Document(page_content='You can check out some of my projects on [GitHub](https://github.com).', metadata={}), Document(page_content='## About this Blog', metadata={}), Document(page_content="In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on", metadata={}), Document(page_content='the latest technology trends, and occasional book reviews.', metadata={}), Document(page_content="Here's a small piece of Python code to say hello:", metadata={}), Document(page_content='\\``` python\ndef say_hello(name):\n

# TokenTextSplitter

In [20]:
from langchain.text_splitter import TokenTextSplitter

# Load a long document
with open('llm.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])

AI-generated content is becoming increasingly sophisticated, making it challenging to distinguish between genuine and computer-generated text.Fraud emails and 
Fake news are becoming everyday's story.My project aims to tackle this issue by leveraging the power of BERT (Bidirectional Encoder Representations from Transformers) to identify and 
flag AI-generated text segments. This would allow the AI to advance at much faster pace but not at the expense of human security and fundamental integrity.

The
