In [25]:
import string
import pandas as pd
import os

In [26]:
# Path of the plain-text file that will be read and chunked.
input_file_path = "hobbit.txt"

# Directory that receives one text file per chunk.
output_dir_path = "Hobbit_chunked"

# Create the output directory if needed. makedirs(exist_ok=True) replaces the
# "check, then mkdir" pattern: it cannot fail when the directory appears
# between the check and the creation, and it also creates missing parents.
os.makedirs(output_dir_path, exist_ok=True)

In [27]:
# Text normalisation helper applied before tokenising.
def clean_text(text):
    """Return ``text`` without ASCII punctuation, lower-cased, on a single line.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. convert the result to lower case;
      3. turn each ``"\\n"`` into a space and drop each ``"\\r"``.

    Characters are deleted, not replaced, so the two sides of a hyphenated
    word are joined together.
    """
    # A mapping whose values are None tells str.translate to delete the keys.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    without_punctuation = text.translate(deletion_table)
    lowered = without_punctuation.lower()
    single_line = lowered.replace("\n", " ")
    return single_line.replace("\r", "")

In [28]:
# Default chunking parameters; divide_text_into_chunks uses them as the
# default values of its chunk_size and overlap_size arguments.
# Number of tokens in each chunk.
default_chunk_size = 50
# Number of tokens shared by two consecutive chunks (0 means no overlap).
default_overlap_size = 0

In [29]:
# define a function to divide the text into chunks and write them to files
def divide_text_into_chunks(tokens, chunk_size=default_chunk_size, overlap_size=default_overlap_size,
                            keep_remainder=False):
    """Cut ``tokens`` into windows, write each window to a file and return them.

    Every chunk is joined with single spaces and written as UTF-8 text to
    ``Hobbit_NNNNN.txt`` (1-based counter, zero-padded to five digits) inside
    the module-level ``output_dir_path`` directory, which must already exist.

    Parameters
    ----------
    tokens : sequence of str
        The tokens to split; it is sliced, never modified.
    chunk_size : int
        Number of tokens per chunk. Must be positive.
    overlap_size : int
        Number of tokens shared by consecutive chunks. Must be smaller than
        ``chunk_size``; a negative value leaves a gap between chunks.
    keep_remainder : bool, default False
        By default only full-size chunks are produced, so trailing tokens that
        do not fill a whole chunk are left out (and an input shorter than
        ``chunk_size`` yields no chunk at all). Pass ``True`` to also emit one
        final, shorter chunk holding those trailing tokens.

    Returns
    -------
    pandas.DataFrame
        A single ``"text"`` column holding the chunk texts, indexed by the
        name of the file each chunk was written to.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive or ``overlap_size`` is not smaller
        than ``chunk_size`` (the window would not advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Distance between the starts of two consecutive chunks.
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Start offsets of every full-size chunk; empty when the input is shorter
    # than one chunk.
    starts = list(range(0, len(tokens) - chunk_size + 1, step))

    if keep_remainder:
        covered_end = starts[-1] + chunk_size if starts else 0
        next_start = starts[-1] + step if starts else 0
        # Add a final partial chunk only if tokens are left uncovered and the
        # next window actually starts inside the input.
        if covered_end < len(tokens) and next_start < len(tokens):
            starts.append(next_start)

    file_names = []
    chunk_texts = []
    for number, start in enumerate(starts, start=1):
        # Join once and reuse the string for both the file and the frame.
        chunk_text = " ".join(tokens[start:start + chunk_size])
        file_name = "Hobbit_{:05d}.txt".format(number)
        file_path = os.path.join(output_dir_path, file_name)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(chunk_text)
        file_names.append(file_name)
        chunk_texts.append(chunk_text)

    # Build the frame in one go instead of appending row by row with .loc,
    # which re-allocates the frame on every insertion.
    return pd.DataFrame({"text": chunk_texts}, index=file_names, dtype=object)

In [30]:
# Load the whole input file into memory as one string.
with open(input_file_path, "r", encoding="utf-8") as source_file:
    raw_contents = source_file.read()

# Normalise the raw text (punctuation removed, lower-cased, line breaks flattened).
text = clean_text(raw_contents)

# Split on runs of whitespace to obtain the list of word tokens.
tokens = text.split()

In [31]:
# Chunk the token list with the default chunk and overlap sizes. One text file
# per chunk is written into output_dir_path, and the chunks come back as a
# data frame with a single "text" column indexed by file name.
data = divide_text_into_chunks(tokens)

# Print a plain-text view of the data frame to check the result.
print(data)

                                                               text
Hobbit_00001.txt  chapter i an unexpected party in a hole in the...
Hobbit_00002.txt  was a hobbithole and that means comfort it had...
Hobbit_00003.txt  tiled and carpeted provided with polished chai...
Hobbit_00004.txt  round called it and many little round doors op...
Hobbit_00005.txt  indeed on the same passage the best rooms were...
...                                                             ...
Hobbit_01901.txt  to him from the lake and from south and west a...
Hobbit_01902.txt  wealth went up and down the running river and ...
Hobbit_01903.txt  such disease he fell under the dragonsickness ...
Hobbit_01904.txt  the present prosperity they are making songs w...
Hobbit_01905.txt  disbelieve the prophecies because you had a ha...

[1905 rows x 1 columns]


In [32]:
# Save the chunk data frame to a CSV file. to_csv writes the index by default,
# so the chunk file names become the first column, followed by "text".
csv_file_path = "hobbit_chunked.csv"
data.to_csv(csv_file_path)