In [None]:
import string
import pandas as pd
import os

In [None]:
# path of the plain-text file to be chunked
input_file_path = "hobbit.txt"

# directory that receives one text file per chunk
output_dir_path = "Hobbit_chunked"

# create the output directory; exist_ok=True makes the cell safe to re-run,
# removes the gap between a separate existence check and the mkdir call,
# and also creates missing parent directories if the path is ever nested
os.makedirs(output_dir_path, exist_ok=True)

In [None]:
# text normalisation helper used before tokenising
def clean_text(text):
    """Return ``text`` without ASCII punctuation, lower-cased and on a single line.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. convert to lower case;
      3. turn each ``"\\n"`` into a space and delete each ``"\\r"``.
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    lowered = text.translate(punctuation_stripper).lower()
    single_line = lowered.replace("\n", " ")
    return single_line.replace("\r", "")

In [None]:
# chunking defaults, both counted in tokens:
#   default_chunk_size   - number of tokens per chunk
#   default_overlap_size - number of tokens shared by consecutive chunks
default_chunk_size, default_overlap_size = 50, 0

In [None]:
# define a function to divide the text into chunks and write them to files
def divide_text_into_chunks(
    tokens,
    chunk_size=default_chunk_size,
    overlap_size=default_overlap_size,
    output_dir=None,
):
    """Split ``tokens`` into chunks, write one file per chunk, and return the chunks.

    Chunk ``i`` starts at token ``i * (chunk_size - overlap_size)`` and holds up
    to ``chunk_size`` tokens. The final chunk may be shorter than ``chunk_size``
    so that trailing tokens are never dropped; a token list shorter than
    ``chunk_size`` yields a single short chunk, and an empty list yields none.

    Parameters
    ----------
    tokens : sequence of str
        Tokens to chunk; each chunk is stored as the tokens joined by spaces.
    chunk_size : int
        Maximum number of tokens per chunk. Must be positive.
    overlap_size : int
        Number of tokens shared by consecutive chunks. Must be smaller than
        ``chunk_size`` so that the window always moves forward.
    output_dir : str, optional
        Directory for the chunk files. Defaults to the notebook-level
        ``output_dir_path``. It is created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per chunk, indexed by file name (``Hobbit_00001.txt``, ...),
        with the chunk text in the ``"text"`` column.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive or ``overlap_size >= chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_size >= chunk_size:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # fall back to the notebook-level directory when none is given
    if output_dir is None:
        output_dir = output_dir_path
    # do not rely on another cell having created the directory
    os.makedirs(output_dir, exist_ok=True)

    step = chunk_size - overlap_size
    total = len(tokens)
    file_names = []
    chunk_texts = []
    for i, start in enumerate(range(0, total, step)):
        end = start + chunk_size
        chunk_text = " ".join(tokens[start:end])
        file_name = "Hobbit_{:05d}.txt".format(i + 1)
        with open(os.path.join(output_dir, file_name), "w", encoding="utf-8") as f:
            f.write(chunk_text)
        file_names.append(file_name)
        chunk_texts.append(chunk_text)
        # this window already reached the last token; any further window
        # would only repeat tokens that are covered by the overlap
        if end >= total:
            break

    # build the frame once instead of growing it row by row inside the loop
    return pd.DataFrame({"text": chunk_texts}, index=file_names)

In [None]:
# load the input file and normalise its contents in one step;
# `text` ends up holding the cleaned text
with open(input_file_path, encoding="utf-8") as f:
    text = clean_text(f.read())

# split the cleaned text on whitespace to obtain the token list
tokens = text.split()

In [None]:
# divide the text into chunks and write them to files
data = divide_text_into_chunks(tokens)

# preview the first rows; ending the cell on the expression lets the notebook
# render the frame as a table instead of dumping it as plain text via print()
data.head()

In [None]:
# write the chunk data frame to disk as CSV
csv_file_path = "hobbit_chunked.csv"
data.to_csv(path_or_buf=csv_file_path)