### Quick Description

This notebook implements sentence segmentation in two different ways. The first uses a machine learning model trained to predict punctuation, and the second is based solely on a static slicing procedure that cuts the text into fixed-size chunks of words.

In [None]:
import pandas as pd
import sys
import os
import glob
import re
from tqdm import tqdm

sys.path.append("../utils")
from sentence_segmenter import *

### Sentence Segmentation (Model-based Approach)

In [None]:
%%script False

segmenter = GptSegmenter()

for filename in tqdm(filenames, total=len(filenames)):
    with open(os.path.join(input_path, filename)) as file:
        text = file.read()

    # Get the segmented sentences. Each list is a phrase
    sentences = segmenter.segment(text)

    df = pd.DataFrame(map(lambda words: ' '.join(words), sentences), columns=["text"])
    df.to_csv(os.path.join(output_path, f"segmented-{re.sub(r'.txt', '.csv', filename)}"), index=False)

---

In [None]:
def segment_sentences(input_path, output_path, context_size=15):
    """Split every file in ``input_path`` into fixed-size word chunks saved as CSV.

    Each regular file directly inside ``input_path`` is read, split on
    whitespace, and cut into consecutive chunks of ``context_size`` words.
    The chunks are written, one per row, to a single-column (``text``) CSV in
    ``output_path``. The output file keeps the input file's name unchanged
    (including its extension).

    The last chunk of a file may hold fewer than ``context_size`` words; it is
    kept so that no input word is lost.

    Args:
        input_path: Directory containing the text files to segment.
        output_path: Directory where the CSV files are written. It is created
            if it does not exist yet.
        context_size: Number of words per chunk. Must be at least 1.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """
    if context_size < 1:
        raise ValueError(f"context_size must be >= 1, got {context_size}")

    os.makedirs(output_path, exist_ok=True)

    # Keep regular files only: sub-directories matched by the glob cannot be opened.
    filenames = sorted(
        os.path.basename(path)
        for path in glob.glob(os.path.join(input_path, "*"))
        if os.path.isfile(path)
    )

    for filename in tqdm(filenames, total=len(filenames)):
        with open(os.path.join(input_path, filename)) as file:
            words = file.read().split()

        # Start at 0 and step by context_size so the final (possibly shorter)
        # chunk is included instead of being silently dropped.
        sentences = [
            ' '.join(words[start:start + context_size])
            for start in range(0, len(words), context_size)
        ]

        df = pd.DataFrame(sentences, columns=["text"])
        df.to_csv(os.path.join(output_path, filename), index=False)

---

### Sentence Segmentation (Static Approach)

According to [this reference](https://medium.com/@theacropolitan/sentence-length-has-declined-75-in-the-past-500-years-2e40f80f589f#:~:text=On%20average%2C%20sentences%20today%20range,per%20sentence%20in%20some%20years.), the average sentence length is somewhere between 15 and 20 words. So, let's segment the text into fixed-size chunks of that length (the default `context_size` is 15 words).

In [None]:
segment_sentences(
    "../data/01_preprocessed/without_curse_words",
    "../data/02_segmented/fixed_size_context/without_curse_words"
)