In [1]:
import pandas as pd

In [2]:
def segment_text(text: str, label: str, segment_length: int = 20) -> list:
    """Splits a text into word segments and pairs each segment with a label.

    The text is split on whitespace, and consecutive groups of up to
    ``segment_length`` words are re-joined with single spaces. The final
    segment may contain fewer words than ``segment_length``.

    Args:
      text: text to be segmented and labeled.
      label: label to be used.
      segment_length: maximum number of words per segment; must be >= 1.

    Returns:
      List of (segment, label) tuples; empty when the text contains no words.

    Raises:
      ValueError: if segment_length is less than 1.
    """
    if segment_length < 1:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step yields an empty range, silently dropping the text.
        raise ValueError(
            f"segment_length must be a positive integer, got {segment_length!r}"
        )
    words = text.split()
    return [
        (" ".join(words[start:start + segment_length]), label)
        for start in range(0, len(words), segment_length)
    ]

In [3]:
def process_dataframe(df: pd.DataFrame, text_col: str = 'text', label_col: str = 'label',
                      segment_length: int = 20) -> pd.DataFrame:
    """Creates a new dataframe segmenting a text column and labeling each segment.

    Each row's text is passed to ``segment_text`` together with that row's
    label, and all resulting (segment, label) pairs are collected into a new
    dataframe. Rows whose text value is missing (e.g. NaN or None) are
    skipped, since they have no text to segment.

    Args:
      df: dataframe to be processed
      text_col: name of text column
      label_col: name of label column
      segment_length: maximum number of words per segment, forwarded to
        ``segment_text``

    Returns:
      New dataframe with one row per segment, using ``text_col`` and
      ``label_col`` as its column names
    """
    segments = []
    # Walk the two columns in lockstep instead of building a Series per row
    # with iterrows(); only these two values are needed.
    for text, label in zip(df[text_col], df[label_col]):
        if pd.isna(text):
            # A missing cell is not a string and cannot be split into words.
            continue
        segments.extend(segment_text(text, label, segment_length))
    return pd.DataFrame(segments, columns=[text_col, label_col])

In [4]:
# Locations of the source spreadsheet and of the segmented result
input_file = './data/writings.ods'
output_file = './data/processed.ods'

# Read the ODS spreadsheet into a dataframe
df = pd.read_excel(input_file, engine="odf")

# Split every text into labeled segments
processed_df = process_dataframe(df)

# Write the segments to a new ODS file, leaving out the index column
processed_df.to_excel(output_file, index=False, engine='odf')