We will implement a text tiling algorithm to segment the text into coherent sections. The algorithm is based on the following steps:
1. Split the text into sentences
2. Clean the sentences by removing stopwords and punctuation
3. Build a frequency table for the words in the sentences
    - The x-axis is the sentence number
    - The y-axis is each word
    - Each cell is the frequency of the word in the sentence
4. Initialize blocks of text with a fixed size
5. Iterate until no change in the blocks
    - Calculate the intra-group cohesion for each block
    - Find blocks of text where the topic changes (the cohesion drops in these blocks)
    - Move block boundaries to the topic change points
6. Return the blocks of text

In [54]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
from typing import List
from string import punctuation


In [55]:
class Sentence:
    """A sentence's original text together with the list of words kept for it."""

    def __init__(self, text: str, words: List[str]):
        # `text` is stored exactly as given; `words` is the caller-supplied word list.
        self.text, self.words = text, words

    def __repr__(self):
        # Display the original sentence so collections of Sentence objects read naturally.
        return self.text
    
class Block:
    """An ordered group of Sentence objects handled as a single unit."""

    def __init__(self, sentences: List[Sentence]):
        self.sentences = sentences

    def __len__(self):
        # The size of a block is the number of sentences it holds.
        return len(self.sentences)

    def __repr__(self):
        # Original sentence texts, in order, separated by single spaces.
        return " ".join(member.text for member in self.sentences)

    def get_string_sentences(self) -> List[str]:
        """Return, for each sentence in order, its words joined by single spaces."""
        joined_sentences = []
        for member in self.sentences:
            joined_sentences.append(" ".join(member.words))
        return joined_sentences

In [56]:
# Read the full contents of res/mixed_texts.txt into `raw_text`
# (text mode is the default for open(), so no explicit mode is passed).
with open('res/mixed_texts.txt') as file:
    raw_text = file.read()

In [57]:
def preprocess_text(text: str) -> List[Sentence]:
    """
    Split the text into sentences and extract the relevant words of each one.

    A word is relevant when it is neither an English stopword nor a token made up
    solely of punctuation characters. Relevant words are stored lowercased.
    :param text: the text to be preprocessed
    :return: one Sentence per detected sentence, holding the original sentence text
        and the list of its relevant words
    """
    # Sets give constant-time membership tests inside the token loop.
    stopword_set = set(stopwords.words('english'))
    punctuation_chars = set(punctuation)

    sentence_list: List[Sentence] = []

    # split the text into sentences
    for sentence in nltk.sent_tokenize(text):
        relevant_words = []
        for token in nltk.word_tokenize(sentence):
            lowered = token.lower()
            if lowered in stopword_set:
                continue
            # Check every character: `token in punctuation` would be a substring
            # test against the punctuation string, which lets multi-character
            # punctuation tokens such as "..." or "''" slip through.
            if all(char in punctuation_chars for char in token):
                continue
            relevant_words.append(lowered)
        sentence_list.append(Sentence(sentence, relevant_words))

    return sentence_list

# Preview the first five preprocessed sentences; each one is displayed through
# Sentence.__repr__, i.e. as its original text.
preprocess_text(raw_text)[:5]

[In order to get to punch his badge at 08:30, 16 years before Fantozzi began setting alarm clock at 06:15.,
 Today, after continuous experiments and improvements, he manage to set it at 07:51: the limit of humanly possibilities.,
 Everything is calculated on the edge of seconds: 5 seconds to regain consciousness; 4 seconds to overcome impact of seeing his wife, and 6 more seconds to ask himself -as always with any plausible answer- whatever pushed him to marry that kind of curious pet; 3 seconds to drink Mrs Fantozzi's coffee: 3000 Fahreneit Degrees!,
 From 8 to 10 seconds to cool down his burned tongue... 2.5 seconds to kiss his daughter Mariangela; brioche and Latte meanwhile hair brushing, brushing coffee-flavoured teeth with minty toothpaste, resulting in an instantaneous bowel movement... all of this performed in 6 seconds, a European Record!,
 He still has a 3-minute fortune to get dressed and run to the bus stop to catch the 08:01 bus.]

In [58]:
# # create a frequency table for the words in the sentences
# frequency_table = []
# 
# for sentence in cleaned_sentences:
#     # get counts of word frequency in the sentence
#     frequency = Counter(sentence)
#     frequency_table.append(frequency)
#     
# # print the frequency table for the first 5 sentences
# frequency_table[:5]

In [59]:
def initialize_blocks(sentences: List[Sentence], num_blocks: int) -> List[List[Sentence]]:
    """
    Split the sentences into `num_blocks` consecutive blocks of near-equal size.

    Every sentence ends up in exactly one block and the original order is kept.
    When the sentences cannot be divided evenly, the first blocks receive one
    extra sentence each. If there are fewer sentences than blocks, the trailing
    blocks are empty.
    :param sentences: the list of sentences to be split into blocks
    :param num_blocks: the number of blocks to initialize (must be at least 1)
    :return: a list of `num_blocks` blocks, each one a list of sentences
    :raises ValueError: if `num_blocks` is smaller than 1
    """
    if num_blocks < 1:
        raise ValueError(f"num_blocks must be at least 1, got {num_blocks}")

    # `remainder` sentences are left over after an even split; hand them out one
    # per block from the front so that no sentence is dropped.
    base_size, remainder = divmod(len(sentences), num_blocks)

    blocks = []
    start = 0
    for i in range(num_blocks):
        end = start + base_size + (1 if i < remainder else 0)
        blocks.append(sentences[start:end])
        start = end

    return blocks

# Example usage
# sentences = preprocess_text(raw_text)
# blocks = initialize_blocks(sentences, 3)
# for i, block in enumerate(blocks):
#     print(f"Block {i+1}:")
#     for sentence in block:
#         print(sentence)
#     print()

In [62]:
def calculate_intra_block_cohesion(block: Block) -> List[float]:
    """
    Compute the cohesion between each pair of adjacent sentences in a block.

    The word strings of the block's sentences are vectorized with TF-IDF (fitted
    on this block only); cohesion is the cosine similarity between the vectors of
    neighbouring sentences.
    :param block: the block whose sentences are compared
    :return: a list of len(block) - 1 scores, where entry i compares sentence i
        with sentence i + 1; empty when the block has fewer than two sentences
    """
    # With fewer than two sentences there is no adjacent pair to score. Returning
    # early also avoids fitting the vectorizer on an empty block.
    if len(block) < 2:
        return []

    # TF-IDF vectorization
    # NOTE(review): TfidfVectorizer is expected to raise ValueError when none of
    # the sentences yields a usable token - confirm whether callers need to handle that.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(block.get_string_sentences())

    # Calculate cosine similarity between adjacent sentences
    cohesion_scores = []
    for i in range(len(block) - 1):
        sim_score = cosine_similarity(tfidf_matrix[i], tfidf_matrix[i + 1])
        # Two single-row inputs give a 1x1 result; unwrap it into a plain float.
        cohesion_scores.append(float(sim_score[0][0]))

    return cohesion_scores

# Example usage
# Preprocess a short three-sentence text, wrap all of its sentences in a single
# Block and print the similarity of each adjacent sentence pair.
text = "This is a sample text. It demonstrates how to calculate intra-document cohesion between sentences. Intra-document cohesion is important for understanding the coherence of a document."
sentences = preprocess_text(text)
block = Block(sentences)
# Despite the singular name, this is a list with one score per adjacent pair.
cohesion_score = calculate_intra_block_cohesion(block)
print("Intra-document cohesion score:", cohesion_score)

3
Intra-document cohesion score: [0.0, 0.4179779031144455]


In [None]:
def text_tiling(text: str, num_blocks: int, max_block_size: int) -> List[List[Sentence]]:
    """
    Segment the text into coherent sections using the text tiling algorithm.
    1. Preprocess the text so that we have a list of Sentence objects
    2. Initialize blocks (lists of Sentences) with a fixed size
    3. Iterate until no change in the blocks
        - Calculate the intra-block cohesion for each Sentence
        - Find Sentences where the topic changes (the cohesion is lower than the one of the adjacent Sentences)
        - Move block boundaries to the topic change points
    4. Return the blocks of text
    :param text: the text to be segmented
    :param num_blocks: the number of blocks to split the text into
    :param max_block_size: the maximum number of Sentences in each block
    :return: the blocks as they stand once three consecutive iterations leave them unchanged
    """

    # TODO Refactor block. Now it is a class
    # NOTE(review): find_topic_change_points and move_block_boundaries are not
    # defined in the part of the notebook shown here - confirm they exist and
    # what block representation they expect and return.

    # Preprocess the text
    sentences = preprocess_text(text)

    # Initialize blocks of text
    blocks = initialize_blocks(sentences, num_blocks)

    n_times_no_change = 0
    # Iterate until no change in the blocks
    while n_times_no_change < 3:
        # Calculate the intra-group cohesion for each block
        cohesion_scores: List[float] = []
        for block in blocks:
            # calculate_intra_block_cohesion calls Block methods, while the blocks
            # handled here may still be plain lists of Sentences; wrap when needed.
            block_obj = block if isinstance(block, Block) else Block(block)
            # Accumulate in place: a bare `a + b` expression builds a new list
            # and throws it away, leaving cohesion_scores empty.
            cohesion_scores.extend(calculate_intra_block_cohesion(block_obj))

        # Find blocks of text where the topic changes
        change_points = find_topic_change_points(cohesion_scores)

        # Move block boundaries to the topic change points
        new_blocks = move_block_boundaries(blocks, change_points, max_block_size)

        # Check if the blocks have changed
        if new_blocks == blocks:
            n_times_no_change += 1
        else:
            n_times_no_change = 0
            blocks = new_blocks

    return blocks
    