In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from transformers import AutoTokenizer, AutoModel
import torch
import re

In [2]:
def _split_sentences(text):
    sentences = re.split(r'(?<=[.?!])\s+', text)
    return sentences

In [3]:
def _combine_sentences(sentences):
    combined_sentences = []
    for i in range(len(sentences)):
        combined_sentence = sentences[i]
        if i > 0:
            combined_sentence = sentences[i-1] + ' ' + combined_sentence
        if i < len(sentences) - 1:
            combined_sentence += ' ' + sentences[i+1]
        combined_sentences.append(combined_sentence)
    return combined_sentences

In [4]:
def _calculate_cosine_distances(embeddings):
    distances = []
    for i in range(len(embeddings) - 1):
        similarity = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
        distance = 1 - similarity
        distances.append(distance)
    return distances

def get_embeddings(texts, model_name="BAAI/bge-small-en-v1.5"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        model_output = model(**encoded_input)

    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return embeddings.numpy()

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [5]:
def chunk_text(text):
    single_sentences_list = _split_sentences(text)
    print(single_sentences_list)
    combined_sentences = _combine_sentences(single_sentences_list)
    print(combined_sentences)
    embeddings = get_embeddings(combined_sentences)
    distances = _calculate_cosine_distances(embeddings)

    # Determine the threshold distance for identifying breakpoints based on the 80th percentile of all distances.
    breakpoint_percentile_threshold = 80
    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)
    # Find all indices where the distance exceeds the calculated threshold, indicating a potential chunk breakpoint.
    indices_above_thresh = [i for i, distance in enumerate(distances) if distance > breakpoint_distance_threshold]

    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        chunk = ' '.join(single_sentences_list[start_index:index+1])
        chunks.append(chunk)
        start_index = index + 1

    # If there are any sentences left after the last breakpoint, add them as the final chunk.
    if start_index < len(single_sentences_list):
        chunk = ' '.join(single_sentences_list[start_index:])
        chunks.append(chunk)

    return chunks

In [6]:
text = """
Regular exercise is essential for maintaining overall health and well-being. It helps in controlling weight,
improving cardiovascular health, and boosting mental health.
Engaging in physical activity regularly can also enhance the immune system, reduce the risk of chronic diseases,
and increase energy levels. Regular workouts are known to improve muscle strength and flexibility, which can prevent injuries and enhance mobility.
Moreover, exercise contributes to better sleep and improved mood, which are crucial for daily functioning.
Physical activity can also help reduce symptoms of anxiety and depression, leading to a more balanced emotional state.
Activities like walking, jogging, or swimming can be easily incorporated into a daily routine, making it accessible for everyone.
By setting realistic goals and staying consistent, individuals can enjoy these benefits and lead a healthier lifestyle.
Group fitness classes or sports teams can provide motivation and social support, making exercise more enjoyable and sustainable.
"""

In [7]:
chunks = chunk_text(text)
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:")
    print(chunk)
    print("----------------------------------------------------------------------------")

print(f"\nTotal number of chunks: {len(chunks)}")

['\nRegular exercise is essential for maintaining overall health and well-being.', 'It helps in controlling weight,\nimproving cardiovascular health, and boosting mental health.', 'Engaging in physical activity regularly can also enhance the immune system, reduce the risk of chronic diseases,\nand increase energy levels.', 'Regular workouts are known to improve muscle strength and flexibility, which can prevent injuries and enhance mobility.', 'Moreover, exercise contributes to better sleep and improved mood, which are crucial for daily functioning.', 'Physical activity can also help reduce symptoms of anxiety and depression, leading to a more balanced emotional state.', 'Activities like walking, jogging, or swimming can be easily incorporated into a daily routine, making it accessible for everyone.', 'By setting realistic goals and staying consistent, individuals can enjoy these benefits and lead a healthier lifestyle.', 'Group fitness classes or sports teams can provide motivation an

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Chunk 1:

Regular exercise is essential for maintaining overall health and well-being. It helps in controlling weight,
improving cardiovascular health, and boosting mental health. Engaging in physical activity regularly can also enhance the immune system, reduce the risk of chronic diseases,
and increase energy levels. Regular workouts are known to improve muscle strength and flexibility, which can prevent injuries and enhance mobility.
----------------------------------------------------------------------------
Chunk 2:
Moreover, exercise contributes to better sleep and improved mood, which are crucial for daily functioning. Physical activity can also help reduce symptoms of anxiety and depression, leading to a more balanced emotional state. Activities like walking, jogging, or swimming can be easily incorporated into a daily routine, making it accessible for everyone.
----------------------------------------------------------------------------
Chunk 3:
By setting realistic goals and 