### The following imports the libraries necessary for the chunking process


### *!!! This process had to be run on Google Colab's T4 GPU because it is computationally intensive. The code was transferred here from the Google Colab notebook after the files were processed and saved manually into the larger folder. The file must be adjusted before running outside of Google Colab !!!*

In [2]:
import torch
# Print whether PyTorch reports a usable CUDA device in the current runtime.
print(torch.cuda.is_available())


False


In [4]:
# Install the third-party packages: transformers, torch and nltk.
!pip install transformers torch nltk
# Remove torch and transformers without prompting (-y), then install them again.
# NOTE(review): presumably a clean reinstall to work around a version conflict
# in the Colab runtime -- confirm whether these two lines are still needed.
!pip uninstall torch transformers -y
!pip install torch transformers




### The following takes the cleaned files and iterates over them while chunking each one into semantic chunks determined using a RoBERTa model for text classification

In [None]:
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize
from transformers import RobertaTokenizer, RobertaModel
import os
import torch
import numpy as np
import nltk

# Download the punkt tokenizer
nltk.download('punkt')

# Tokenization function
def tokenize_text(text):
    """Split ``text`` into sentences.

    Thin wrapper around NLTK's ``sent_tokenize``; the result of that call is
    returned unchanged.
    """
    return sent_tokenize(text)

# Function to calculate semantic similarity between sentences using RoBERTa
def calculate_similarity(sentence_embedding, chunk_embedding):
    """Return the cosine similarity between two embedding vectors.

    Args:
        sentence_embedding: 1-D array-like embedding of a sentence.
        chunk_embedding: 1-D array-like embedding of the current chunk.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the similarity is
        undefined, so 0.0 is returned instead of NaN.
    """
    denominator = np.linalg.norm(sentence_embedding) * np.linalg.norm(chunk_embedding)
    # A zero vector would give 0/0 -> NaN plus a RuntimeWarning. Treat it as
    # "not similar", which is also how NaN behaves in a `> threshold` test.
    if denominator == 0:
        return 0.0
    return np.dot(sentence_embedding, chunk_embedding) / denominator

# Function to embed a sentence using RoBERTa
def embed_sentence(sentence, tokenizer, model):
    """Embed a sentence by mean-pooling the model's last hidden state.

    Works regardless of which device the model lives on: the tokenized inputs
    are moved to the model's device and the result is copied back to the CPU
    before conversion to NumPy.

    Args:
        sentence: Text to embed.
        tokenizer: Callable invoked with ``return_tensors="pt"``,
            ``truncation=True`` and ``max_length=512``; must return a mapping
            of input names to tensors.
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose ``last_hidden_state`` and the
            model must expose ``parameters()``.

    Returns:
        NumPy array containing ``last_hidden_state`` averaged over dim 1 with
        size-1 dimensions squeezed out (assumes a single-sentence batch, so
        the result is presumably a 1-D vector of hidden size -- confirm).
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)

    # Inputs must sit on the same device as the model weights, otherwise the
    # forward pass fails as soon as the model is moved to a GPU.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze()

    # Tensor.numpy() is only valid for CPU tensors; .cpu() is a no-op on CPU.
    return pooled.cpu().numpy()

# Semantic Chunking function
def chunk_text_semantic(sentences, tokenizer, model, max_tokens=400, similarity_threshold=0.7):
    """Greedily group consecutive sentences into semantically coherent chunks.

    Each sentence is embedded and compared (cosine similarity) with the
    running embedding of the current chunk. A sentence joins the chunk when
    the similarity is strictly greater than ``similarity_threshold``;
    otherwise the chunk is closed and the sentence starts a new one.

    Args:
        sentences: Iterable of sentence strings, in document order.
        tokenizer: Tokenizer passed to ``embed_sentence`` and used via
            ``tokenizer.tokenize`` to count tokens.
        model: Model passed to ``embed_sentence``.
        max_tokens: Token budget per chunk. The check happens *after* a
            sentence is added, so a chunk is closed once it exceeds the
            budget and may therefore be longer than ``max_tokens``.
        similarity_threshold: Minimum similarity (exclusive) for a sentence
            to be appended to the current chunk.

    Returns:
        List of chunk strings, each the member sentences joined by a space.
    """
    chunks = []
    current_chunk = []
    current_chunk_embedding = None
    # Running token total for current_chunk; kept incrementally so the whole
    # chunk is not re-tokenized after every sentence.
    current_chunk_tokens = 0

    for sentence in sentences:
        sentence_embedding = embed_sentence(sentence, tokenizer, model)
        sentence_tokens = len(tokenizer.tokenize(sentence))

        if current_chunk_embedding is None:
            # First sentence of a fresh chunk.
            current_chunk_embedding = sentence_embedding
            current_chunk.append(sentence)
            current_chunk_tokens += sentence_tokens
        else:
            similarity = calculate_similarity(sentence_embedding, current_chunk_embedding)
            if similarity > similarity_threshold:
                current_chunk.append(sentence)
                current_chunk_tokens += sentence_tokens
                # Halving on every update weights recent sentences more
                # heavily than a plain mean over the chunk would.
                current_chunk_embedding = (current_chunk_embedding + sentence_embedding) / 2
            else:
                # Topic shift: close the chunk and start a new one here.
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentence]
                current_chunk_tokens = sentence_tokens
                current_chunk_embedding = sentence_embedding

        # Close the chunk once it has grown past the token budget.
        if current_chunk_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_chunk_tokens = 0
            current_chunk_embedding = None

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Overlap function
def create_overlap_chunks(chunks, overlap_size=2):
    """Prefix each chunk with the trailing words of the chunk before it.

    Args:
        chunks: Sequence of chunk strings, in document order.
        overlap_size: Number of whitespace-separated words copied from the end
            of the previous chunk. Values of 0 or less disable the overlap.

    Returns:
        A new list the same length as ``chunks``. The first chunk is returned
        unchanged; every later chunk is rebuilt from its whitespace-split
        words (so runs of whitespace collapse to single spaces) with the
        overlap words placed in front.
    """
    overlapped_chunks = []
    previous_words = None

    for chunk in chunks:
        words = chunk.split()
        if previous_words is None:
            # Nothing precedes the first chunk, so it is kept verbatim.
            overlapped_chunks.append(chunk)
        else:
            # Guard the slice: with overlap_size == 0, `[-0:]` is `[0:]` and
            # would copy the *entire* previous chunk rather than nothing.
            overlap = previous_words[-overlap_size:] if overlap_size > 0 else []
            overlapped_chunks.append(" ".join(overlap + words))
        previous_words = words

    return overlapped_chunks

# Path to the folder where the cleaned text files are stored
# NOTE(review): both paths sit under /content/drive, which looks like a
# Google Drive mount inside Colab -- adjust them when running elsewhere.
folder_path = '/content/drive/MyDrive/cleaned_output_folder'  # Ensure this is the correct path
output_folder = '/content/drive/MyDrive/semantic_chunks'  # Folder to store the chunks

# Create the output folder if it doesn't exist. exist_ok=True replaces the
# separate exists() check, so re-running the cell is a no-op and there is no
# window between checking and creating.
os.makedirs(output_folder, exist_ok=True)

# Initialize RoBERTa tokenizer and model from the 'roberta-base' checkpoint
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
model.eval()  # Set the model to evaluation mode

# Function to read each text file
def read_text_file(file_path):
    """Return the entire contents of ``file_path``, decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# Iterate over each file in the folder
for filename in os.listdir(folder_path):
    # Only plain-text files are chunked; skip every other directory entry.
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(folder_path, filename)
    print(f"Processing {filename}...")

    # Pipeline: load file -> split into sentences -> group sentences into
    # semantic chunks -> prepend trailing words of the previous chunk.
    text = read_text_file(file_path)
    sentences = tokenize_text(text)
    chunks = chunk_text_semantic(sentences, tokenizer, model)
    overlapped_chunks = create_overlap_chunks(chunks)

    # One output file per input file, named "<input stem>_chunks.txt".
    chunk_file_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}_chunks.txt")
    with open(chunk_file_path, 'w', encoding='utf-8') as chunk_file:
        # Each chunk is followed by a blank line so chunks stay separable.
        chunk_file.writelines(chunk + "\n\n" for chunk in overlapped_chunks)

    print(f"Chunks of {filename} saved to {output_folder}")
