In [1]:
import os
import re
from typing import List, Dict
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [5]:
class TextProcessor:
    def __init__(self, model_name="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
    
    def get_first_n_sentences(self, text: str, n: int = 5) -> List[str]:
        """Get only the first n sentences from text."""
        sentences = []
        current_sentence = []
        
        for char in text:
            current_sentence.append(char)
            if char in '.!?':
                # Check for actual end of sentence (not e.g. "Mr.")
                sentence = ''.join(current_sentence).strip()
                if sentence and not sentence.endswith(('Mr.', 'Mrs.', 'Dr.', 'Ms.')):
                    sentences.append(sentence)
                    if len(sentences) >= n:
                        break
                    current_sentence = []
            
        return sentences
    
    def get_first_n_paragraphs(self, text: str, n: int = 5) -> List[str]:
        """Get only the first n paragraphs from text."""
        paragraphs = []
        current = []
        
        lines = text.split('\n')
        for line in lines:
            if line.strip():
                current.append(line.strip())
            elif current:  # Empty line and we have content
                paragraphs.append(' '.join(current))
                current = []
                if len(paragraphs) >= n:
                    break
        
        # Add the last paragraph if we haven't reached n
        if current and len(paragraphs) < n:
            paragraphs.append(' '.join(current))
            
        return paragraphs[:n]
    
    def get_embedding(self, text: str) -> torch.Tensor:
        """Get embedding for a piece of text using TinyLlama."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1)
        return embedding
    
    def process_text_file(self, file_path: str) -> Dict[str, Dict]:
        """Process a text file and return first n sentences, paragraphs, and full text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            full_text = file.read()
        
        # Get exactly 5 sentences and 5 paragraphs
        first_sentences = self.get_first_n_sentences(full_text, 5)
        first_paragraphs = self.get_first_n_paragraphs(full_text, 5)
        
        result = {
            'sentences': {
                'texts': first_sentences,
                'embeddings': [self.get_embedding(s) for s in first_sentences]
            },
            'paragraphs': {
                'texts': first_paragraphs,
                'embeddings': [self.get_embedding(p) for p in first_paragraphs]
            },
            'full_text': {
                'text': full_text,
                'embedding': self.get_embedding(full_text)
            }
        }
        
        return result

In [6]:
def process_all_files(data_dir: str) -> Dict[str, Dict]:
    """Process all text files in the data directory."""
    processor = TextProcessor()
    all_results = {}
    
    for filename in os.listdir(data_dir):
        if filename.endswith('.txt'):
            file_path = os.path.join(data_dir, filename)
            category = filename.replace('.txt', '')
            print(f"Processing {category}...")
            all_results[category] = processor.process_text_file(file_path)

    return all_results

In [8]:
if __name__ == "__main__":
    data_dir = "./data"
    results = process_all_files(data_dir)
    
    # Print summary for each category
    for category, data in results.items():
        print(f"\nCategory: {category}")
        print(f"Number of sentences processed: {len(data['sentences']['texts'])}")
        print(f"Number of paragraphs processed: {len(data['paragraphs']['texts'])}")
        print("\nFirst sentence:", data['sentences']['texts'][0][:100], "...")
        print("\nFirst paragraph:", data['paragraphs']['texts'][0][:100], "...")

FileNotFoundError: [Errno 2] No such file or directory: './data'