In [32]:
# Use the %pip magic so the package is installed into the running kernel's environment.
%pip install -q faker



In [33]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from faker import Faker
import sys

In [None]:
def setup_nltk():
    """Request the NLTK data packages 'punkt' and 'punkt_tab' without progress output.

    Each package is requested through ``nltk.download(..., quiet=True)``.
    If any call raises, the error and a manual recovery command are printed
    and the interpreter is asked to stop via ``sys.exit(1)``.

    Raises:
        SystemExit: with exit code 1 when a download call raises an exception.
    """
    packages = ('punkt', 'punkt_tab')
    try:
        for package in packages:
            # NOTE(review): nltk.download appears to signal many failures by
            # returning False instead of raising - confirm whether the return
            # value should also be inspected here.
            nltk.download(package, quiet=True)
    except Exception as err:
        manual_command = "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')"
        print(f"Error downloading NLTK data: {err}")
        print("Please run this command manually:")
        print(manual_command)
        sys.exit(1)

# Maximum number of tokens allowed in the context window.
MAX_TOKENS = 4_000



In [35]:
def count_tokens(text):
    """Count the tokens in ``text``, where a token is a whitespace-separated word.

    Args:
        text: Normally a string. Any object with a ``split()`` method is
            counted by the number of pieces that method returns.

    Returns:
        int: The number of whitespace-separated pieces. For inputs without a
        usable ``split()`` (for example a list of tokens) ``len(text)`` is
        returned instead.
    """
    try:
        return len(text.split())
    except (AttributeError, TypeError):
        # Not string-like: fall back to the container length. Only these two
        # errors are caught so KeyboardInterrupt / SystemExit still propagate.
        return len(text)

def split_tokens(text, max_length):
    """Split ``text`` into consecutive chunks of at most ``max_length`` words.

    Words are the whitespace-separated pieces of ``text``; each chunk is those
    words re-joined with single spaces, so runs of whitespace are collapsed.

    Args:
        text: The string to split.
        max_length: Maximum number of words per chunk; must be at least 1.

    Returns:
        list[str]: The chunks in order. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_length`` is smaller than 1. A zero step would fail
            inside ``range`` with an unrelated message, and a negative one
            would silently produce no chunks and drop the whole text.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length!r}")

    words = text.split()  # Using simple split instead of word_tokenize
    return [
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]



In [None]:
def create_summary(text, target_length=None):
    """
    Produce a "summary" of ``text`` by keeping only its leading words.

    When ``target_length`` is falsy (``None`` or ``0``) the text is handed
    back untouched. Otherwise the text is split on whitespace, sliced with
    ``[:target_length]`` and re-joined with single spaces.
    """
    if target_length:
        leading_words = text.split()[:target_length]
        return ' '.join(leading_words)

    return text

def calculate_target_lengths(len1, len2, total_target):
    """Split ``total_target`` between two texts in proportion to their lengths.

    Args:
        len1: Length of the first text (non-negative count).
        len2: Length of the second text (non-negative count).
        total_target: Combined budget to distribute.

    Returns:
        tuple[int, int]: ``(target1, target2)``. Each share is rounded down,
        so for integer inputs the two shares add up to ``total_target`` or one
        less. ``(0, 0)`` is returned when both lengths are zero.
    """
    total = len1 + len2
    if total == 0:
        return 0, 0

    # Multiply first and floor-divide afterwards. Going through the float
    # ratio ``len / total`` can land just below a whole number
    # (100 * (29 / 100) == 28.999999999999996) and lose a token to truncation.
    return (
        int(total_target * len1 // total),
        int(total_target * len2 // total),
    )


In [37]:

def summarize_long_text(text, target_length):
    """
    Shorten ``text`` so that it holds at most ``target_length`` tokens.

    The text is cut into chunks of ``target_length`` words, each chunk is
    reduced to an equal share of the budget with ``create_summary``, and the
    pieces are joined with single spaces.

    Args:
        text: The text to shorten.
        target_length: Maximum number of tokens in the result.

    Returns:
        str: ``text`` unchanged when it already fits; an empty string when the
        budget is zero or negative and the text does not fit; otherwise the
        shortened text.
    """
    # If text is already short enough, return it
    if count_tokens(text) <= target_length:
        return text

    # No room for any word. Returning early avoids asking split_tokens for
    # zero-sized chunks, which fails inside range().
    if target_length <= 0:
        return ''

    # Split into chunks and give every chunk an equal share of the budget
    chunks = split_tokens(text, target_length)
    per_chunk = target_length // len(chunks)

    if per_chunk == 0:
        # More chunks than budgeted words: create_summary treats a length of 0
        # as "no limit", so truncate the whole text directly instead.
        return create_summary(text, target_length)

    summaries = [create_summary(chunk, per_chunk) for chunk in chunks]

    # Combine summaries
    final_summary = ' '.join(summaries)

    # Safety net: if still too long, truncate again
    if count_tokens(final_summary) > target_length:
        final_summary = create_summary(final_summary, target_length)

    return final_summary



In [38]:
def save_text(text, filename):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content.

    Any exception raised while opening or writing is caught and reported
    with a printed message; nothing is raised and nothing is returned.
    """
    try:
        with open(filename, mode='w', encoding='utf-8') as out:
            out.write(text)
    except Exception as err:
        print(f"Error saving file {filename}: {err}")

def show_summary(filename, num_sentences=5):
    """Print a header and the first few sentences of a UTF-8 text file.

    Sentences are approximated by splitting on ``". "``; no tokenizer is used.

    Args:
        filename: Path of the file to preview.
        num_sentences: Maximum number of sentences to print.

    Any exception raised while reading or printing is caught and reported
    with a printed message instead of being raised.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            text = file.read()

        # Simple sentence splitting as fallback
        sentences = text.split('. ')[:num_sentences]
        preview = '. '.join(sentences).rstrip()

        # Splitting removes the period from every piece except the very last
        # one in the file, so only add a closing period when the preview does
        # not already end a sentence (avoids "..", and "." for an empty file).
        if preview and not preview.endswith(('.', '!', '?')):
            preview += '.'

        print(f"\n=== {filename} ===")
        print(preview)
    except Exception as e:
        print(f"Error reading file {filename}: {e}")



In [39]:

# Setup NLTK
setup_nltk()

# Generate sample texts. Faker is seeded first so that a fresh
# Restart Kernel -> Run All regenerates the same paragraphs (for a given
# Faker version) instead of different random text on every run.
Faker.seed(42)
fake = Faker()
text1 = " ".join(fake.paragraphs(nb=200))
text2 = " ".join(fake.paragraphs(nb=500))

# Get lengths and calculate targets: MAX_TOKENS is divided between the two
# texts in proportion to their token counts.
len1 = count_tokens(text1)
len2 = count_tokens(text2)
target1, target2 = calculate_target_lengths(len1, len2, MAX_TOKENS)

# Create and save summaries
summary1 = summarize_long_text(text1, target1)
summary2 = summarize_long_text(text2, target2)

save_text(summary1, 'summary1.txt')
save_text(summary2, 'summary2.txt')

# Display results
# NOTE: this message is printed unconditionally; save_text reports write
# errors itself by printing them.
print("\nSummaries created successfully!")
show_summary('summary1.txt')
show_summary('summary2.txt')




Summaries created successfully!

=== summary1.txt ===
Produce rather difficult. Several important over agency democratic smile. Political recent it glass wish method wind. Source and herself former work of science. Safe activity television those suggest effect area.

=== summary2.txt ===
But listen myself less usually within. Anything international magazine modern month small. Short event dog collection. Fast up light increase. Effect himself price free would traditional question.
