# Web Mining and Applied NLP (44-620)

## Final Project: Article Summarizer

### Student Name: Jason A. Ballard 

Perform the tasks described in the Markdown cells below.  When you have completed the assignment make sure your code cells have all been run (and have output beneath them) and ensure you have committed and pushed ALL of your changes to your assignment repository.

You should bring in code from previous assignments to help you answer the questions below.

Every question that requires you to write code will have a code cell underneath it; you may either write your entire solution in that cell or write it in a python file (`.py`), then import and run the appropriate code to answer the question.

In [None]:
# Dependencies
import requests  # This is for making HTTP requests
from bs4 import BeautifulSoup   # This is for web scraping
from collections import Counter # This is a counter for counting words
import html5lib # This is a parser for BeautifulSoup
import ipykernel # This is the kernel for Jupyter Notebooks
import spacy # This is the natural language processing library
from spacytextblob import spacytextblob # This is a custom extension for spacy
import jupyterlab    
import matplotlib.pyplot as plt 
from wordcloud import WordCloud  
import nltk
from textblob import TextBlob
import statistics
from typing import Tuple, List
import numpy as np
import re
import pandas as pd
import seaborn as sns


print("All imports are working!")

In [None]:
# Notebook conversion
import nbconvert
import nbformat
from nbconvert import HTMLExporter
from nbconvert.preprocessors import ExecutePreprocessor
import os

print("All imports are working!")

In [None]:
# Quick test to confirm package availability
try:
    # Importing each package is the whole check: a missing one raises ImportError.
    import requests, bs4, pickle, collections, html5lib, ipykernel, spacy, spacytextblob, jupyterlab, matplotlib, wordcloud
    print("All packages are available!")
except ImportError as e:
    # Report the failing import instead of stopping the notebook with a traceback.
    print(f"Missing package: {e}")

In [None]:
# Load the en_core_web_sm pipeline into the notebook-level `nlp` variable.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): the spacytextblob pipe is left disabled; the later cells in
# this notebook compute polarity with TextBlob directly.
# nlp.add_pipe(spacytextblob)
print("Model loaded successfully!")

1. Find on the internet an article or blog post about a topic that interests you and you are able to get the text for using the technologies we have applied in the course.  Get the html for the article and store it in a file (which you must submit with your project)

In [None]:
# URL of the article
url = "https://www.gutenberg.org/cache/epub/18868/pg18868-images.html"

# Fetch the HTML content. The timeout stops the cell from hanging indefinitely
# on a stalled connection, and connection-level failures (DNS, refused
# connection, timeout) are reported the same way as a bad status code.
try:
    response = requests.get(url, timeout=30)
except requests.exceptions.RequestException as exc:
    print(f"Failed to fetch the article. Request error: {exc}")
else:
    if response.status_code == 200:
        html_content = response.text

        # Save HTML to a file so the later cells can work offline
        with open("kitchener.html", "w", encoding="utf-8") as file:
            file.write(html_content)

        print("HTML content successfully saved to 'kitchener.html'.")
    else:
        print(f"Failed to fetch the article. Status code: {response.status_code}")

2. Read in your article's html source from the file you created in question 1 and do sentiment analysis on the article/post's text (use `.get_text()`).  Print the polarity score with an appropriate label.  Additionally print the number of sentences in the original article (with an appropriate label)

In [None]:
# Download required NLTK data
def setup_nltk():
    """Fetch the NLTK resources used by this cell.

    Requests ``punkt`` and then ``averaged_perceptron_tagger``. Any exception
    raised while downloading is reported on stdout and re-raised to the caller.
    """
    required_packages = ('punkt', 'averaged_perceptron_tagger')
    try:
        for package in required_packages:
            nltk.download(package)
    except Exception as e:
        print(f"Error downloading NLTK data: {e}")
        raise

def clean_article_text(soup: BeautifulSoup) -> str:
    """Return the paragraph text of *soup* as a single space-joined string.

    The tree is modified in place: every ``script``, ``style``, ``nav``,
    ``header``, ``footer`` and ``aside`` element is removed before the
    stripped text of each remaining ``<p>`` element is collected.
    """
    unwanted_tags = ['script', 'style', 'nav', 'header', 'footer', 'aside']
    for node in soup.find_all(unwanted_tags):
        node.decompose()

    pieces = []
    for paragraph in soup.find_all('p'):
        pieces.append(paragraph.get_text().strip())
    return ' '.join(pieces)

def analyze_sentiment(text: str) -> Tuple[float, int]:
    """Return the TextBlob polarity of *text* and its sentence count.

    Args:
        text: The plain text to analyse.

    Returns:
        A ``(polarity, sentence_count)`` tuple. The count comes from
        TextBlob's sentence splitting; if that fails, the number of non-blank
        pieces obtained by splitting on ``'.'`` is used instead.
    """
    blob = TextBlob(text)

    # Overall sentiment of the whole text
    polarity = blob.sentiment.polarity

    try:
        sentence_count = len(blob.sentences)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit still propagate.
        # Rough fallback: count the non-empty pieces between full stops.
        sentence_count = sum(1 for piece in text.split('.') if piece.strip())

    return polarity, sentence_count

def main():
    """Print the polarity, sentence count and tone label for ``kitchener.html``.

    Any exception raised while reading or analysing the file is reported on
    stdout rather than propagated.
    """
    # NLTK data is fetched before any analysis is attempted
    setup_nltk()

    try:
        with open("kitchener.html", "r", encoding="utf-8") as source:
            raw_html = source.read()

        # Parse the page and reduce it to paragraph text
        article_text = clean_article_text(BeautifulSoup(raw_html, "html.parser"))

        overall_polarity, sentence_count = analyze_sentiment(article_text)

        divider = "=" * 50
        print("\nSentiment Analysis Results:")
        print(divider)
        print(f"Overall Polarity Score: {overall_polarity:.3f}")
        print(f"Total Sentences: {sentence_count}")

        # Map the numeric polarity onto a coarse verbal scale
        if overall_polarity <= -0.5:
            sentiment_interpretation = "very negative"
        elif overall_polarity < 0:
            sentiment_interpretation = "negative"
        elif overall_polarity == 0:
            sentiment_interpretation = "neutral"
        elif overall_polarity < 0.5:
            sentiment_interpretation = "positive"
        else:
            sentiment_interpretation = "very positive"

        print(f"\nInterpretation: The article's tone is {sentiment_interpretation}")
        print(divider)

    except Exception as e:
        print(f"Error during analysis: {e}")


if __name__ == "__main__":
    main()

3. Load the article text into a trained `spaCy` pipeline, and determine the 5 most frequent tokens (converted to lower case).  Print the common tokens with an appropriate label.  Additionally, print the tokens with their frequencies (with appropriate labels).

In [None]:
def get_cleaned_text(file_path: str) -> str:
    """Load an HTML file and return the text of its paragraphs as one string.

    Every ``script``, ``style``, ``nav``, ``header`` and ``footer`` element is
    removed from the parsed tree first; the stripped text of each remaining
    ``<p>`` element is then joined with single spaces.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    tree = BeautifulSoup(markup, "html.parser")

    boilerplate_tags = ['script', 'style', 'nav', 'header', 'footer']
    for node in tree.find_all(boilerplate_tags):
        node.decompose()

    paragraph_texts = [tag.get_text().strip() for tag in tree.find_all('p')]
    return ' '.join(paragraph_texts)

def analyze_token_frequency(text: str, nlp) -> List[Tuple[str, int]]:
    """Return the five most frequent lower-cased tokens of *text*.

    *nlp* is called on *text* and the resulting tokens are tallied. Stop
    words, punctuation, whitespace tokens and tokens whose stripped text is
    at most one character long are ignored. The result is a list of
    ``(token, count)`` pairs, most frequent first.
    """
    tally = Counter()
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token.text.strip()) <= 1:
            # Single characters carry little meaning as "frequent words"
            continue
        tally[token.text.lower()] += 1

    return tally.most_common(5)

def main():
    """Print the five most frequent tokens of ``kitchener.html`` with counts."""
    pipeline = spacy.load('en_core_web_sm')
    article_text = get_cleaned_text("kitchener.html")
    top_tokens = analyze_token_frequency(article_text, pipeline)

    # First listing: each token with its count
    print("\nTop 5 Most Common Words (excluding stopwords):")
    print("=" * 50)
    print("Word Frequencies:")
    for token_text, count in top_tokens:
        print(f"'{token_text}': {count} occurrences")

    # Second listing: the tokens alone, most frequent first
    print("\nWords in order of frequency:")
    print(", ".join(token_text for token_text, _ in top_tokens))


if __name__ == "__main__":
    main()

4. Load the article text into a trained `spaCy` pipeline, and determine the 5 most frequent lemmas (converted to lower case).  Print the common lemmas with an appropriate label.  Additionally, print the lemmas with their frequencies (with appropriate labels).

In [None]:
def get_cleaned_text(file_path: str) -> str:
    """Load an HTML file and return the text of its paragraphs as one string.

    Every ``script``, ``style``, ``nav``, ``header`` and ``footer`` element is
    removed from the parsed tree first; the stripped text of each remaining
    ``<p>`` element is then joined with single spaces.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    tree = BeautifulSoup(markup, "html.parser")

    boilerplate_tags = ['script', 'style', 'nav', 'header', 'footer']
    for node in tree.find_all(boilerplate_tags):
        node.decompose()

    paragraph_texts = [tag.get_text().strip() for tag in tree.find_all('p')]
    return ' '.join(paragraph_texts)

def analyze_lemma_frequency(text: str, nlp) -> List[Tuple[str, int]]:
    """Return the five most frequent lower-cased lemmas of *text*.

    *nlp* is called on *text* and the lemma of each token is tallied. Stop
    words, punctuation, whitespace tokens and tokens whose stripped lemma is
    at most one character long are ignored. The result is a list of
    ``(lemma, count)`` pairs, most frequent first.
    """
    tally = Counter()
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token.lemma_.strip()) <= 1:
            # Single characters carry little meaning as "frequent lemmas"
            continue
        tally[token.lemma_.lower()] += 1

    return tally.most_common(5)

def main():
    """Print the five most frequent lemmas of ``kitchener.html`` with counts."""
    pipeline = spacy.load('en_core_web_sm')
    article_text = get_cleaned_text("kitchener.html")
    top_lemmas = analyze_lemma_frequency(article_text, pipeline)

    divider = "=" * 50

    # First listing: each lemma with its count
    print("\nTop 5 Most Common Lemmas (excluding stopwords):")
    print(divider)
    print("\nLemma Frequencies:")
    for lemma_text, count in top_lemmas:
        print(f"'{lemma_text}': {count} occurrences")

    # Second listing: the lemmas alone, most frequent first
    print("\nLemmas in order of frequency:")
    print(", ".join(lemma_text for lemma_text, _ in top_lemmas))
    print(divider)


if __name__ == "__main__":
    main()

5. Make a list containing the scores (using tokens) of every sentence in the article, and plot a histogram with appropriate titles and axis labels of the scores. From your histogram, what seems to be the most common range of scores (put the answer in a comment after your code)?

In [None]:
def get_cleaned_text(file_path: str) -> str:
    """Load an HTML file and return the text of its paragraphs as one string.

    Every ``script``, ``style``, ``nav``, ``header`` and ``footer`` element is
    removed from the parsed tree first; the stripped text of each remaining
    ``<p>`` element is then joined with single spaces.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    tree = BeautifulSoup(markup, "html.parser")

    boilerplate_tags = ['script', 'style', 'nav', 'header', 'footer']
    for node in tree.find_all(boilerplate_tags):
        node.decompose()

    paragraph_texts = [tag.get_text().strip() for tag in tree.find_all('p')]
    return ' '.join(paragraph_texts)

def get_sentence_scores(text: str, nlp) -> List[int]:
    """Return one score per sentence of *text*, in document order.

    *nlp* is called on *text*; a sentence's score is the number of its tokens
    that are neither punctuation nor whitespace (stop words are counted).
    """
    def counts_as_word(token) -> bool:
        return not (token.is_punct or token.is_space)

    return [
        sum(1 for token in sentence if counts_as_word(token))
        for sentence in nlp(text).sents
    ]

def plot_score_histogram(scores: List[int]) -> None:
    """Draw a histogram of sentence scores on a new matplotlib figure.

    The bin count follows the Freedman-Diaconis rule. When the interquartile
    range is zero that rule gives a zero bin width, which would otherwise
    cause a division by zero, so Sturges' formula is used instead. Dashed
    vertical lines mark the mean and the median. The figure is left open for
    the caller to save or show.

    Args:
        scores: One score per sentence; must not be empty.

    Raises:
        ValueError: If *scores* is empty.
    """
    if len(scores) == 0:
        raise ValueError("scores must contain at least one value")

    plt.figure(figsize=(12, 6))

    # Calculate the number of bins using the Freedman-Diaconis rule
    q75, q25 = np.percentile(scores, [75, 25])
    iqr = q75 - q25
    if iqr > 0:
        bin_width = 2 * iqr / (len(scores) ** (1/3))
        n_bins = int(np.ceil((max(scores) - min(scores)) / bin_width))
    else:
        # Degenerate spread (e.g. most sentences share one score): fall back
        # to Sturges' formula, which depends only on the sample size.
        n_bins = int(np.ceil(np.log2(len(scores)))) + 1
    n_bins = max(1, n_bins)

    # Create histogram
    plt.hist(scores, bins=n_bins, edgecolor='black', alpha=0.7, color='skyblue')

    # Add titles and labels
    plt.title('Distribution of Sentence Lengths (Token Count)', pad=20, fontsize=14)
    plt.xlabel('Number of Tokens per Sentence', fontsize=12)
    plt.ylabel('Frequency (Number of Sentences)', fontsize=12)

    # Add grid for better readability
    plt.grid(True, alpha=0.3)

    # Mark the mean and median so the skew of the distribution is visible
    mean_score = np.mean(scores)
    median_score = np.median(scores)
    plt.axvline(mean_score, color='red', linestyle='dashed', alpha=0.8,
                label=f'Mean: {mean_score:.1f}')
    plt.axvline(median_score, color='green', linestyle='dashed', alpha=0.8,
                label=f'Median: {median_score:.1f}')

    plt.legend()

    # Adjust layout
    plt.tight_layout()

def main():
    """Score each sentence by token count, print statistics and plot them.

    The histogram is written to ``sentence_length_histogram.png`` and also
    displayed. If the article yields no sentences, a message is printed and
    nothing is plotted.
    """
    # Load spaCy model
    nlp = spacy.load('en_core_web_sm')

    # Get clean text from HTML
    clean_text = get_cleaned_text("kitchener.html")

    # Get sentence scores
    scores = get_sentence_scores(clean_text, nlp)

    # min()/max() and the histogram are undefined for an empty list
    if not scores:
        print("No sentences were found in the article text; nothing to plot.")
        return

    # Print basic statistics
    print("\nSentence Length Statistics:")
    print("=" * 50)
    print(f"Total sentences: {len(scores)}")
    print(f"Average sentence length: {np.mean(scores):.1f} tokens")
    print(f"Median sentence length: {np.median(scores):.1f} tokens")
    print(f"Shortest sentence: {min(scores)} tokens")
    print(f"Longest sentence: {max(scores)} tokens")

    # Create the histogram, save it, then display it so the plot appears
    # beneath the cell as well as on disk.
    plot_score_histogram(scores)
    plt.savefig('sentence_length_histogram.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()


if __name__ == "__main__":
    main()

# Based on the histogram output, the most common range of sentence scores appears 
# to be between 10-15 tokens per sentence, suggesting that the article primarily 
# uses medium-length sentences typical of explanatory web content.

6. Make a list containing the scores (using lemmas) of every sentence in the article, and plot a histogram with appropriate titles and axis labels of the scores.  From your histogram, what seems to be the most common range of scores (put the answer in a comment after your code)?

In [None]:
def get_cleaned_text(file_path: str) -> str:
    """Load an HTML file and return the text of its paragraphs as one string.

    Every ``script``, ``style``, ``nav``, ``header`` and ``footer`` element is
    removed from the parsed tree first; the stripped text of each remaining
    ``<p>`` element is then joined with single spaces.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    tree = BeautifulSoup(markup, "html.parser")

    boilerplate_tags = ['script', 'style', 'nav', 'header', 'footer']
    for node in tree.find_all(boilerplate_tags):
        node.decompose()

    paragraph_texts = [tag.get_text().strip() for tag in tree.find_all('p')]
    return ' '.join(paragraph_texts)

def get_lemma_scores(text: str, nlp) -> List[int]:
    """Return one lemma-based score per sentence of *text*, in document order.

    *nlp* is called on *text*; a sentence's score is the number of lemmas
    taken from its tokens that are neither punctuation nor whitespace. One
    lemma is collected per kept token, so the score equals the count of such
    tokens.
    """
    per_sentence = []
    for sentence in nlp(text).sents:
        kept_lemmas = [
            tok.lemma_ for tok in sentence
            if not (tok.is_punct or tok.is_space)
        ]
        per_sentence.append(len(kept_lemmas))
    return per_sentence

def plot_score_histogram(scores: List[int]) -> None:
    """Draw a histogram of lemma-based sentence scores on a new figure.

    The bin count follows the Freedman-Diaconis rule. When the interquartile
    range is zero that rule gives a zero bin width, which would otherwise
    cause a division by zero, so Sturges' formula is used instead. Dashed
    vertical lines mark the mean and the median. The figure is left open for
    the caller to save or show.

    Args:
        scores: One score per sentence; must not be empty.

    Raises:
        ValueError: If *scores* is empty.
    """
    if len(scores) == 0:
        raise ValueError("scores must contain at least one value")

    plt.figure(figsize=(12, 6))

    # Calculate the number of bins using the Freedman-Diaconis rule
    q75, q25 = np.percentile(scores, [75, 25])
    iqr = q75 - q25
    if iqr > 0:
        bin_width = 2 * iqr / (len(scores) ** (1/3))
        n_bins = int(np.ceil((max(scores) - min(scores)) / bin_width))
    else:
        # Degenerate spread (e.g. most sentences share one score): fall back
        # to Sturges' formula, which depends only on the sample size.
        n_bins = int(np.ceil(np.log2(len(scores)))) + 1
    n_bins = max(1, n_bins)

    # Create histogram
    plt.hist(scores, bins=n_bins, edgecolor='black', alpha=0.7, color='skyblue')

    # Add titles and labels
    plt.title('Distribution of Sentence Lengths (Lemma Count)', pad=20, fontsize=14)
    plt.xlabel('Number of Lemmas per Sentence', fontsize=12)
    plt.ylabel('Frequency (Number of Sentences)', fontsize=12)

    # Add grid for better readability
    plt.grid(True, alpha=0.3)

    # Mark the mean and median so the skew of the distribution is visible
    mean_score = np.mean(scores)
    median_score = np.median(scores)
    plt.axvline(mean_score, color='red', linestyle='dashed', alpha=0.8,
                label=f'Mean: {mean_score:.1f}')
    plt.axvline(median_score, color='green', linestyle='dashed', alpha=0.8,
                label=f'Median: {median_score:.1f}')

    plt.legend()

    # Adjust layout
    plt.tight_layout()

def main():
    """Score each sentence by lemma count, print statistics and plot them.

    The histogram is written to ``sentence_length_lemma_histogram.png`` and
    also displayed. If the article yields no sentences, a message is printed
    and nothing is plotted.
    """
    # Load spaCy model
    nlp = spacy.load('en_core_web_sm')

    # Get clean text from HTML
    clean_text = get_cleaned_text("kitchener.html")

    # Get sentence scores
    scores = get_lemma_scores(clean_text, nlp)

    # min()/max() and the histogram are undefined for an empty list
    if not scores:
        print("No sentences were found in the article text; nothing to plot.")
        return

    # Print basic statistics
    print("\nSentence Length Statistics (Lemma-based):")
    print("=" * 50)
    print(f"Total sentences: {len(scores)}")
    print(f"Average sentence length: {np.mean(scores):.1f} lemmas")
    print(f"Median sentence length: {np.median(scores):.1f} lemmas")
    print(f"Shortest sentence: {min(scores)} lemmas")
    print(f"Longest sentence: {max(scores)} lemmas")

    # Create the histogram, save it, then display it so the plot appears
    # beneath the cell as well as on disk.
    plot_score_histogram(scores)
    plt.savefig('sentence_length_lemma_histogram.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()


if __name__ == "__main__":
    main()

# Based on the histogram output, the most common range of sentence scores appears 
# to be between 10-15 lemmas per sentence, indicating that the article uses 
# moderately complex sentences with a consistent pattern of lemma usage.

7. Using the histograms from questions 5 and 6, decide a "cutoff" score for tokens and lemmas such that fewer than half the sentences would have a score greater than the cutoff score.  Record the scores in this Markdown cell

The earlier analyses found 82 total sentences, so the cutoff scores below are chosen so that fewer than half (41) of the sentences score above them.

* Cutoff Score (tokens): 15
* Cutoff Score (lemmas): 14 

These cutoff scores were chosen because:
1. They're positioned near the median values in our distributions
2. They should capture the most substantive sentences while excluding shorter ones
3. They should give us between 6-10 sentences for our summary, which would be about 7-12% of the total 82 sentences
4. The lemma cutoff is slightly lower than the token cutoff since lemmatization typically reduces word count by combining different forms of the same word

The number of sentences actually selected by these cutoff values is reported in questions 9 and 11.

8. Create a summary of the article by going through every sentence in the article and adding it to an (initially) empty list if its score (based on tokens) is greater than the cutoff score you identified in question 7.  If your loop variable is named `sent`, you may find it easier to add `sent.text.strip()` to your list of sentences.  Print the summary (I would cleanly generate the summary text by `join`ing the strings in your list together with a space (`' '.join(sentence_list)`)).

In [None]:
def get_cleaned_text(file_path: str) -> str:
    """Load an HTML file and return the text of its paragraphs as one string.

    Every ``script``, ``style``, ``nav``, ``header`` and ``footer`` element is
    removed from the parsed tree first; the stripped text of each remaining
    ``<p>`` element is then joined with single spaces.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    tree = BeautifulSoup(markup, "html.parser")

    boilerplate_tags = ['script', 'style', 'nav', 'header', 'footer']
    for node in tree.find_all(boilerplate_tags):
        node.decompose()

    paragraph_texts = [tag.get_text().strip() for tag in tree.find_all('p')]
    return ' '.join(paragraph_texts)

def _token_summary_sentences(text: str, nlp, token_cutoff: int = 15) -> List[str]:
    """Return the stripped text of each sentence scoring above *token_cutoff*.

    A sentence's score is the number of its tokens that are neither
    punctuation nor whitespace.
    """
    selected = []
    for sent in nlp(text).sents:
        token_count = sum(
            1 for token in sent
            if not token.is_punct and not token.is_space
        )
        if token_count > token_cutoff:
            selected.append(sent.text.strip())
    return selected


def generate_summary(text: str, nlp, token_cutoff: int = 15) -> str:
    """Generate a summary from the sentences whose token count exceeds the cutoff.

    Args:
        text: The article text.
        nlp: A callable (a loaded spaCy pipeline) that turns *text* into a
            document exposing ``.sents``.
        token_cutoff: Sentences must score strictly above this value.

    Returns:
        The selected sentences joined with single spaces, or an empty string
        when no sentence qualifies.
    """
    return ' '.join(_token_summary_sentences(text, nlp, token_cutoff))


def main():
    """Print the token-based summary of ``kitchener.html`` and its sentence count."""
    # Load spaCy model
    nlp = spacy.load('en_core_web_sm')

    # Get clean text from HTML
    clean_text = get_cleaned_text("kitchener.html")

    # Keep the list of selected sentences so the count reported below is
    # exact; splitting the joined text on '.' miscounts (abbreviations,
    # decimals, and the empty piece after the final full stop).
    summary_sentences = _token_summary_sentences(clean_text, nlp)
    summary = ' '.join(summary_sentences)

    print("\nArticle Summary (based on token count > 15):")
    print("=" * 50)
    print(summary)
    print("\nNumber of sentences in summary:", len(summary_sentences))


if __name__ == "__main__":
    main()

9. Print the polarity score of your summary you generated with the token scores (with an appropriate label). Additionally, print the number of sentences in the summarized article.

In [None]:
# Ensure we have NLTK data
nltk.download('punkt')

# Load spaCy and get text
nlp = spacy.load('en_core_web_sm')

# Read HTML
with open("kitchener.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file.read(), "html.parser")

# Remove the same non-article elements that get_cleaned_text() strips in
# question 8, so this cell scores the same summary that question 8 printed.
for element in soup.find_all(['script', 'style', 'nav', 'header', 'footer']):
    element.decompose()

# Get clean text
text = ' '.join(p.get_text().strip() for p in soup.find_all('p'))

# Process text with spaCy
doc = nlp(text)

# Keep sentences whose token count (ignoring punctuation and whitespace)
# exceeds the token cutoff recorded in question 7.
TOKEN_CUTOFF = 15
summary_sentences = []
for sent in doc.sents:
    token_count = sum(1 for token in sent if not token.is_punct and not token.is_space)
    if token_count > TOKEN_CUTOFF:
        summary_sentences.append(sent.text.strip())

# Create summary text
summary = ' '.join(summary_sentences)

# Calculate metrics
polarity = TextBlob(summary).sentiment.polarity
sentence_count = len(summary_sentences)

# Print results with labels
print("Summary Analysis:")
print("-" * 30)
print(f"Polarity Score: {polarity:.3f}")
print(f"Number of Sentences: {sentence_count}")

10. Create a summary of the article by going through every sentence in the article and adding it to an (initially) empty list if its score (based on lemmas) is greater than the cutoff score you identified in question 7.  If your loop variable is named `sent`, you may find it easier to add `sent.text.strip()` to your list of sentences.  Print the summary (I would cleanly generate the summary text by `join`ing the strings in your list together with a space (`' '.join(sentence_list)`)).

In [None]:
def get_cleaned_text(file_path: str) -> str:
    """Load an HTML file and return the text of its paragraphs as one string.

    Every ``script``, ``style``, ``nav``, ``header`` and ``footer`` element is
    removed from the parsed tree first; the stripped text of each remaining
    ``<p>`` element is then joined with single spaces.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    tree = BeautifulSoup(markup, "html.parser")

    boilerplate_tags = ['script', 'style', 'nav', 'header', 'footer']
    for node in tree.find_all(boilerplate_tags):
        node.decompose()

    paragraph_texts = [tag.get_text().strip() for tag in tree.find_all('p')]
    return ' '.join(paragraph_texts)

def _lemma_summary_sentences(text: str, nlp, lemma_cutoff: int = 14) -> List[str]:
    """Return the stripped text of each sentence scoring above *lemma_cutoff*.

    A sentence's score is the number of lemmas taken from its tokens that are
    neither punctuation nor whitespace.
    """
    selected = []
    for sent in nlp(text).sents:
        lemmas = [
            token.lemma_ for token in sent
            if not token.is_punct and not token.is_space
        ]
        if len(lemmas) > lemma_cutoff:
            selected.append(sent.text.strip())
    return selected


def generate_lemma_summary(text: str, nlp, lemma_cutoff: int = 14) -> str:
    """Generate a summary from the sentences whose lemma count exceeds the cutoff.

    Args:
        text: The article text.
        nlp: A callable (a loaded spaCy pipeline) that turns *text* into a
            document exposing ``.sents``.
        lemma_cutoff: Sentences must score strictly above this value.

    Returns:
        The selected sentences joined with single spaces, or an empty string
        when no sentence qualifies.
    """
    return ' '.join(_lemma_summary_sentences(text, nlp, lemma_cutoff))


def main():
    """Print the lemma-based summary of ``kitchener.html`` and its sentence count."""
    # Load spaCy model
    nlp = spacy.load('en_core_web_sm')

    # Get clean text
    clean_text = get_cleaned_text("kitchener.html")

    # Keep the list of selected sentences so the count reported below is
    # exact; splitting the joined text on '.' miscounts (abbreviations,
    # decimals, and the empty piece after the final full stop).
    summary_sentences = _lemma_summary_sentences(clean_text, nlp)
    summary = ' '.join(summary_sentences)

    # Print summary
    print("\nArticle Summary (based on lemma count > 14):")
    print("=" * 80)
    print(summary)
    print("\nNumber of sentences in summary:", len(summary_sentences))


if __name__ == "__main__":
    main()

11. Print the polarity score of your summary you generated with the lemma scores (with an appropriate label). Additionally, print the number of sentences in the summarized article.

In [None]:
# Ensure we have NLTK data
nltk.download('punkt')

# Load spaCy and get text
nlp = spacy.load('en_core_web_sm')

# Read HTML
with open("kitchener.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file.read(), "html.parser")

# Remove the same non-article elements that get_cleaned_text() strips in
# question 10, so this cell scores the same summary that question 10 printed.
for element in soup.find_all(['script', 'style', 'nav', 'header', 'footer']):
    element.decompose()

# Get clean text
text = ' '.join(p.get_text().strip() for p in soup.find_all('p'))

# Process text with spaCy
doc = nlp(text)

# Keep sentences whose lemma count (ignoring punctuation and whitespace)
# exceeds the lemma cutoff recorded in question 7.
LEMMA_CUTOFF = 14
summary_sentences = []
for sent in doc.sents:
    lemmas = [token.lemma_ for token in sent
              if not token.is_punct
              and not token.is_space]
    if len(lemmas) > LEMMA_CUTOFF:
        summary_sentences.append(sent.text.strip())

# Create summary text
summary = ' '.join(summary_sentences)

# Calculate metrics
polarity = TextBlob(summary).sentiment.polarity
sentence_count = len(summary_sentences)

# Print results with labels
print("Lemma-based Summary Analysis:")
print("-" * 30)
print(f"Polarity Score: {polarity:.3f}")
print(f"Number of Sentences: {sentence_count}")

12.  Compare your polarity scores of your summaries to the polarity scores of the initial article.  Is there a difference?  Why do you think that may or may not be?  Answer in this Markdown cell.

Looking at results:
1. 

13. Based on your reading of the original article, which summary do you think is better (if there's a difference).  Why do you think this might be?

14. Heatmap by chapter to determine sentiment by chapter


In [None]:
# Load and Extract Text
with open("kitchener.html", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file.read(), "html.parser")

# Extract the text from the HTML file.
# The whole document's text is taken here, joined with newlines, rather than
# only the <p> elements used in the earlier questions.
text = soup.get_text(separator="\n")

# Debug: Preview the first 1000 characters to check chapter formatting
print("Preview of text:", text[:1000])

def preprocess_text_chunking(text: str, chunk_size: int = 5000) -> List[Tuple[str, str]]:
    """
    Splits the text into consecutive chunks of at most ``chunk_size`` characters.

    Chunk boundaries fall on raw character offsets, so a chunk may begin or
    end in the middle of a word or paragraph. Leading and trailing whitespace
    is stripped from each chunk after splitting.

    :param text: The entire text as a string.
    :param chunk_size: The number of characters per chunk; must be positive.
    :return: A list of ``("Chunk N", content)`` tuples, numbered from 1.
             An empty text yields an empty list.
    :raises ValueError: If ``chunk_size`` is not positive.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step silently produces no chunks, so reject both explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    return [
        (f"Chunk {number}", text[start:start + chunk_size].strip())
        for number, start in enumerate(range(0, len(text), chunk_size), start=1)
    ]

# Replace chapters parsing with chunking.
# NOTE: the variable keeps the name `chapters`, but each entry is a
# ("Chunk N", content) pair of fixed character length, not a real chapter.
chapters = preprocess_text_chunking(text, chunk_size=5000)  # Adjust chunk size as needed

# Debug: Print the number of chunks and preview their labels
print(f"Total chunks created: {len(chapters)}")
for i, (title, content) in enumerate(chapters[:5]):  # Preview first 5 chunks
    print(f"{title}: {content[:100]}...")  # Show the first 100 characters of each chunk

# Analyze Sentiment
def analyze_sentiment(chapter_content):
    """Return a TextBlob polarity score for each paragraph of *chapter_content*.

    Paragraphs are the pieces between blank lines (``"\\n\\n"``). Each one is
    lower-cased, trimmed, and has every character that is neither a word
    character nor whitespace removed; paragraphs left empty are skipped.
    Scores are returned in paragraph order.
    """
    polarities = []
    for raw_paragraph in chapter_content.split("\n\n"):
        normalized = re.sub(r'[^\w\s]', '', raw_paragraph.lower().strip())
        if not normalized:
            continue
        polarities.append(TextBlob(normalized).sentiment.polarity)
    return polarities

# Score every entry of `chapters` (defined earlier in the notebook); each
# element of sentiment_data is that entry's list of paragraph polarities.
sentiment_data = []
for chapter_title, chapter_content in chapters:
    scores = analyze_sentiment(chapter_content)
    print(f"Scores for {chapter_title}: {scores[:5]}")  # Debug: print first 5 scores
    sentiment_data.append(scores)

if not sentiment_data:
    raise ValueError("No sentiment data was generated. Check paragraph extraction or content validity.")

# Prepare Data for Heatmap
# The widest row sets the number of columns.
max_paragraphs = max(len(scores) for scores in sentiment_data)
if max_paragraphs == 0:
    raise ValueError("All chapters appear to be empty. Check input text or preprocessing steps.")

# Rows can have different lengths, so start from an all-NaN grid and fill
# only the leading cells that have a score.
heatmap_data = np.full((len(sentiment_data), max_paragraphs), np.nan)
for i, scores in enumerate(sentiment_data):
    heatmap_data[i, :len(scores)] = scores

heatmap_df = pd.DataFrame(
    heatmap_data,
    index=[f"{title} ({i+1})" for i, (title, _) in enumerate(chapters)],  # Unique chapter labels
    columns=[f'Paragraph {i+1}' for i in range(max_paragraphs)]
)

# Visualize Heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(
    heatmap_df,
    cmap="coolwarm",
    cbar_kws={'label': 'Sentiment Polarity'},
    annot=False,
    mask=heatmap_df.isnull(),  # hide the NaN padding cells
    vmin=-1, vmax=1  # fixed colour limits, symmetric around zero
)
plt.title("Sentiment Heatmap of Chapters in 'With Kitchener in the Soudan'", fontsize=16)
plt.xlabel("Paragraphs")
plt.ylabel("Chapters")
plt.xticks(rotation=30)
plt.locator_params(axis="x", nbins=10)  # Show fewer paragraph labels
# plt.tight_layout()

# Save and Show Heatmap
plt.savefig("sentiment_heatmap.png", dpi=300)
plt.show()


In [None]:
# Analyze Sentiment
def analyze_sentiment(chapter_content):
    """Return a TextBlob polarity score for each paragraph of *chapter_content*.

    Paragraphs are the pieces between blank lines (``"\\n\\n"``). Each one is
    lower-cased, trimmed, and has every character that is neither a word
    character nor whitespace removed; paragraphs left empty are skipped.
    Scores are returned in paragraph order.
    """
    polarities = []
    for raw_paragraph in chapter_content.split("\n\n"):
        normalized = re.sub(r'[^\w\s]', '', raw_paragraph.lower().strip())
        if not normalized:
            continue
        polarities.append(TextBlob(normalized).sentiment.polarity)
    return polarities

# Score every entry of `chapters` (which must already exist from an earlier
# cell); each element of sentiment_data is that entry's list of paragraph
# polarities.
sentiment_data = []
for chapter_title, chapter_content in chapters:
    scores = analyze_sentiment(chapter_content)
    print(f"Scores for {chapter_title}: {scores[:5]}")  # Debug: print first 5 scores
    sentiment_data.append(scores)

if not sentiment_data:
    raise ValueError("No sentiment data was generated. Check paragraph extraction or content validity.")

# Prepare Data for Heatmap
# The widest row sets the number of columns.
max_paragraphs = max(len(scores) for scores in sentiment_data)
if max_paragraphs == 0:
    raise ValueError("All chapters appear to be empty. Check input text or preprocessing steps.")

# Rows can have different lengths, so start from an all-NaN grid and fill
# only the leading cells that have a score.
heatmap_data = np.full((len(sentiment_data), max_paragraphs), np.nan)
for i, scores in enumerate(sentiment_data):
    heatmap_data[i, :len(scores)] = scores

heatmap_df = pd.DataFrame(
    heatmap_data,
    index=[f"{title} ({i+1})" for i, (title, _) in enumerate(chapters)],  # Unique chapter labels
    columns=[f'Paragraph {i+1}' for i in range(max_paragraphs)]
)

# Visualize Heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(
    heatmap_df,
    cmap="coolwarm",
    cbar_kws={'label': 'Sentiment Polarity'},
    annot=False,
    mask=heatmap_df.isnull(),  # hide the NaN padding cells
    vmin=-1, vmax=1  # fixed colour limits, symmetric around zero
)
plt.title("Sentiment Heatmap of Chapters in 'With Kitchener in the Soudan'", fontsize=16)
plt.xlabel("Paragraphs")
plt.ylabel("Chapters")
plt.xticks(rotation=30)
plt.locator_params(axis="x", nbins=10)  # Show fewer paragraph labels
# plt.tight_layout()

# Save and Show Heatmap
# NOTE: this writes to the path "sentiment_heatmap.png", replacing any
# existing file of that name.
plt.savefig("sentiment_heatmap.png", dpi=300)
plt.show()
