In [61]:
# # 20 Newsgroups Dataset Preprocessing Tutorial

# This notebook demonstrates how to preprocess the 20 Newsgroups dataset for NLP tasks. We'll walk through each step of cleaning and preparing the text data, including:

# 1. Loading and parsing the raw text files
# 2. Cleaning the text (removing punctuation, numbers, emails, URLs)
# 3. Tokenization
# 4. Stopword removal
# 5. Lemmatization
# 6. Creating a final clean DataFrame

# The output will be a CSV file containing processed text data ready for NLP tasks.

# ## Setup and Dependencies

# First, let's install and import the required packages. We'll use:
# - pandas: for data manipulation
# - nltk: for text processing
# - spacy: for advanced NLP tasks
# - re: for regular expressions

In [62]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
from pathlib import Path

# Download required NLTK data
print("Downloading required NLTK data...")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
print("NLTK downloads complete!")

# Verify punkt tokenizer is available
try:
    nltk.data.find('tokenizers/punkt')
    print("Punkt tokenizer is available!")
except LookupError:
    print("Retrying Punkt download...")
    nltk.download('punkt', quiet=False)

# Load English language model for spaCy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Downloading spaCy English model...")
    !python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')

# Import specific NLTK modules
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer and get stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

Downloading required NLTK data...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\msabdelnasser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\msabdelnasser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\msabdelnasser\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\msabdelnasser\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


NLTK downloads complete!
Punkt tokenizer is available!


## Data Loading and Parsing

Now we'll load and parse the dataset. The data is organized as follows:
1. A `list.csv` file mapping document IDs to newsgroups
2. 20 text files, each containing multiple documents
3. Each document has a header with newsgroup, ID, author, and subject
4. The body text follows until the next document header

Let's start by reading the list.csv file and creating functions to parse the text files.

In [63]:
# Read the document ID to newsgroup mapping
df_list = pd.read_csv(r'C:\Projects\Training\AI-project-structured-code-template\NLP\data\list.csv')
print("Number of documents:", len(df_list))
print("\nSample of list.csv:")
print(df_list.head())

Number of documents: 628

Sample of list.csv:
            newsgroup  document_id
0  talk.religion.misc        82757
1  talk.religion.misc        82758
2  talk.religion.misc        82759
3  talk.religion.misc        82760
4  talk.religion.misc        82763


In [64]:
import re
import pandas as pd
from pathlib import Path
from typing import List, Tuple, Dict

def read_file(file_path: str) -> List[str]:
    """
    Read the file and return its contents as a list of lines.

    Args:
        file_path: Path to the newsgroup file

    Returns:
        List of strings, each string being a line from the file
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.readlines()

def is_header_start(line: str) -> bool:
    """
    Check if a line is the start of a message (starts with 'Newsgroup:').

    Args:
        line: The line to check

    Returns:
        bool: True if this is a message header start
    """
    return line.strip().startswith('Newsgroup:')

def get_newsgroup_from_path(file_path: str) -> str:
    """
    Extract newsgroup name from the file path.

    Args:
        file_path: Path to the newsgroup file

    Returns:
        str: Name of the newsgroup
    """
    return Path(file_path).stem

def extract_messages(lines: List[str], newsgroup: str) -> List[Tuple[str, str]]:
    """
    Extract individual messages from the file content.

    Args:
        lines: List of lines from the file
        newsgroup: Name of the newsgroup

    Returns:
        List of tuples (newsgroup, message_body)
    """
    messages = []
    current_message = []
    in_message = False

    for i in range(len(lines)):
        current_line = lines[i].rstrip()  # Remove trailing whitespace

        # Check if this is a new message header
        if is_header_start(current_line):
            if in_message and current_message:
                # Save the previous message
                messages.append((newsgroup, ''.join(current_message)))
                current_message = []
            in_message = True
            current_message = [current_line + '\n']  # Start new message
        elif in_message:
            current_message.append(current_line + '\n')

    # Don't forget the last message
    if current_message:
        messages.append((newsgroup, ''.join(current_message)))

    return messages

def parse_document(file_path: str) -> pd.DataFrame:
    """
    Process a newsgroup file and return a DataFrame with newsgroup and body columns.

    Args:
        file_path: Path to the newsgroup file

    Returns:
        pandas DataFrame with columns ['newsgroup', 'body']
    """
    # Get newsgroup name from file path
    newsgroup = get_newsgroup_from_path(file_path)

    # Read the file
    lines = read_file(file_path)

    # Extract messages
    messages = extract_messages(lines, newsgroup)

    # Create DataFrame
    df = pd.DataFrame(messages, columns=['newsgroup', 'body'])

    # Display some statistics
    print(f"File: {Path(file_path).name}")
    print(f"Total messages found: {len(df)}")
    if len(df) > 0:
        print("\nFirst message preview:")
        print("Newsgroup:", df.iloc[0]['newsgroup'])
        print("Message start (first 200 chars):", df.iloc[0]['body'][:200])

    return df

## Text Cleaning

Now we'll implement text cleaning functions to:
1. Convert text to lowercase
2. Remove email addresses
3. Remove URLs
4. Remove punctuation and numbers
5. Remove extra whitespace

We'll create a pipeline of cleaning functions that can be applied to both subject and body text.

In [66]:
def clean_text(text):
    """Apply all cleaning steps to the text."""
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Apply cleaning to body text
df['body_clean'] = df['body'].apply(clean_text)

print("Sample cleaned text:")
print("\nOriginal body (first 200 chars):", df.iloc[0]['body'][:200])
print("Cleaned body (first 200 chars):", df.iloc[0]['body_clean'][:200])

Sample cleaned text:

Original body (first 200 chars): From: mathew <mathew@mantis.co.uk>
Subject: Alt.Atheism FAQ: Atheist Resources

Archive-name: atheism/resources
Alt-atheism-archive-name: resources
Last-modified: 11 December 1992
Version: 1.0

      
Cleaned body (first 200 chars): from mathew subject alt atheism faq atheist resources archive name atheism resources alt atheism archive name resources last modified december version atheist resources addresses of atheist organizati


## Tokenization and Stopword Removal

Now we'll tokenize the cleaned text and remove stopwords. We'll:
1. Split text into individual tokens
2. Remove stopwords
3. Store the tokens in new columns

In [84]:
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\msabdelnasser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [86]:
def tokenize_and_remove_stopwords(text):
    """Tokenize text and remove stopwords using basic string splitting."""
    if not isinstance(text, str):
        return []

    # Simple tokenization by splitting on whitespace
    tokens = text.split()

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

# Apply tokenization to cleaned text
df['body_tokens'] = df['body_clean'].apply(tokenize_and_remove_stopwords)

print("Sample tokenization:")
print("\nCleaned body (first 100 chars):", df.iloc[0]['body_clean'][:100])
print("First 20 tokens:", df.iloc[0]['body_tokens'][:20])

Sample tokenization:

Cleaned body (first 100 chars): from mathew subject alt atheism faq atheist resources archive name atheism resources alt atheism arc
First 20 tokens: ['mathew', 'subject', 'alt', 'atheism', 'faq', 'atheist', 'resources', 'archive', 'name', 'atheism', 'resources', 'alt', 'atheism', 'archive', 'name', 'resources', 'last', 'modified', 'december', 'version']


## Lemmatization

Now we'll apply lemmatization to reduce words to their base form. For example:
- "running" → "run"
- "better" → "good"
- "was" → "be"

We'll use NLTK's WordNetLemmatizer, which provides more accurate results than simple stemming.

In [87]:
def lemmatize_tokens(tokens):
    """Apply lemmatization to a list of tokens."""
    if not isinstance(tokens, list):
        return []
    return [lemmatizer.lemmatize(token) for token in tokens]

# Apply lemmatization
df['body_lemmas'] = df['body_tokens'].apply(lemmatize_tokens)

# Show example
print("Lemmatization example:")
print("\nFirst 20 original tokens:", df.iloc[0]['body_tokens'][:20])
print("First 20 lemmatized tokens:", df.iloc[0]['body_lemmas'][:20])

Lemmatization example:

First 20 original tokens: ['mathew', 'subject', 'alt', 'atheism', 'faq', 'atheist', 'resources', 'archive', 'name', 'atheism', 'resources', 'alt', 'atheism', 'archive', 'name', 'resources', 'last', 'modified', 'december', 'version']
First 20 lemmatized tokens: ['mathew', 'subject', 'alt', 'atheism', 'faq', 'atheist', 'resource', 'archive', 'name', 'atheism', 'resource', 'alt', 'atheism', 'archive', 'name', 'resource', 'last', 'modified', 'december', 'version']


## Create Final Dataset

Now we'll prepare the final dataset with the columns we want to keep:
- document_id
- newsgroup
- subject
- body_clean
- tokens (from body)

We'll join the tokens back into strings for easier handling in the CSV file.

In [89]:
# Create final DataFrame with selected columns
final_df = df[['newsgroup', 'body_clean']].copy()

# Add tokens column (joined into string for CSV storage)
final_df['tokens'] = df['body_lemmas'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# Display info about the final dataset
print("Final dataset info:")
print("\nShape:", final_df.shape)
print("\nColumns:", final_df.columns.tolist())
print("\nSample row:")
print(final_df.iloc[0])

# Save to CSV
output_path = 'preprocessed_news.csv'
final_df.to_csv(output_path, index=False)
print(f"\nDataset saved to {output_path}")

# Display first few rows of the saved dataset
print("\nFirst few rows of the saved dataset:")
pd.read_csv(output_path).head()

Final dataset info:

Shape: (37662, 3)

Columns: ['newsgroup', 'body_clean', 'tokens']

Sample row:
newsgroup                                           alt.atheism
body_clean    from mathew subject alt atheism faq atheist re...
tokens        mathew subject alt atheism faq atheist resourc...
Name: 0, dtype: object

Dataset saved to preprocessed_news.csv

First few rows of the saved dataset:

Dataset saved to preprocessed_news.csv

First few rows of the saved dataset:


Unnamed: 0,newsgroup,body_clean,tokens
0,alt.atheism,from mathew subject alt atheism faq atheist re...,mathew subject alt atheism faq atheist resourc...
1,alt.atheism,alt atheism document_id from mathew subject al...,alt atheism document_id mathew subject alt ath...
2,alt.atheism,alt atheism document_id from mathew subject al...,alt atheism document_id mathew subject alt ath...
3,alt.atheism,alt atheism document_id from benedikt rosenau ...,alt atheism document_id benedikt rosenau subje...
4,alt.atheism,alt atheism document_id from mathew subject re...,alt atheism document_id mathew subject univers...
