In [1]:
import pandas as pd
import re
from titlecase import titlecase
import spacy
from collections import Counter
from html import unescape

In [2]:
# load data
file_path = r"C:\NEU\7390AdvDS\assignment3\Jincheng_Jiang_002333686_code\AG_news\train.csv"
df = pd.read_csv(file_path)

print(df[["Title", "Description"]].head(1))

                                               Title  \
0  Wall St. Bears Claw Back Into the Black (Reuters)   

                                         Description  
0  Reuters - Short-sellers, Wall Street's dwindli...  


In [3]:
print(f"total numbers: {len(df)}")
print(f"column name: {df.columns.tolist()}")

print("\n=== first sample ===")
print(df[["Title", "Description"]].head(1))

print("\n=== lenth analysis ===")
df['title_len'] = df['Title'].apply(lambda x: len(str(x).split()))
df['desc_len'] = df['Description'].apply(lambda x: len(str(x).split()))

print(f"title average lenth: {df['title_len'].mean():.1f} ")
print(f"title max lenth: {df['title_len'].max()} ")
print(f"des average lenth: {df['desc_len'].mean():.1f} ")
print(f"des max lenth: {df['desc_len'].max()} ")

total numbers: 120000
column name: ['Class Index', 'Title', 'Description']

=== first sample ===
                                               Title  \
0  Wall St. Bears Claw Back Into the Black (Reuters)   

                                         Description  
0  Reuters - Short-sellers, Wall Street's dwindli...  

=== lenth analysis ===
title average lenth: 6.8 
title max lenth: 19 
des average lenth: 31.1 
des max lenth: 173 


# data cleaning

In [4]:
def clean_text(text):
    # remove HTML 
    text = unescape(str(text))
    text = re.sub(r'<\/?[a-zA-Z_]+\b[^>]*>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'&\w+;', '', text)
    text = re.sub(r'[<>]', '', text)

    text = text.replace('"', "'")
    # Keep basic punctuation (!?.,) and remove other special characters
    allowed_punct = r"!?.,:;'"
    text = re.sub(fr"[^\w\s{allowed_punct}]", '', text)
    # Merge consecutive spaces
    text = re.sub(r"\s+([!?,.])\s+", r" \1 ", text)
    text = re.sub(r"\s{2,}", " ", text).strip()
    return text

df['clean_title'] = df['Title'].apply(clean_text)
df['clean_desc'] = df['Description'].apply(clean_text)

In [5]:
def normalize_text(text, is_title=True):
    if is_title:
        # Step 1: Keep existing all-caps words
        text = re.sub(r'\b([A-Z]{2,})\b', r'<KEEP_CAPS>\1</KEEP_CAPS>', text)
    
        # Step 2: Apply title formatting
        text = titlecase(text.lower())
    
        # Step 3: Restore reserved all-caps words
        text = re.sub(r'<KEEP_CAPS>(.*?)</KEEP_CAPS>',
                      lambda m: m.group(1).upper(), text)
        return text
    
    else:
        # lowercase text
        return text.lower()

df['norm_title'] = df['clean_title'].apply(lambda x: normalize_text(x, is_title=True))
df['norm_desc'] = df['clean_desc'].apply(lambda x: normalize_text(x, is_title=False))

In [6]:
# Filtering empty values
print(f"nums of beginning: {len(df)}")
df = df.dropna(subset=['norm_title', 'norm_desc'])

# Filtering short text
df = df[df['norm_title'].str.split().str.len() >= 3]  
df = df[df['norm_desc'].str.split().str.len() >= 10]  
print(f"nums after filtering: {len(df)}")

nums of beginning: 120000
nums after filtering: 118395


# tokenize&vocab

In [7]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "lemmatizer"])

def spacy_tokenize(text):
    doc = nlp(text.lower())
    punct_set = {".", ",", "!", "?", ":", ";", "'"}
    tokens = []
    for token in doc:
        tok_text = token.text.strip()
        if not tok_text:
            continue

        # remove spaCy stop words
        if token.is_stop:
            continue

        # remove underscore
        if '_' in tok_text:
            continue

        # if it is a punctuation, keep it
        if tok_text in punct_set:
            tokens.append(tok_text)
            continue

        # length >= 2 and only contains lowercase letters and numbers
        if re.match(r'^[a-z0-9]+$', tok_text) and len(tok_text) >= 2:
            tokens.append(tok_text)

    return tokens

df['tokenized_title'] = df['norm_title'].apply(spacy_tokenize)
df['tokenized_desc'] = df['norm_desc'].apply(spacy_tokenize)

In [8]:
def build_optimized_vocab(df, max_vocab_size=150000, min_freq=3):
    # merge Tokens
    all_tokens = []
    chunksize = 10000  # Batch processing to prevent memory overflow

    for i in range(0, len(df), chunksize):
        chunk = df.iloc[i:i+chunksize]
        all_tokens.extend(chunk['tokenized_title'].sum())
        all_tokens.extend(chunk['tokenized_desc'].sum())

    # count tokens
    counter = Counter(all_tokens)

    # Filter low-frequency words
    filtered = [
        (word, cnt) for word, cnt in counter.items()
        if cnt >= min_freq
    ]

    # Sort by word frequency in descending order
    sorted_words = [word for word, _ in sorted(filtered, key=lambda x: (-x[1], x[0]))]

    special_tokens = ['<pad>', '<unk>', '<start>', '<end>']
    vocab = special_tokens + sorted_words[:max_vocab_size - len(special_tokens)]

    return {word: idx for idx, word in enumerate(vocab)}

# construct vocab
vocab = build_optimized_vocab(df, max_vocab_size=150000, min_freq=3)
print(f"Final Vocab Size: {len(vocab)}")

Final Vocab Size: 37943


In [13]:
df = df[df['tokenized_title'].apply(lambda x: len(x)) >= 3]
df = df[df['tokenized_desc'].apply(lambda x: len(x)) >= 10]

def text_to_sequence(tokens, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in ['<start>'] + tokens + ['<end>']]

df['title_seq'] = df['tokenized_title'].apply(lambda x: text_to_sequence(x, vocab))
df['desc_seq'] = df['tokenized_desc'].apply(lambda x: text_to_sequence(x, vocab))

In [14]:
processed_path = r"C:\NEU\7390AdvDS\assignment3\Jincheng_Jiang_002333686_code\AG_news\processed_data.parquet"
df[['norm_title', 'norm_desc', 'title_seq', 'desc_seq']].to_parquet(processed_path)

vocab_path = r"C:\NEU\7390AdvDS\assignment3\Jincheng_Jiang_002333686_code\AG_news\vocab.csv"
pd.DataFrame(vocab.items(), columns=['word', 'id']).to_csv(vocab_path, index=False)