In [1]:
# Imports
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [2]:
# Input: the raw summarization training set.
data_path = '../../data/summarization/train.csv'

# Outputs: the full cleaned set, plus its train / validation partitions.
output_cleaned = '../../data/summarization/processed_train.csv'
output_train = '../../data/summarization/processed_train_split.csv'
output_val = '../../data/summarization/processed_val_split.csv'

df = pd.read_csv(filepath_or_buffer=data_path)

# Sanity-check the load by showing the two text fields of the first row.
print(df.loc[:, ['article', 'highlights']].iloc[0])

article       By . Associated Press . PUBLISHED: . 14:11 EST...
highlights    Bishop John Folda, of North Dakota, is taking ...
Name: 0, dtype: object


In [3]:
# Clean text function
def clean_text(text):
    """Normalize one raw text field into lowercase, whitespace-collapsed text.

    Processing order:
      1. Missing values (``None`` or a float NaN) short-circuit to ``''``.
      2. The value is converted with ``str()``, newlines become spaces and
         everything is lowercased.
      3. ``http(s)://...`` and ``www....`` tokens are removed.
      4. Every character other than ``a-z``, ``0-9``, whitespace, ``.``,
         ``,`` and ``'`` is replaced by a space (this includes non-ASCII
         letters and all other punctuation).
      5. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: The raw value to clean. Non-string values are stringified.

    Returns:
        str: The cleaned text; ``''`` when the input is missing or nothing
        survives the cleaning.
    """
    # Without this guard str(None) / str(float('nan')) would produce the
    # literal words "none" / "nan", i.e. a non-empty "text" that survives any
    # later length-based filtering. `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).replace('\n', ' ').lower()
    # Strip URLs before the character filter, while '/' and ':' still exist.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r"[^a-z0-9\s.,']", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner element-wise over each raw text column and store the
# results next to the originals.
df['clean_article'] = df['article'].map(clean_text)
df['clean_highlights'] = df['highlights'].map(clean_text)

In [4]:
# Keep only the rows where both cleaned fields still contain at least one
# character after cleaning.
df = df.loc[
    df['clean_article'].str.len().gt(0)
    & df['clean_highlights'].str.len().gt(0)
]

print(f"Total cleaned samples: {len(df)}")

Total cleaned samples: 287113


In [5]:
# Persist the id and the two cleaned text columns (no index column).
df.to_csv(
    output_cleaned,
    columns=['id', 'clean_article', 'clean_highlights'],
    index=False,
)

print(f"Cleaned dataset saved to: {output_cleaned}")

Cleaned dataset saved to: ../../data/summarization/processed_train.csv


In [6]:
# Hold out 15% of the cleaned rows for validation; the fixed random_state
# makes the shuffle, and therefore the partition, repeatable.
train_df, val_df = train_test_split(
    df.loc[:, ['id', 'clean_article', 'clean_highlights']],
    random_state=42,
    test_size=0.15,
)

# Write each partition to its own CSV, without the index column.
train_df.to_csv(output_train, index=False)
val_df.to_csv(output_val, index=False)

In [8]:
# Report the size of each partition, then a completion message.
print(
    f"\nTraining samples: {len(train_df)}",
    f"Validation samples: {len(val_df)}",
    "\nData cleaning and splitting complete!",
    sep="\n",
)


Training samples: 244046
Validation samples: 43067

Data cleaning and splitting complete!
