In [1]:
# Imports
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [2]:
# Load data
# File locations: the raw input CSV plus the three CSVs written later on.
processed_path = '../../data/summarization/processed_train.csv'
train_path = '../../data/summarization/processed_train_split.csv'
val_path = '../../data/summarization/processed_val_split.csv'
data_path = '../../data/summarization/train.csv'
df = pd.read_csv(data_path)

# Sample up to 57,000 rows randomly. The request is capped at the number of
# rows actually loaded because DataFrame.sample (without replacement) raises
# ValueError when n exceeds the frame length.
sample_size = min(57000, len(df))
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

# Inspect a sample
print(df[['article', 'highlights']].iloc[0])

article       By . Mia De Graaf . Britons flocked to beaches...
highlights    People enjoyed temperatures of 17C at Brighton...
Name: 0, dtype: object


In [3]:
# Clean text function
def clean_text(text):
    """Normalize a piece of text to lowercase ASCII words and basic punctuation.

    Steps, in order: lowercase, remove URLs (``http(s)://...`` and ``www....``),
    replace every character other than ``a-z``, ``0-9``, whitespace, ``.``,
    ``,`` and ``'`` with a space, then collapse whitespace runs and trim.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``. Missing values (``None`` or a float NaN) are treated
            as empty text.

    Returns:
        str: The cleaned text; ``''`` when the input is missing or nothing
        survives the cleaning.
    """
    # Missing values must not be stringified: str(None) / str(nan) would give
    # the words "none" / "nan", which look like real, non-empty content.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).replace('\n', ' ').lower()
    # Strip URLs before the punctuation filter breaks them into fragments.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r"[^a-z0-9\s.,']", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Build a cleaned counterpart of each raw text column, article first.
for raw_col, clean_col in (
    ('article', 'clean_article'),
    ('highlights', 'clean_highlights'),
):
    df[clean_col] = df[raw_col].apply(clean_text)

In [4]:
# Keep only the rows where both cleaned text columns are non-empty
article_nonempty = df['clean_article'].str.len().gt(0)
highlights_nonempty = df['clean_highlights'].str.len().gt(0)
df = df[article_nonempty & highlights_nonempty]

print(f"Total samples after sampling & cleaning: {len(df)}")

Total samples after sampling & cleaning: 57000


In [5]:
# Write the id and cleaned text columns to processed_path (no index column)
output_columns = ['id', 'clean_article', 'clean_highlights']
cleaned_subset = df[output_columns]
cleaned_subset.to_csv(processed_path, index=False)
print(f"Cleaned sample dataset saved to: {processed_path}")

Cleaned sample dataset saved to: ../../data/summarization/processed_train.csv


In [6]:
# Hold out 15% of the cleaned rows for validation; the fixed seed keeps the
# partition the same from run to run.
split_columns = ['id', 'clean_article', 'clean_highlights']
train_df, val_df = train_test_split(df[split_columns], test_size=0.15, random_state=42)

# Write each partition to its own CSV, train first, without the index column.
for split_frame, split_path in ((train_df, train_path), (val_df, val_path)):
    split_frame.to_csv(split_path, index=False)

In [7]:
# Report the size of each partition, then signal completion
for message in (
    f"\nTraining samples: {len(train_df)}",
    f"Validation samples: {len(val_df)}",
    "\nPreprocessing complete.",
):
    print(message)


Training samples: 48450
Validation samples: 8550

Preprocessing complete.
