## Creating CSV

In [None]:
# Mount Google Drive at /content/gdrive so later cells can read and write files
# under it. Requires the google.colab package to be importable.
from google.colab import drive
drive.mount('/content/gdrive')


In [None]:
# Load the reduced dataset CSV from the mounted Drive folder into `df`.
import pandas as pd
dataset_path = '/content/gdrive/My Drive/Keerthana/reduced_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Load `reduced_dataset.csv` from the current working directory into `df`.
# This rebinds `dataset_path` and `df`, replacing whatever an earlier cell loaded.
# NOTE(review): looks like the local-run alternative to the Drive-based load —
# confirm only one of the two load cells is meant to be executed.
import pandas as pd
dataset_path = 'reduced_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Expose the `cleaned_text` column under the name `Article`.
df = df.rename(columns={'cleaned_text': 'Article'})

In [None]:
# Rename `Unnamed: 0` to `ID` and drop the `Unnamed: 0.1` and `Clean_text` columns.
# The frame is rebound instead of mutated in place, and errors='ignore' makes the
# drop a no-op for columns that are already gone, so re-running this cell no
# longer raises KeyError.
df = (
    df.rename(columns={'Unnamed: 0': 'ID'})
      .drop(columns=['Unnamed: 0.1', 'Clean_text'], errors='ignore')
)

In [None]:
# Display the first row to inspect the columns after the rename/drop steps.
df.iloc[0]

In [None]:
# Write the tidied frame to the Drive folder, without the pandas index column.
df.to_csv('/content/gdrive/My Drive/Keerthana/data_reduced.csv', index=False)

In [None]:
# Write the tidied frame to the working directory, without the pandas index column.
# A later cell reads 'data_reduced.csv' back as the preprocessing input.
df.to_csv('data_reduced.csv', index=False)

## Preprocessing of articles for generation: `cleaned_text` keeps the meaning (light cleaning), `search_text` is fully cleaned for search

In [None]:
pip install contractions

In [None]:
pip install nltk

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import contractions
import html
from tqdm import tqdm
import logging
from typing import Optional

In [None]:
# Fetch the NLTK data packages used by the preprocessing cells below:
# tokenizer data (punkt / punkt_tab), the stop-word lists and WordNet.
# The last call's return value is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sunder\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sunder\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sunder\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Sunder\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
class StreamlinedNewsPreprocessor:
    """Two-stage text cleaner for news articles.

    ``basic_clean`` lowercases and normalises text while keeping stop words and
    sentence punctuation (``. , ! ?``); ``search_clean`` additionally removes
    English stop words and lemmatizes the remaining tokens.
    ``preprocess_dataframe`` applies both to one column of a DataFrame.
    """

    # NLTK resources requested on construction. 'punkt_tab' is requested
    # alongside 'punkt', as in the notebook's standalone download cell.
    _NLTK_PACKAGES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')

    def __init__(self):
        """
        Initialize the preprocessor with minimal required NLTK components
        """
        # The logger is set up first so download problems can be reported.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Download required NLTK data. This stays best-effort (the data may
        # already be installed), but a failure is logged instead of being
        # silently swallowed by a bare `except`.
        for package in self._NLTK_PACKAGES:
            try:
                nltk.download(package)
            except Exception as exc:
                self.logger.warning(
                    "Could not download NLTK package %r: %s", package, exc
                )

        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def basic_clean(self, text: str) -> str:
        """
        Basic cleaning for generation - preserves meaning and stop words

        Returns an empty string for any non-string input (e.g. NaN).
        """
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower().strip()

        # Decode HTML entities
        text = html.unescape(text)

        # Expand contractions
        text = contractions.fix(text)

        # Remove URLs ('http\S+' already covers https links)
        text = re.sub(r'http\S+|www\S+', '', text)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove special characters but keep sentence structure
        text = re.sub(r'[^\w\s.,!?]', ' ', text)

        # Remove multiple spaces and newlines
        text = ' '.join(text.split())

        return text

    def search_clean(self, text: str) -> str:
        """
        Aggressive cleaning for semantic search - removes stop words and lemmatizes

        Punctuation tokens kept by ``basic_clean`` (``. , ! ?``) are not
        filtered out here, so they remain in the returned string.
        """
        if not isinstance(text, str):
            return ""

        # Basic cleaning first
        text = self.basic_clean(text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stop words and lemmatize
        tokens = [self.lemmatizer.lemmatize(token.lower()) for token in tokens
                  if token.lower() not in self.stop_words]

        return ' '.join(tokens)

    def preprocess_dataframe(self,
                             df: pd.DataFrame,
                             text_column: str,
                             batch_size: int = 1000,
                             output_path: Optional[str] = None) -> pd.DataFrame:
        """
        Preprocess the dataframe keeping only essential columns for generation and search

        Parameters:
        -----------
        df : pd.DataFrame
            Input dataframe containing articles
        text_column : str
            Name of the column containing article text
        batch_size : int
            Size of batches for processing (must be >= 1)
        output_path : Optional[str]
            If provided, saves the processed dataframe to this path

        Returns:
        --------
        pd.DataFrame
            Processed dataframe with only cleaned_text and search_text columns.
            Rows whose cleaned or search text is empty are removed; an empty
            input yields an empty frame with both columns.

        Raises:
        -------
        ValueError
            If batch_size is smaller than 1
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        print("Starting preprocessing...")

        # Collect the per-batch frames and concatenate once at the end;
        # concatenating inside the loop re-copies all earlier rows each time.
        batch_frames = []
        for start in tqdm(range(0, len(df), batch_size)):
            batch = df[text_column].iloc[start:start + batch_size]
            batch_frames.append(pd.DataFrame({
                'cleaned_text': batch.apply(self.basic_clean),
                'search_text': batch.apply(self.search_clean),
            }))

        if batch_frames:
            processed_df = pd.concat(batch_frames, ignore_index=True)
        else:
            # Empty input: keep the expected columns so the filters below work.
            processed_df = pd.DataFrame(
                columns=['cleaned_text', 'search_text'], dtype=object
            )

        # Remove rows where either column is empty
        processed_df = processed_df.dropna(subset=['cleaned_text', 'search_text'])
        has_text = (
            (processed_df['cleaned_text'].str.strip() != '')
            & (processed_df['search_text'].str.strip() != '')
        )
        processed_df = processed_df[has_text]

        self.logger.info(f"Preprocessing completed! Shape: {processed_df.shape}")

        # Save if output path is provided
        if output_path:
            processed_df.to_csv(output_path, index=False)
            self.logger.info(f"Processed data saved to {output_path}")

        return processed_df

In [None]:
def verify_processed_data(df: pd.DataFrame, sample_size: int = 3,
                          random_state: Optional[int] = None) -> None:
    """
    Verify the processed data by printing sample comparisons

    Prints the first 200 characters of `cleaned_text` and `search_text` for a
    random sample of rows.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame with `cleaned_text` and `search_text` string columns
    sample_size : int
        Number of rows to show; capped at len(df)
    random_state : Optional[int]
        Seed forwarded to DataFrame.sample; pass a value to get the same
        sample on every run
    """
    # Only the cleaned and search texts are available in `df`, so the header
    # no longer advertises an "Original" column that was never printed.
    print("\nSample Comparisons (Cleaned vs Search):")
    samples = df.sample(n=min(sample_size, len(df)), random_state=random_state)

    separator = "=" * 80
    for cleaned, search in zip(samples['cleaned_text'], samples['search_text']):
        print("\n" + separator)
        print(f"Cleaned Text: {cleaned[:200]}...")
        print(f"Search Text: {search[:200]}...")
        print(separator)

In [None]:
if __name__ == "__main__":
    # Initialize preprocessor
    preprocessor = StreamlinedNewsPreprocessor()

    # Read data
    dataset_path = 'data_reduced.csv'
    df = pd.read_csv(dataset_path)

    # Preprocess data: cleans the 'Article' column and writes the result to CSV
    processed_df = preprocessor.preprocess_dataframe(
        df=df,
        text_column='Article',
        output_path='reduced_processed_news_essential.csv'
    )

    # Verify the processed data
    verify_processed_data(processed_df)

    print("\nProcessed DataFrame Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    processed_df.info()

    # Memory usage (deep=True includes the string contents), in MiB
    memory_usage = processed_df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nMemory Usage: {memory_usage:.2f} MB")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sunder\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sunder\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sunder\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Starting preprocessing...


100%|██████████| 165/165 [08:54<00:00,  3.24s/it]
INFO:__main__:Preprocessing completed! Shape: (164511, 2)
INFO:__main__:Processed data saved to reduced_processed_news_essential.csv



Sample Comparisons (Original vs Cleaned vs Search):

Cleaned Text: by . sara smyth . published . est december . . updated . est december . modernising once reliant on victorian buildings prison governors now have phones in every cell . prisoners in newly built jails ...
Search Text: . sara smyth . published . est december . . updated . est december . modernising reliant victorian building prison governor phone every cell . prisoner newly built jail phone computer terminal cell ab...

Cleaned Text: foreign portfolio investors have bought . billion around crore worth of indian equities in march so far the highest in months. india received the third largest inflow in asia after taiwan and south ko...
Search Text: foreign portfolio investor bought . billion around crore worth indian equity march far highest month . india received third largest inflow asia taiwan south korea march . earlier year fpis pulled . bi...

Cleaned Text: indias largest two wheeler manufacturer hero motocorp on thu

# Preprocessing according to BART generation

In [None]:
import pandas as pd
import re
from pathlib import Path
import json
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:


class NewsDataPreprocessor:
    """Turn an article/highlights CSV into JSON train/validation/test splits.

    Every usable row (columns ``article`` and ``highlights``) yields up to
    three ``{'input_text': ..., 'target_text': ...}`` examples that share the
    cleaned article as target: the full highlight, a short keyword prompt and
    a "<topic> news" prompt.
    """

    # (pattern, replacement) pairs applied in this order by clean_text.
    _CLEANUP_RULES = [
        (r'\s*\.\s*\|\s*\.\s*', ' '),  # Remove '. | .'
        (r'By\s*\.\s*[^.]*\.\s*PUBLISHED:\s*[^.]*\.\s*UPDATED:[^.]*\d{4}\s*', ''),  # Remove metadata
        (r'\s+', ' '),  # Collapse whitespace runs (this also covers newlines and tabs)
        (r'\s+\.', '.'),  # Fix spaces before periods
        (r'\s+,', ','),  # Fix spaces before commas
        # NOTE(review): the two rules below strip whitespace on BOTH sides of
        # every quote/apostrophe, so `said "hi" to` becomes `said"hi"to`.
        # Kept as-is to avoid changing the generated data; confirm intent.
        (r'\s*"\s*', '"'),  # Standardize quotes spacing
        (r'\s*\'\s*', "'"),  # Standardize apostrophes
        (r'\.+', '.'),  # Remove multiple periods
    ]

    def __init__(self, input_file, output_dir):
        """Remember the input CSV path and create `output_dir` if it is missing."""
        self.input_file = input_file
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def clean_text(self, text):
        """Basic text cleaning; returns "" for non-string input."""
        if not isinstance(text, str):
            return ""

        # Remove formatting artifacts and metadata
        for pattern, replacement in self._CLEANUP_RULES:
            text = re.sub(pattern, replacement, text)

        return text.strip()

    def clean_highlights(self, highlights):
        """Clean highlights specifically; returns "" for non-string input."""
        if not isinstance(highlights, str):
            return ""

        # Remove bullet points and merge lines. MULTILINE makes '^' match at
        # the start of every line, so each bullet is removed rather than only
        # the first one.
        highlights = re.sub(r'^\s*•\s*', '', highlights, flags=re.MULTILINE)
        highlights = ' '.join(highlights.split('\n'))
        highlights = self.clean_text(highlights)

        # Remove the trailing dot, then normalise every remaining period that
        # is followed by text to '. '
        highlights = re.sub(r'\s*\.\s*$', '', highlights)
        highlights = re.sub(r'\s*\.\s*(?=\S)', '. ', highlights)

        return highlights.strip()

    def extract_key_phrases(self, highlight):
        """Extract up to three key phrases from a highlight.

        Candidates are capitalised words followed by lower-cased words ending
        in essay/news/photo/report. They are de-duplicated in order of first
        appearance, so the same highlight always yields the same phrases
        (slicing an unordered set made the selection vary between runs).
        """
        # Main topics/entities: capitalised words
        candidates = re.findall(r'\b[A-Z][a-z]+\b', highlight)
        # Words ending in one of the listed suffixes
        candidates += re.findall(r'\b\w+(?:essay|news|photo|report)\b', highlight.lower())
        unique_in_order = list(dict.fromkeys(candidates))
        return ' '.join(unique_in_order[:3])

    def create_bart_format(self, row):
        """Create multiple training examples with varying input lengths.

        Returns None when the cleaned article is shorter than 50 characters,
        otherwise a (possibly empty) list of example dicts.
        """
        cleaned_article = self.clean_text(row['article'])
        full_highlight = self.clean_highlights(row['highlights'])

        if len(cleaned_article) < 50:  # Minimum article length
            return None

        training_examples = []

        # 1. Full highlight version
        if len(full_highlight) >= 10:
            training_examples.append({
                'input_text': full_highlight,
                'target_text': cleaned_article
            })

        # 2. Short keyword version
        keywords = self.extract_key_phrases(full_highlight)
        if len(keywords) >= 3:
            training_examples.append({
                'input_text': keywords,
                'target_text': cleaned_article
            })

        # 3. Topic-only version
        main_topic = ' '.join(re.findall(r'\b[A-Z][a-z]+\b', full_highlight)[:2])
        if main_topic:
            training_examples.append({
                'input_text': main_topic + ' news',
                'target_text': cleaned_article
            })

        return training_examples

    @staticmethod
    def _collect_examples(article_groups, indices, rng):
        """Flatten the example groups at `indices` into one list and shuffle it with `rng`."""
        examples = [example for i in indices for example in article_groups[i]]
        rng.shuffle(examples)
        return examples

    def process_and_split_data(self, test_size=0.1, val_size=0.1, random_state=42):
        """Process the data and split into train/val/test sets.

        The split is made per source article, not per generated example: all
        examples derived from one article share the same target text, so
        splitting individual examples would place the same article in the
        training set and in the evaluation sets. Examples inside each split
        are shuffled with `random_state`.

        Writes train.json, validation.json and test.json to the output
        directory and returns the total number of examples.

        Raises ValueError when no row produces a usable example.
        """
        import random  # local import: only needed for the per-split shuffle

        print("Reading data...")
        df = pd.read_csv(self.input_file)

        print("Preprocessing articles and highlights...")
        article_groups = []  # one list of examples per usable article
        for _, row in tqdm(df.iterrows(), total=len(df)):
            examples = self.create_bart_format(row)
            if examples:
                article_groups.append(examples)

        if not article_groups:
            raise ValueError("No usable examples were produced from the input file")

        # First split: separate test set (on article indices)
        train_val_idx, test_idx = train_test_split(
            list(range(len(article_groups))),
            test_size=test_size,
            random_state=random_state
        )

        # Second split: separate validation set from training set. The
        # fraction is rescaled because it applies to the remaining articles.
        val_adjusted_size = val_size / (1 - test_size)
        train_idx, val_idx = train_test_split(
            train_val_idx,
            test_size=val_adjusted_size,
            random_state=random_state
        )

        # Save splits
        rng = random.Random(random_state)
        splits = {
            'train': self._collect_examples(article_groups, train_idx, rng),
            'validation': self._collect_examples(article_groups, val_idx, rng),
            'test': self._collect_examples(article_groups, test_idx, rng)
        }

        for split_name, split_data in splits.items():
            output_file = self.output_dir / f'{split_name}.json'
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(split_data, f, indent=2, ensure_ascii=False)

            print(f"{split_name} set size: {len(split_data)}")

        return sum(len(group) for group in article_groups)

    def validate_splits(self):
        """Validate the created splits.

        Checks that each split file exists and that its first five entries
        contain non-empty `input_text` and `target_text`. Returns a list of
        human-readable issue strings (empty when nothing is wrong).
        """
        issues = []
        for split in ['train', 'validation', 'test']:
            file_path = self.output_dir / f'{split}.json'
            if not file_path.exists():
                issues.append(f"{split} file not found")
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Validate sample entries
            for idx, item in enumerate(data[:5]):
                if 'input_text' not in item or 'target_text' not in item:
                    issues.append(f"Missing required fields in {split} set, item {idx}")
                    # Skip the emptiness check: indexing a missing key would
                    # raise KeyError instead of reporting the issue.
                    continue
                if not item['input_text'] or not item['target_text']:
                    issues.append(f"Empty text in {split} set, item {idx}")

        return issues



In [None]:
# Usage example
def main():
    """Run the BART preprocessing end to end and report the validation outcome."""
    # Configure paths
    source_csv = 'train.csv'  # Update with your input file path
    target_dir = 'processed_data'  # Update with your desired output directory

    # Build the preprocessor, then create and save the three splits
    preprocessor = NewsDataPreprocessor(source_csv, target_dir)
    example_count = preprocessor.process_and_split_data()
    print(f"\nTotal processed examples: {example_count}")

    # Check the written split files and report what was found
    problems = preprocessor.validate_splits()
    if not problems:
        print("\nAll splits validated successfully!")
        return

    print("\nValidation issues found:")
    for problem in problems:
        print(f"- {problem}")


if __name__ == "__main__":
    main()

Reading data...
Preprocessing articles and highlights...


100%|██████████| 287113/287113 [02:53<00:00, 1656.82it/s]


train set size: 688864
validation set size: 86109
test set size: 86109

Total processed examples: 861082

All splits validated successfully!
