<a href="https://colab.research.google.com/github/Hirwabrian/Group19-Machine_Learning_Techniques_I/blob/main/Group19_collab_book.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import re
import string

def preprocess_ag_news(file_path: str) -> pd.DataFrame:
    """Load an AG News style CSV and return a cleaned, model-ready frame.

    The file is expected to have a header row followed by three columns,
    which are read as ``label``, ``title`` and ``description``. Title and
    description are joined into one ``text`` column, which is lowercased and
    stripped of literal ``\\n``/``\\b`` escape sequences, URLs, ASCII
    punctuation and redundant whitespace.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with the columns ``text`` (cleaned string), ``label``
        (the value read from the file) and ``class_name`` (the label mapped
        to its category name; NaN for labels outside 1-4).
    """
    # 1. Load data. header=0 consumes the file's own header row and `names`
    #    replaces it with the column names used below.
    df = pd.read_csv(file_path, header=0, names=['label', 'title', 'description'], engine='python')

    # 2. Map numeric labels to names for better visualization
    label_map = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}
    df['class_name'] = df['label'].map(label_map)

    # 3. Combine Title and Description. An empty field is read as NaN, and
    #    NaN + str is NaN, which would crash the string cleaning below, so
    #    missing fields are treated as empty strings instead.
    title = df['title'].fillna('').astype(str)
    description = df['description'].fillna('').astype(str)
    df['text'] = title + " " + description

    # Built once rather than once per row.
    escape_pattern = re.compile(r'\\[nb]')
    # `http\S+` already covers https URLs, so no separate alternative is needed.
    url_pattern = re.compile(r'http\S+|www\S+')
    punctuation_table = str.maketrans('', '', string.punctuation)

    def clean_text(text):
        """Normalize one document: lowercase, strip escapes/URLs/punctuation."""
        # Lowercase
        text = text.lower()
        # Replace literal backslash escapes like \n or \b with a space
        text = escape_pattern.sub(' ', text)
        # Remove URLs (must happen before punctuation removal mangles them)
        text = url_pattern.sub('', text)
        # Remove punctuation
        text = text.translate(punctuation_table)
        # Collapse runs of whitespace and trim the ends
        return ' '.join(text.split())

    print(f"Cleaning {file_path}...")
    df['text'] = df['text'].apply(clean_text)

    # Keep only what is necessary for the models
    return df[['text', 'label', 'class_name']]

# EXECUTION
# Clean both splits in order (train first, then test) from their hard-coded
# /content paths.
train_cleaned, test_cleaned = (
    preprocess_ag_news(raw_csv)
    for raw_csv in ('/content/train.csv', '/content/test.csv')
)

# Persist the cleaned splits as CSVs in the working directory, without the
# DataFrame index column.
train_cleaned.to_csv('ag_news_train_cleaned.csv', index=False)
test_cleaned.to_csv('ag_news_test_cleaned.csv', index=False)

# Quick sanity check: the first rows and the per-class row counts of the
# training split.
print("\nSample Output:", train_cleaned.head(), sep="\n")
print("\nClass Distribution:", train_cleaned['class_name'].value_counts(), sep="\n")

Cleaning /content/train.csv...
Cleaning /content/test.csv...

Sample Output:
                                                text  label class_name
0  wall st bears claw back into the black reuters...      3   Business
1  carlyle looks toward commercial aerospace reut...      3   Business
2  oil and economy cloud stocks outlook reuters r...      3   Business
3  iraq halts oil exports from main southern pipe...      3   Business
4  oil prices soar to alltime record posing new m...      3   Business

Class Distribution:
class_name
Business    30000
Sci/Tech    30000
Sports      30000
World       30000
Name: count, dtype: int64
