In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(file_path):
    """
    Loads the SMSSpamCollection dataset from a tab-separated file.

    Each record holds a label ('ham' or 'spam'), a tab, and the raw message text.
    The file has no header row, so column names are assigned here.

    Args:
        file_path: Path (or anything accepted by ``pandas.read_csv``) of the dataset file.

    Returns:
        A DataFrame with two columns: 'label' and 'message'.
    """
    import csv  # local import: only needed for the quoting constant below

    # Messages are free text and often contain stray double quotes. With the
    # default quoting mode an opening quote swallows the following tabs and
    # newlines until the next quote, silently merging several messages into a
    # single row. QUOTE_NONE makes the parser treat quotes as ordinary text.
    df = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=['label', 'message'],
        quoting=csv.QUOTE_NONE,
    )
    return df

def preprocess_data(df):
    """
    Trims leading and trailing whitespace from the 'label' and 'message' columns.

    The columns are overwritten on the DataFrame that was passed in, and that
    same DataFrame object is returned.
    """
    for column in ('label', 'message'):
        stripped = df[column].str.strip()
        df[column] = stripped
    return df

def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Produces stratified train, validation, and test subsets of ``df``.

    The split is done in two passes, each stratified on the 'label' column and
    seeded with ``random_state``: first ``train_size`` of the rows are taken
    for training, then the remaining rows are divided between validation and
    test in the ratio ``val_size : test_size``.

    Returns a ``(train, val, test)`` tuple.
    """
    # Share of the non-training remainder that goes to the test set.
    test_share_of_remainder = test_size / (val_size + test_size)

    train_part, remainder = train_test_split(
        df,
        train_size=train_size,
        random_state=random_state,
        stratify=df['label'],
    )
    val_part, test_part = train_test_split(
        remainder,
        test_size=test_share_of_remainder,
        random_state=random_state,
        stratify=remainder['label'],
    )
    return train_part, val_part, test_part

def save_splits(train, val, test, train_path="train.csv", val_path="validation.csv", test_path="test.csv"):
    """
    Writes the three splits to CSV, one file per split, without the index column.

    Files are written in the order train, validation, test.
    """
    outputs = (
        (train, train_path),
        (val, val_path),
        (test, test_path),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)