In [1]:
import csv
import re
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

In [2]:
def load_data(file_path):
    """Load a header-less, tab-separated file of labelled messages.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            tab-separated file to read. Each row is expected to hold a label
            followed by the message text.

    Returns:
        A DataFrame with two columns, ``label`` and ``message``.
    """
    # Read from the path the caller asked for rather than a fixed filename.
    messages = pd.read_csv(file_path, sep='\t', names=["label", "message"])
    return messages


def preprocess_data(df):
    """Normalize the ``message`` column to lowercase, letters-only text.

    Each message is lowercased and reduced to the runs of ASCII letters that
    are delimited by word boundaries, re-joined with single spaces. Tokens
    that mix letters with digits or underscores (e.g. ``2nd``) contain no such
    run and are dropped entirely; punctuation splits words (``don't`` becomes
    ``don t``).

    The input frame is left untouched; a new DataFrame is returned so that
    re-running the calling cell always starts from the same raw data.

    Args:
        df: DataFrame with a string-valued ``message`` column. Non-string
            entries (such as NaN) are not handled and will raise.

    Returns:
        A new DataFrame with the same columns as ``df`` and the cleaned
        ``message`` column.
    """
    def _clean(text):
        # Keep only whole alphabetic words from the lowercased text.
        return ' '.join(re.findall(r'\b[a-zA-Z]+\b', text.lower()))

    # ``assign`` builds a new frame instead of writing into the caller's one.
    return df.assign(message=df['message'].apply(_clean))


def split_data(df, test_size=0.2, validation_size=0.1, random_state=42):
    """Split the data into stratified train, validation, and test sets.

    The test set is carved out of ``df`` first. The validation set is then
    carved out of the rows that remain, so ``validation_size`` is relative to
    the post-test remainder, not to the full dataset (with the defaults the
    result is roughly 72% train / 8% validation / 20% test). Both splits are
    stratified on the ``label`` column.

    Args:
        df: DataFrame containing a ``label`` column.
        test_size: Size of the test set, passed to ``train_test_split``.
        validation_size: Size of the validation set, taken from what is left
            after removing the test set; passed to ``train_test_split``.
        random_state: Seed used for both splits.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames.
    """
    remainder, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['label'],
    )
    train, validation = train_test_split(
        remainder,
        test_size=validation_size,
        random_state=random_state,
        stratify=remainder['label'],
    )
    return train, validation, test
    

def save_data_splits(train, validation, test, output_dir='.'):
    """Save the train, validation, and test splits as CSV files.

    Writes ``train.csv``, ``validation.csv`` and ``test.csv`` (without the
    index column) into ``output_dir``.

    Args:
        train: DataFrame holding the training split.
        validation: DataFrame holding the validation split.
        test: DataFrame holding the test split.
        output_dir: Directory to write into; created if it does not exist.
            Defaults to the current working directory.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    splits = (
        ('train.csv', train),
        ('validation.csv', validation),
        ('test.csv', test),
    )
    for filename, frame in splits:
        frame.to_csv(target_dir / filename, index=False)

In [3]:
# Load the tab-separated SMS dataset into a frame with `label` and `message`
# columns, then preview the first rows.
data = load_data('SMSSpamCollection')
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Clean the message text; `data` is rebound to the preprocessed frame from here on.
data = preprocess_data(data)
# Split into train/validation/test (stratified on `label`), then persist each
# split as a CSV file.
train, validation, test = split_data(data)
save_data_splits(train, validation, test)