In [35]:
import pandas as pd
import re
from pathlib import Path
%matplotlib inline
import matplotlib.pyplot as plt

In [36]:
# Read the raw CSV (decoded as latin-1) into `df`, then show its shape and a preview
df = pd.read_csv(filepath_or_buffer="raw data.csv", encoding="latin-1")
print(f"Shape: {df.shape}", "First 5 rows:", df.head(5), sep="\n")

Shape: (5572, 5)
First 5 rows:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [37]:
# Keep only the two populated columns and give them descriptive names:
# v1 holds the spam/ham label, v2 holds the message text.
df = df.loc[:, ["v1", "v2"]].rename(columns={"v1": "Label", "v2": "Content"})
print(f"Shape: {df.shape}", "First 3 rows:", df.head(3), sep="\n")

Shape: (5572, 2)
First 3 rows:
  Label                                            Content
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...


In [38]:
# Lowercase Title-case words or mixed case, keep FULL UPPERCASE words unchanged
def filter_case(text):
    """Normalise the letter case of each alphabetic word in ``text``.

    Words written entirely in uppercase (e.g. ``"WINNER"``, ``"U"``) are kept
    as they are; every other word made only of ASCII letters is lowercased,
    whether it is Title-case (``"Free"``) or mixed case (``"FreeMsg"``,
    ``"iPhone"``). Tokens containing digits, underscores, or non-ASCII letters
    are not matched by the pattern and are left untouched.

    Args:
        text: The string to normalise. Non-string values (e.g. a missing
            value handed over by ``Series.apply``) are returned unchanged
            instead of raising ``TypeError``.

    Returns:
        The text with case-normalised words, or ``text`` itself when it is
        not a string.
    """
    if not isinstance(text, str):
        return text

    def replace(match):
        word = match.group(0)
        # str.lower() is a no-op on already-lowercase words, so only the
        # all-uppercase case needs to be special-cased.
        return word if word.isupper() else word.lower()

    # Match every ASCII-letter word regardless of its first letter, so that
    # mixed-case words starting in lowercase ("iPhone") are normalised too.
    return re.sub(r'\b[A-Za-z]+\b', replace, text)

# Preview the first ten raw messages before any cleaning is applied
print("Sample Content:", df["Content"].head(10), sep="\n")

Sample Content:
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: Content, dtype: str


In [39]:
# Normalise the message text: tag repeated punctuation, drop the remaining
# punctuation/special characters, collapse whitespace, then normalise case.
df["Content"] = (df["Content"]
    .astype(str)
    .str.replace(r'[!?.]{2,}', ' PUNCT ', regex=True)    # Multiple punctuation → PUNCT
    .str.replace(r'[!?.]', '', regex=True)               # Single punctuation → remove
    .str.replace(r'[^\w\s]', ' ', regex=True)             # All other special chars (incl. -) → space
    .str.replace(r'[_\-]', ' ', regex=True)              # _ counts as \w above, so replace it here
    .str.replace(r'\s+', ' ', regex=True)                # Collapse whitespace runs to one space
    .str.strip()                                         # Strip LAST: the steps above add edge spaces
    .apply(filter_case)                                  # Filter case
)
print("Sample Content:")
print(df["Content"].head(10))

Sample Content:
0    go until jurong point crazy PUNCT available on...
1                 ok lar PUNCT joking wif u oni PUNCT 
2    free entry in 2 a wkly comp to win FA cup fina...
3    U dun say so early hor PUNCT U c already then ...
4    nah I don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been 3 week s n...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    WINNER PUNCT as a valued network customer you ...
9    had your mobile 11 months or more U R entitled...
Name: Content, dtype: str


In [40]:
# Normalise the labels: cast to string, trim surrounding whitespace, lowercase
df["Label"] = (
    df["Label"]
    .astype(str)
    .str.strip()
    .str.lower()
)
print("Unique labels:", df["Label"].unique())

Unique labels: <StringArray>
['ham', 'spam']
Length: 2, dtype: str


In [None]:
# Remove any rows where Label or Content is empty/null
# dropna() only removes missing values. A message made up solely of single
# punctuation marks or special characters is reduced to "" or whitespace by
# the cleaning step, so blank strings have to be filtered out explicitly.
df = df.dropna(subset=["Label", "Content"])
df = df[(df["Label"].str.strip() != "") & (df["Content"].str.strip() != "")]

In [None]:
# Write the cleaned frame to data/spam_processed.csv without the index column,
# creating the output directory first if it does not exist yet
Path("data").mkdir(exist_ok=True)
df.to_csv(path_or_buf="data/spam_processed.csv", index=False)
