# SMS Spam Classification Analysis
This notebook demonstrates how to load and analyze SMS spam classification data using Python.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read CSV File with Pandas
Load the SMS spam dataset from `spam.csv` into a pandas DataFrame. Its basic structure is examined in the next section.

In [None]:
# Read the CSV file and normalise its column names.
# NOTE(review): the commonly distributed copy of this dataset names its columns
# 'v1' (ham/spam label) and 'v2' (message text) -- confirm against your file.
# The rest of this notebook indexes 'label' and 'message', so map the raw names
# onto those. rename() silently skips keys that are not present, so a file that
# already uses 'label'/'message' passes through unchanged.
df = pd.read_csv('spam.csv', encoding='latin-1').rename(
    columns={'v1': 'label', 'v2': 'message'}
)

# Fail here with an explicit message instead of a bare KeyError several cells later.
missing_columns = [col for col in ('label', 'message') if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"spam.csv is missing expected column(s) {missing_columns}; "
        f"found columns {list(df.columns)}"
    )
print("Dataset loaded successfully!")

## Basic Data Exploration
Let's examine the basic characteristics of our dataset.

In [None]:
# Preview the top of the table (rich HTML rendering via display)
print("First 5 rows of the dataset:")
top_rows = df.head(5)
display(top_rows)

# Report the (rows, columns) dimensions
dataset_shape = df.shape
print("\nDataset Shape:", dataset_shape)

# Column names, non-null counts and dtypes; info() writes straight to stdout
print("\nDataset Info:")
df.info()

# Number of null entries in each column
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

## Cleaning Text Data
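Clean the text messages by converting them to lowercase, removing every character that is not a letter or whitespace (digits and punctuation included), and collapsing runs of whitespace into single spaces.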

In [None]:
import re

def clean_text(text):
    """Normalise one message to lowercase letters separated by single spaces.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted (digits, punctuation, accented characters), and
    runs of whitespace are collapsed to one space with the ends trimmed.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example ``None`` or the
        float ``NaN`` that pandas uses for a missing cell) is treated as an
        empty message.

    Returns
    -------
    str
        The cleaned message; ``''`` when nothing remains or the input was
        not a string.
    """
    # Missing cells are not strings and have no .lower(); map them to an
    # empty message instead of raising AttributeError inside .apply().
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers. Characters are deleted rather
    # than replaced by a space, so "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace (split() with no argument also trims the ends)
    return ' '.join(text.split())

# Store a cleaned copy of every message in its own column
df['cleaned_message'] = df['message'].apply(clean_text)

# Put the first five raw messages next to their cleaned counterparts
before_after = pd.DataFrame({
    'Original': df['message'].iloc[:5],
    'Cleaned': df['cleaned_message'].iloc[:5],
})
print("Original vs Cleaned Messages:")
display(before_after)

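## Compare Spam vs Ham Messages
Analyze the characteristics of spam vs ham messages: how many messages fall under each label, and how the lengths of the cleaned messages differ between the two.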

In [None]:
# Tally how many messages carry each label
spam_counts = df['label'].value_counts()
print("Distribution of messages:")
print(spam_counts)

# Character count of each cleaned message
df['message_length'] = df['cleaned_message'].str.len()

# Box plot of cleaned-message length, one box per label
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='label', y='message_length', ax=ax)
ax.set_title('Message Length Distribution: Spam vs Ham')
plt.show()

# Summary statistics (count, mean, std, quartiles, min/max) of length per label
length_stats = df.groupby('label')['message_length'].describe()
print("\nMessage length statistics by category:")
print(length_stats)