In [6]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import os


In [7]:
# Step 1 — Data intake & Basic EDA (tailored approach)

# Set style for visualizations
# NOTE(review): sns.set() applies seaborn's own theme after the ggplot style is
# selected, so it may override parts of 'ggplot' — confirm the intended look.
plt.style.use('ggplot')
sns.set(font_scale=1.2)

# Load the dataset (the path is relative to the notebook's working directory)
print("Loading the spam dataset...")
df = pd.read_csv('spam_dataset.csv')
print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")

# 1) Snapshot & Data Dictionary
print("\n=== DATASET SNAPSHOT ===")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() — doing so emitted a stray "None" line.
df.info()
print(df.head())


print("\n=== DATA DICTIONARY ===")
# Per-column summary: dtype, non-null / null counts, null share (in percent,
# rounded to 2 decimals) and number of distinct non-null values.
null_counts = df.isnull().sum()
null_share = (null_counts / len(df) * 100).round(2)
data_dict = pd.DataFrame({
    'Type': df.dtypes,
    'Non-Null Count': df.count(),
    'Null Count': null_counts,
    'Null %': null_share,
    'Unique Values': df.nunique(),
})
print(data_dict)

# A column named 'Unnamed: 0' is treated as a leftover technical index and is
# removed from the working frame when present.
technical_index_col = 'Unnamed: 0'
if technical_index_col in df.columns:
    print("\nDetected technical index column 'Unnamed: 0' - will be dropped")
    df = df.drop(columns=[technical_index_col])
    print("Column dropped successfully")

# Verify label values
print("\n=== LABEL VERIFICATION ===")

# Test column presence once, up front, so that every access below is guarded
# (previously the value counts were read unguarded, which made the later
# presence check unreachable whenever a column was actually missing).
has_label = 'label' in df.columns
has_label_num = 'label_num' in df.columns

if has_label:
    print("'label' column values:")
    print(df['label'].value_counts())
else:
    print("Column 'label' not found - skipping its value counts")

# Verify label_num values and consistency with label
if has_label_num:
    print("\n'label_num' column values:")
    print(df['label_num'].value_counts())
else:
    print("\nColumn 'label_num' not found - skipping its value counts")

# Check consistency between label and label_num
if has_label and has_label_num:
    # Create a crosstab to verify alignment
    label_consistency = pd.crosstab(df['label'], df['label_num'],
                                    rownames=['label'],
                                    colnames=['label_num'])
    print("\nConsistency check (label vs label_num):")
    print(label_consistency)

    # Verify that spam <=> 1 in BOTH directions: a row is inconsistent when it
    # is labelled 'spam' without label_num == 1, or has label_num == 1 without
    # being labelled 'spam'. Checking only the spam rows would miss the second
    # case and would be vacuously True when there are no spam rows at all.
    is_spam_label = df['label'] == 'spam'
    is_one_num = df['label_num'] == 1
    label_mismatch = is_spam_label != is_one_num
    spam_is_one = not label_mismatch.any()
    print(f"Spam corresponds to label_num=1: {spam_is_one}")

    if not spam_is_one:
        print("WARNING: Inconsistency between 'label' and 'label_num'!")
        print(f"Rows where the two columns disagree: {int(label_mismatch.sum())}")

# Text sanity checks: length distribution, very short messages, blank messages
print("\n=== TEXT CONTENT VERIFICATION ===")

# Character count of every message, stored as a new column on the frame
df['text_length'] = df['text'].str.len()
print("Text length statistics:")
print(df['text_length'].describe())

# Messages shorter than this many characters are reported as "very short"
short_threshold = 10
is_short = df['text_length'].lt(short_threshold)
short_texts = df.loc[is_short]
print(f"\nFound {len(short_texts)} very short texts (less than {short_threshold} characters):")
if len(short_texts) > 0:
    print(short_texts.loc[:, ['label', 'text']])

# Messages that are empty once surrounding whitespace is stripped
is_blank = df['text'].str.strip().eq('')
whitespace_texts = df.loc[is_blank]
print(f"\nFound {len(whitespace_texts)} whitespace-only texts:")
if len(whitespace_texts) > 0:
    print(whitespace_texts.loc[:, ['label', 'text']])

# Verify if texts contain "Subject:" pattern (case-insensitive).
# na=False makes rows with missing text count as "no Subject:", so the result
# is a clean boolean mask. Without it, a NaN in 'text' would propagate into the
# mask and break the boolean indexing and the `~` negation below.
has_subject = df['text'].str.contains('Subject:', case=False, regex=True, na=False)
print(f"\nTexts containing 'Subject:': {has_subject.sum()} out of {len(df)} ({has_subject.mean()*100:.2f}%)")

# Display one example (first 200 characters) with and without "Subject:".
# Rows whose text is missing are excluded so that slicing always gets a string.
text_present = df['text'].notna()
for example_mask, example_title in (
    (has_subject, "\nExample with 'Subject:':"),
    (~has_subject, "\nExample without 'Subject:':"),
):
    example_texts = df.loc[example_mask & text_present, 'text']
    if not example_texts.empty:
        print(example_title)
        print(example_texts.iloc[0][:200] + "...")

print("\n=== PRELIMINARY FINDINGS ===")

# Each value is computed right before it is printed, one finding per line.
print(f"- Total records: {len(df)}")

total_missing = df.isnull().sum().sum()
missing_display = total_missing if total_missing != 0 else 'None'
print(f"- Missing values: {missing_display}")

print(f"- Very short texts: {len(short_texts)}")
print(f"- Whitespace-only texts: {len(whitespace_texts)}")

class_counts = df['label'].value_counts().to_dict()
print(f"- Class distribution: {class_counts}")

# The consistency flag is only read when both label columns are present
labels_available = 'label' in df.columns and 'label_num' in df.columns
spam_display = spam_is_one if labels_available else 'N/A'
print(f"- Spam corresponds to label_num=1: {spam_display}")

Loading the spam dataset...
Dataset loaded successfully with 5171 rows and 4 columns.

=== DATASET SNAPSHOT ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB
None
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\nth...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\n( see a...   
2        3624   ham  Subject: neon retreat\nho ho ho , we ' re arou...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\nthis deal is to ...   

   label_num  
0          0  
1         