In [None]:
import pandas as pd 
import numpy as np 

In [None]:
# Read the raw spam dataset (path is relative to the notebook's working directory)
raw_csv_path = '../data/raw/spam.csv'
df = pd.read_csv(raw_csv_path)
print("Data imported successfully.")

In [None]:
# Quick structural overview of the loaded frame.
print(f"Shape of the data: {df.shape}")

# Only the LAST expression of a cell is rendered automatically; a bare
# `df.head()` in the middle of the cell is evaluated and thrown away.
# Print it explicitly so the preview actually appears.
print(df.head())

# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() would add a stray "None" line after the report.
df.info()

# Summary statistics of the columns.
print(df.describe())

In [None]:
# Normalise the message text in a single chained pass:
#   1. lowercase everything
#   2. turn punctuation into spaces -- `[\W_]` rather than `\W`, because `\W`
#      treats "_" as a word character and would leave underscores in place
#   3. remove digits
#   4. collapse whitespace runs -- the two removals above leave multiple
#      consecutive spaces inside a message, and strip() alone only trims the ends
#   5. trim leading/trailing whitespace
df['Message'] = (
    df['Message']
    .str.lower()
    .str.replace(r'[\W_]+', ' ', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Preview the cleaned text
print(df.head())


In [None]:
# Flag every row that repeats an earlier row (first occurrence is not flagged)
is_duplicate = df.duplicated()
print("Number of duplicate rows:", is_duplicate.sum())

print("Shape before  removing duplicates:", df.shape)

# Keep only the rows that were not flagged
df = df[~is_duplicate]

print("Shape after removing duplicates:", df.shape)


In [None]:
# Class balance: number of rows per value of the Category column
label_counts = df["Category"].value_counts()
print("spam count vs ham count:", label_counts, sep="\n")

In [None]:
# Save processed dataset
from pathlib import Path

processed_path = Path('../data/processed/spam_processed.csv')

# to_csv does not create missing directories, so make sure the output folder
# exists first (no-op when it is already there).
processed_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the DataFrame index as an extra column
df.to_csv(processed_path, index=False)

In [None]:
# feature extraction
import sys
import os

# Put the parent of the current working directory on sys.path so the `src`
# package can be imported from this notebook. Guarded so that re-running the
# cell does not append the same entry again and again.
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.data_preprocessing import load_data, preprocess_text
from src.feature_extraction import get_bow_features
from sklearn.model_selection import train_test_split


# Run the project's preprocessing on the frame already in memory
# (load_data is imported above but not called in this cell).
df = preprocess_text(df)

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# Convert text to Bag of Words features; the helper also returns the vectorizer it used
X_train_vec, X_test_vec, vectorizer = get_bow_features(X_train, X_test)

print("Feature extraction completed.")
print("Number of features:", X_train_vec.shape[1])


In [None]:
import sys
import os

# Add the repo root directory (parent of the working directory) to sys.path so
# `src` is importable. Guarded so that re-running the cell does not pile up
# duplicate entries.
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.predict import load_model_and_vectorizer, predict_email

# Load model and vectorizer.
# NOTE: this rebinds the notebook-level name `vectorizer` to the loaded object.
model, vectorizer = load_model_and_vectorizer()

# Example messages to classify
emails = [
    "Congratulations! You've won a free iPhone. Click here!",
    "Hi John, please find attached the report for last week."
]

# Predict each email and print it next to its prediction
for email in emails:
    result = predict_email(email, model, vectorizer)
    print(f"Email: {email}\nPrediction: {result}\n")
