In [None]:
# Install the packages listed in ../requirements.txt (path is relative to the
# notebook's working directory).
# `%pip` is used rather than `!pip` so the install targets the running kernel's environment.
%pip install -r ../requirements.txt

In [None]:
# import libraries
import os
import pandas as pd

In [None]:
# Locate the CSV under <parent of working directory>/data and read it into a DataFrame.
# NOTE(review): a later cell writes its result to '../data/spam_processed.csv', which
# resolves to this same file — confirm the input here should not be the raw dataset.
parent_dir = os.path.dirname(os.getcwd())
path = os.path.join(parent_dir, 'data', 'spam_processed.csv')
df = pd.read_csv(path)
print("Data imported successfully.")

# Preview the first rows (rendered as the cell output)
df.head()

In [None]:
# Dimensions of the frame as (rows, columns)
print(f"Shape of the data: {df.shape}")

print("-" * 50)

# Column dtypes and non-null counts (this is where missing values show up).
# df.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

print("-" * 50)

# Summary statistics per column (for text columns pandas reports
# count / unique / top / freq)
print(df.describe())

In [None]:
# Count rows that are exact copies of an earlier row
duplicate_count = df.duplicated().sum()
print("Number of duplicate rows:", duplicate_count)

# Report the frame size on either side of the de-duplication
print("Shape before  removing duplicates:", df.shape)
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)

In [None]:
# Basic text preprocessing of the 'Message' column so the text is suitable for
# feature extraction and model training. Steps, in order:
#   1. lowercase everything
#   2. replace punctuation with a space ([\W_] because \W alone treats '_' as a
#      word character and would leave underscores behind)
#   3. remove digits
#   4. collapse runs of whitespace into a single space, then trim both ends
# Step 4 matters for de-duplication: steps 2 and 3 leave behind runs of spaces of
# varying length, so without collapsing them two messages that differ only in
# punctuation or numbers would still not compare equal.
cleaned_message = (
    df['Message']
    .str.lower()
    .str.replace(r'[\W_]', ' ', regex=True)
    .str.replace(r'\d', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# assign() returns a new frame instead of writing into `df`, which may itself be the
# result of a row filter (drop_duplicates) and could otherwise raise a
# SettingWithCopyWarning on column assignment.
df = df.assign(Message=cleaned_message)

# Removing punctuation and numbers reduces noise in the data, and it also lets the
# next de-duplication pass catch messages that differ only in case, punctuation,
# numbers or spacing.



In [None]:
# Text cleaning can make previously distinct rows identical, so check for
# duplicates a second time.
print("Number of duplicate rows after text preprocessing:", df.duplicated().sum())
print("Shape before removing duplicates:", df.shape)
# Reassign rather than use inplace=True: this matches the first de-duplication
# cell and avoids mutating the frame in place.
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)



In [None]:
# Show a preview of the cleaned frame followed by the per-label row counts.
# A single print call with sep="\n" puts each item on its own line.
print(
    df.head(),
    "-" * 50,
    "spam count vs ham count:",
    df["Category"].value_counts(),
    sep="\n",
)

In [None]:
# Write the current frame to CSV, leaving out the index column.
# NOTE(review): any message that is an empty string at this point is written as an
# empty field, which pd.read_csv parses as NaN by default — confirm that the
# downstream loading/training code handles that.
output_csv = '../data/spam_processed.csv'
df.to_csv(output_csv, index=False)

In [None]:
# Reload the processed CSV through the project's own loader for the next steps
import os
import sys

# Put the parent of the working directory on sys.path so the `src` package can be imported
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)
from src.data_preprocessing import load_data

# <parent of working directory>/data/spam_processed.csv
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
processed_csv_path = os.path.join(data_dir, 'spam_processed.csv')

df = load_data(processed_csv_path)


In [None]:
# Train a model using the preprocessed data.
# `train` is imported as a top-level module, so it has to be resolvable through
# sys.path (the previous cell appends the parent of the working directory).
# NOTE(review): confirm where train.py lives relative to that directory.
# Relies on `processed_csv_path` defined in the previous cell.
from train import train_model

train_model(processed_csv_path)

In [None]:
# Classify a few sample emails with the loaded model and vectorizer
import os
import sys

# Make the parent of the working directory importable so `src` resolves
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(repo_root)

from src.model import load_model_and_vectorizer, predict_email

# File names handed to the loader
model_name = "spam_classifier.pkl"
vectorizer_name = "vectorizer.pkl"
model, vectorizer = load_model_and_vectorizer(
    model_filename=model_name,
    vectorizer_filename=vectorizer_name,
)

# Sample inputs to classify
# NOTE(review): these are raw strings, while the training data was lowercased and
# stripped of punctuation/digits — confirm predict_email applies the same cleaning.
emails = [
    "Meeting rescheduled to 3 PM tomorrow. Please confirm your availability.",
    "You have won a free lottery! Click here to claim your prize.",
]

# Print each email next to the prediction returned for it
for email in emails:
    result = predict_email(email, model, vectorizer)
    print(f"Email: {email}\nPrediction: {result}\n")
