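# Email Spam Classifier

This notebook trains a Multinomial Naive Bayes model on bag-of-words counts from the SMS Spam Collection dataset to label a message as spam or ham.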

## Step 1: Import Libraries and Load Data

In [2]:
# Importing necessary libraries for data analysis and machine learning
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fetch the spam CSV straight from GitHub, decoding it as latin-1
url = "https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv"
df = pd.read_csv(filepath_or_buffer=url, encoding="latin-1")

# Confirm the load, then preview the first ten rows as the cell output
print("Dataset loaded successfully.")
df.head(n=10)

Dataset loaded successfully.


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


## Step 2: Data Preprocessing

In [3]:
# Clean the raw frame in one chain so the cell can be re-run safely:
#   * drop(errors='ignore') is a no-op when the filler columns are already gone
#   * rename() maps by column name (not position) and ignores names that are absent
#   * assign() (re)creates the numeric target: spam -> 1, anything else -> 0
df = (
    df
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'Category', 'v2': 'Message'})
    .assign(Spam=lambda frame: frame['Category'].eq('spam').astype('int64'))
)

# Verifying the changes
print("Data cleaning completed.")
df.head()

Data cleaning completed.


Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Step 3: Split Data into Training and Testing Sets

In [4]:
# Features are the raw message text; the target is the binary spam flag
X, y = df["Message"], df["Spam"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition
print(f"Total emails: {len(df)}")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Total emails: 5572
Training set size: 4457
Testing set size: 1115


## Step 4: Feature Extraction

In [5]:
# Bag-of-words encoder: turns each message into a vector of token counts
v = CountVectorizer()

# Learn the vocabulary from the training messages only, then encode them
train_messages = X_train.values
X_train_count = v.fit_transform(train_messages)

print("Text data has been converted to numerical features.")

Text data has been converted to numerical features.


## Step 5: Model Training

In [6]:
# Create the Multinomial Naive Bayes classifier and fit it on the training
# counts in one step (fit() returns the fitted estimator)
model = MultinomialNB().fit(X_train_count, y_train)

print("Model training completed successfully.")

Model training completed successfully.


## Step 6: Model Evaluation

In [7]:
# Encode the held-out messages with the vocabulary already learned in training
X_test_count = v.transform(X_test)

# Predict a label for every held-out message
y_pred = model.predict(X_test_count)

# Overall accuracy, shown as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision, recall and F1
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Model Accuracy: 98.39%

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## Step 7: Real-time Prediction System

In [None]:
def predict_message(message_text, vectorizer=None, classifier=None):
    """Classify a single message as spam or ham.

    Args:
        message_text: Raw text of the message to classify.
        vectorizer: Fitted object exposing ``transform(list_of_messages)``.
            Defaults to the notebook-level ``v``.
        classifier: Fitted object exposing ``predict(features)``.
            Defaults to the notebook-level ``model``.

    Returns:
        ``"This is a SPAM message."`` when the predicted label is 1,
        otherwise ``"This is a HAM (Safe) message."``.
    """
    # Fall back to the objects fitted earlier in the notebook so existing
    # one-argument calls keep working unchanged.
    if vectorizer is None:
        vectorizer = v
    if classifier is None:
        classifier = model

    # transform() takes a collection of documents, so wrap the single message
    message_count = vectorizer.transform([message_text])

    # One input row in, one predicted label out
    prediction = classifier.predict(message_count)

    if prediction[0] == 1:
        return "This is a SPAM message."
    return "This is a HAM (Safe) message."

# Try the classifier on one obviously promotional message and one everyday message
for sample in (
    "Congratulations! You have won a $1000 gift card. Click here to claim.",
    "Hey, are we still meeting for lunch tomorrow?",
):
    print(predict_message(sample))

This is a SPAM message.
This is a HAM (Safe) message.
