# Spam Email Detection for Indian Context

## Step 1: Import Required Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB



## Step 2: Load and Explore Data

In [3]:
spam_df = pd.read_csv("indian_email_dataset_100k.csv")

In [4]:
spam_df

Unnamed: 0,msg,label
0,Congratulations! You won iPhone 15 from Flipka...,spam
1,Can we reschedule to 3:30 instead?,ham
2,Reminder: Doctor appointment tomorrow at 10:30 AM,ham
3,BSNL bill overdue Rs 25000. Pay at 8912345678 ...,spam
4,Lose 10 lakhs kg in 30 days! Ayurvedic medicin...,spam
...,...,...
99995,Cancer cure ayurvedic medicine 8888777766,spam
99996,Performance review at 1:30 on Tuesday,ham
99997,Project deadline extended to next Sunday,ham
99998,Scholarship of Rs 12 lakhs approved. Register ...,spam


In [5]:
# inspect data
spam_df.groupby('label').describe()

Unnamed: 0_level_0,msg,msg,msg,msg
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,49936,4879,Hope you're doing well,329
spam,50064,21865,Your ATM card is blocked. Update KYC immediate...,59


## Step 3: Prepare Data for Training

In [6]:
spam_df['spam']=spam_df['label'].apply(lambda x: 1 if x == 'spam' else 0)

In [7]:
spam_df.head(10)

Unnamed: 0,msg,label,spam
0,Congratulations! You won iPhone 15 from Flipka...,spam,1
1,Can we reschedule to 3:30 instead?,ham,0
2,Reminder: Doctor appointment tomorrow at 10:30 AM,ham,0
3,BSNL bill overdue Rs 25000. Pay at 8912345678 ...,spam,1
4,Lose 10 lakhs kg in 30 days! Ayurvedic medicin...,spam,1
5,Your OTP for 4:30 is Pooja. Valid for 10 minutes.,ham,0
6,The 3:30 workshop starts at Amit tomorrow,ham,0
7,Meeting rescheduled to 3:30 PM tomorrow,ham,0
8,Paytm KYC pending. Complete at 50000 or accoun...,spam,1
9,FREE! Get instant loan upto Rs 5 lakhs without...,spam,1


## Step 4: Split Data into Training and Testing Sets

In [8]:
x_train, x_test, y_train, y_test = train_test_split(spam_df['msg'],spam_df['spam'])

## Step 5: Vectorize Text Data

In [9]:
# Creates a count of unique word
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [10]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], shape=(75000, 911))

## Step 6: Train the Model

In [11]:
# train model
model = MultinomialNB()
model.fit(x_train_count,y_train)

## Step 7: Test Model Accuracy

In [12]:
# test model
x_test_count = cv.transform(x_test)
accuracy = model.score(x_test_count,y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 99.70%


## Step 8: Manual Testing Examples
* if prediction is **1** that means **Spam** ‚ö†Ô∏è
* if prediction is **0** that means **Ham** (Not Spam) ‚úÖ

In [13]:
# Test 1: Spam example
email_test = ["Your property tax Rs 2 lakhs overdue. Pay at 8765432109"]
email_test_count = cv.transform(email_test)
prediction = model.predict(email_test_count)[0]
print(f"Email: {email_test[0]}")
print(f"Prediction: {prediction} ({'SPAM' if prediction == 1 else 'HAM'})")

Email: Your property tax Rs 2 lakhs overdue. Pay at 8765432109
Prediction: 1 (SPAM)


In [14]:
# Test 2: Ham example
email_test = ["I HAVE A DATE ON SUNDAY WITH WILL!!"]
email_test_count = cv.transform(email_test)
prediction = model.predict(email_test_count)[0]
print(f"Email: {email_test[0]}")
print(f"Prediction: {prediction} ({'SPAM' if prediction == 1 else 'HAM'})")

Email: I HAVE A DATE ON SUNDAY WITH WILL!!
Prediction: 0 (HAM)


---
## Gradio Interface (Professional & Shareable)

In [15]:
# Uncomment to use Gradio (install first: pip install gradio)
import gradio as gr

def predict_spam(email_text):
    if not email_text.strip():
        return "Please enter an email to check!", 0.0, ""
    
    # Transform and predict
    email_count = cv.transform([email_text])
    prediction = model.predict(email_count)[0]
    probability = model.predict_proba(email_count)[0]
    
    if prediction == 1:
        result = "üö´ SPAM DETECTED!"
        confidence = probability[1] * 100
        explanation = "This email appears to be spam. Be cautious and don't click any links or provide personal information."
    else:
        result = "‚úÖ LEGITIMATE EMAIL (HAM)"
        confidence = probability[0] * 100
        explanation = "This email appears to be legitimate and safe."
    
    return result, confidence, explanation

# Create Gradio interface
demo = gr.Interface(
    fn=predict_spam,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter your email text here...",
        label="Email Message"
    ),
    outputs=[
        gr.Textbox(label="Prediction Result"),
        gr.Number(label="Confidence (%)"),
        gr.Textbox(label="Explanation")
    ],
    title="Spam Email Detector",
    description="Enter an email message to check if it's spam or legitimate (ham).",
    examples=[
        ["FREE! Get instant loan upto Rs 1 lakh without documents. Call 8912345678"],
        ["Hey, can we reschedule our meeting to 3:30 instead?"],
        ["Your property tax Rs 2 lakhs overdue. Pay at 8765432109"],
        ["Reminder: Doctor appointment tomorrow at 10:30 AM"]
    ],
    theme=gr.themes.Soft()
)

# Launch the interface
demo.launch(share=False, inline=True)


  super().__init__(


* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


