In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer #to covert text data to numeric data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score #to evaluate the model


In [2]:
# Mount Google Drive at /content/drive (Colab-only API); the dataset CSV is read from this mount.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the labelled mail dataset from the mounted Drive folder into a DataFrame.
raw_data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/spam mail detector/mail_data.csv'
)

In [4]:
# Preview the raw frame; pandas truncates the printed output to the first and last rows.
print(raw_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [5]:
# Replace missing values (NaN) with empty strings, producing a new frame so that
# raw_data itself is left untouched.
# Equivalent longer form: raw_data.where(pd.notnull(raw_data), '')
#   - pd.notnull(raw_data) is True for valid values and False for NaN
#   - .where(cond, '') keeps values where cond is True and substitutes '' elsewhere
mail_data = raw_data.fillna('')  # clearer and more concise than the where(...) form

In [6]:
# Show the first rows of the null-filled frame.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


LABEL ENCODING: SPAM = 0, HAM = 1

In [7]:
# Encode the labels in place in the 'Category' column: 'spam' -> 0, 'ham' -> 1.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [8]:
# Check the encoding: 'Category' should now show 0 (spam) or 1 (ham).
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


Separating the data into input text (x) and output labels (y)

In [9]:
# x holds the message text (model input); y holds the encoded label (model target).
x, y = mail_data['Message'], mail_data['Category']

In [10]:
# Inspect the input messages that will be vectorised.
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


TRAIN TEST SPLIT (80% training and 20% testing)

In [11]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

FEATURE EXTRACTION: CONVERT TEXT TO NUMERIC DATA

In [12]:
# TF-IDF vectoriser settings:
#   min_df=1             -> keep every term that occurs in at least one document
#   stop_words='english' -> drop common English stop words such as "the", "is", "and"
#   lowercase=True       -> convert all text to lowercase before tokenising
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# fit_transform(x_train): builds the vocabulary from the training messages AND
# converts them to numeric vectors.
# transform(x_test): only converts, reusing the vocabulary built above, so words
# that were never seen during fitting are ignored.
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

# Convert the labels (y_train and y_test) to integers.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

TRAINING THE ML MODEL

In [13]:
# Logistic-regression classifier created with scikit-learn's default settings.
model=LogisticRegression()

In [14]:
# Fit on the TF-IDF training matrix and the integer labels (0 = spam, 1 = ham).
model.fit(x_train_feature,y_train)

evaluating the trained model

In [15]:
# Score the model on the same messages it was trained on.
pre_train_data = model.predict(x_train_feature)
accuracy_train_data = accuracy_score(y_true=y_train, y_pred=pre_train_data)

In [16]:
# Accuracy on the data the model was fitted on; compare with the test accuracy to spot overfitting.
print("accuracy of training data : ",accuracy_train_data)

accuracy of training data :  0.9676912721561588


In [17]:
# Score the model on the held-out test messages.
pre_test_data = model.predict(x_test_feature)
accuracy_test_data = accuracy_score(y_true=y_test, y_pred=pre_test_data)

In [18]:
# Accuracy on the held-out 20% test split.
print("Test accuracy : ",accuracy_test_data)

Test accuracy :  0.9668161434977578


building a predictive system

In [19]:
# Classify one hand-written message end to end.
input_mail = ["HII my name is Jiya and it was nice meeting you joey"]

# Vectorise with the already-fitted TF-IDF vocabulary, then predict.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# prediction[0] is the label for the single input message: 1 means ham, otherwise spam.
print("HAM" if prediction[0] == 1 else "SPAM")

[1]
HAM


In [20]:
!pip install --quiet streamlit pyngrok scikit-learn joblib
!pip install --quiet gradio scikit-learn joblib


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
import joblib

# Save the trained model
# (written to the current working directory so the app cell can reload it).
joblib.dump(model, 'spam_model.pkl')

# Save the TF-IDF vectorizer
# The fitted vectorizer must be saved alongside the model: new text has to be
# transformed with the same vocabulary the model was trained on.
joblib.dump(feature_extraction, 'vectorizer.pkl')


['vectorizer.pkl']

In [22]:
import gradio as gr
import joblib

# Load your saved model and vectorizer
# Both files are the ones written with joblib.dump earlier in this notebook.
# joblib.load unpickles its input, so only load files from a trusted source.
# Note: this rebinds the notebook-level name `model` to the reloaded estimator.
model = joblib.load("spam_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")

# Prediction function
def predict_spam(text):
    """Classify a single message as spam or ham for the Gradio UI.

    Args:
        text: Raw message text from the textbox. ``None`` and blank or
            whitespace-only input are accepted.

    Returns:
        str: "✅ HAM (Not Spam)" when the model predicts label 1, "🚫 SPAM" for
        any other label, or a short prompt when no text was supplied.
    """
    # A blank message would be vectorised to an all-zero row and "classified"
    # purely from the model's intercept; ask for input instead of reporting a
    # meaningless verdict.
    if text is None or not str(text).strip():
        return "Please enter a message to classify."

    features = vectorizer.transform([str(text)])
    result = model.predict(features)[0]
    return "✅ HAM (Not Spam)" if result == 1 else "🚫 SPAM"

# Gradio interface: a five-line textbox feeding predict_spam, with plain-text output.
demo = gr.Interface(
    title="📧 Spam Mail Detector",
    description="This app classifies your email message as SPAM or HAM (Not Spam).",
    fn=predict_spam,
    inputs=gr.Textbox(label="Enter Email Text", lines=5),
    outputs="text",
)

# Start the web app; kept as the last statement so the cell shows the launch output.
demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b368a88e2e7d23033e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


