In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.39.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<6,>=2.1.5 (from streamlit)
  Downloading watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.39.0-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [3

In [3]:
# Location of the labelled mail dataset on the mounted Google Drive.
MAIL_DATA_CSV = '/content/drive/MyDrive/Colab Notebooks/Spam_mail_detection/mail_data.csv'

# Read the raw CSV into a DataFrame.
data = pd.read_csv(MAIL_DATA_CSV)

In [4]:
# Peek at the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Count the missing values in each column.
data.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [9]:
# Build a working copy in which every missing value is replaced by an empty
# string; `data` itself is not modified.
mail_data = data.where(data.notna(), other='')
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# (number of rows, number of columns) of the raw frame.
data.shape

(5572, 2)

Label Encoding

In [14]:
# Encode the target column in place: spam mail -> 0, ham mail -> 1.
for label, code in (('spam', 0), ('ham', 1)):
  mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [15]:
# x holds the message text (model input), y the encoded category (target).
x, y = mail_data['Message'], mail_data['Category']

In [19]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

Transforming the text data into feature vectors that can be used as input to the logistic regression model.

In [21]:
# TF-IDF vectorizer for the message text.
# min_df=1 means a term only has to appear in at least one training message to
# be kept, i.e. no term is discarded for being rare. English stop words are
# removed and the text is lower-cased before tokenising.
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Fit the vocabulary on the training split only, then apply that same fitted
# vectorizer to the test split.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# Cast the encoded labels to integers before they are passed to the classifier.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

Training the Model

In [22]:
# Create an unfitted logistic-regression classifier; no arguments are passed,
# so scikit-learn's default hyperparameters are used.
model = LogisticRegression()

In [35]:
# Fit the classifier on the TF-IDF training features. `fit` trains the
# estimator in place and returns that same object, so `trained_model` is just
# another name for `model`.
model.fit(x_train_features, y_train)
trained_model = model

**Evaluating the model**

In [36]:
# Score the model on the same data it was fitted on.
training_prediction = model.predict(x_train_features)
training_accuracy = accuracy_score(y_true=y_train, y_pred=training_prediction)
print("Accuracy on training data:- ", training_accuracy)

Accuracy on training data:-  0.9685887368184878


In [37]:
# Score the model on the held-out test split.
testing_prediction = model.predict(x_test_features)
testing_accuracy = accuracy_score(y_true=y_test, y_pred=testing_prediction)
print("Accuracy on testing data:- ", testing_accuracy)

Accuracy on testing data:-  0.9533632286995516


In [38]:
# Building a predictive system: classify one hand-picked message end to end.

# Example ham message
input_mail_1 = ["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."]

# Example spam message
input_mail_2 = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# Vectorise the message with the already-fitted TF-IDF vectorizer
input_data_extraction = feature_extraction.transform(input_mail_2)

# Predict its label (1 = ham, anything else is reported as spam)
prediction = model.predict(input_data_extraction)

print("Ham Mail" if prediction[0] == 1 else "Spam Mail")

Spam Mail


In [39]:
# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes, instead of leaving the
# handle open until it happens to be garbage-collected.
with open('Spam_mail_detection.sav', 'wb') as model_file:
  pickle.dump(trained_model, model_file)

In [72]:
# Persist the fitted TF-IDF vectorizer so the same vocabulary can be reused at
# prediction time. The context manager guarantees the file is flushed and
# closed as soon as the dump finishes.
with open('vectorizer.sav', 'wb') as vectorizer_file:
  pickle.dump(feature_extraction, vectorizer_file)

In [76]:
%%writefile app.py
import streamlit as st
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle


#Title
st.title('Spam Mail Detector')

#loading Model
model = pickle.load(open('/content/drive/MyDrive/Colab Notebooks/Spam_mail_detection/Spam_mail_detection.sav', 'rb'))
vectorizer = pickle.load(open('/content/drive/MyDrive/Colab Notebooks/Spam_mail_detection/vectorizer.sav', 'rb'))

#Input
Mail = st.text_input(
    'Enter the Mail:- ',
    ""
)

def prediction(Mail):
  #feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
  #x_train_features = feature_extraction.fit_transform(x_train)
  input_data_extraction = vectorizer.transform([Mail])
  prediction = model.predict(input_data_extraction)
  if prediction[0] == 1:
    return "Ham Mail"
  else:
    return "Spam Mail"


if st.button('Predict'):
  result = prediction(Mail)
  st.success(result)

Overwriting app.py


In [77]:
# Print this machine's public IPv4 address. Copy the printed value and paste
# it into the page opened from the 'your url is:' link generated by the cell below.
!wget -q -O - ipv4.icanhazip.com

34.148.30.210


In [None]:
# Start the Streamlit app in the background (&) and expose its port, 8501,
# through a localtunnel URL.
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.148.30.210:8501[0m
[0m
your url is: https://khaki-bushes-wish.loca.lt
