In [110]:
import pandas as pd
import numpy as np
import re
import joblib
import streamlit as st
import nltk  # Import NLTK
from nltk.corpus import stopwords  # Import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [111]:
df = pd.read_csv('spam.csv', encoding='latin-1')

In [112]:
df

Unnamed: 0,v1,v2,v3,v4,v5
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [113]:
df.drop(columns=['v3', 'v4', 'v5'], inplace=True)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [114]:

# Data preprocessing
df['v2'] = data['v2'].astype(str)  # Convert to string
df['v2'] = data['v2'].apply(lambda x: re.sub(r'<.*?>', '', x))  # Remove HTML tags
df['v2'] = data['v2'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))  # Remove special characters
df['v2'] = data['v2'].str.lower()  # Convert text to lowercase


In [115]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [116]:
def tokenize_and_remove_stopwords(text):
    if isinstance(text, list):  
        text = ' '.join(text)  
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['v2'] = df['v2'].apply(tokenize_and_remove_stopwords)
df['v2']

0       go jurong point , crazy .. available bugis n g...
1                         ok lar ... joking wif u oni ...
2       free entry 2 wkly comp win fa cup final tkts 2...
3             u dun say early hor ... u c already say ...
4            nah n't think goes usf , lives around though
                              ...                        
5567    2nd time tried 2 contact u. u å£750 pound priz...
5568                       ì_ b going esplanade fr home ?
5569                    pity , * mood . ... suggestions ?
5570    guy bitching acted like 'd interested buying s...
5571                                     rofl . true name
Name: v2, Length: 5572, dtype: object

In [117]:
label_encoder = LabelEncoder()
df['v1'] = label_encoder.fit_transform(df['v1'])


In [118]:
# Split the data into training and testing sets
X = df['v2']
y = df['v1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [119]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=8000) 

In [120]:
# Transform the email text into TF-IDF feature vectors
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [121]:
from sklearn.ensemble import RandomForestClassifier
#initialize random forest
random_forest_model = RandomForestClassifier()
#train random forest
random_forest_model.fit(X_train_tfidf, y_train)



In [122]:
# Save the TF-IDF vectorizer and the trained model
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(random_forest_model, 'spam_model.pkl')


['spam_model.pkl']

In [123]:
# Streamlit app for email classification
st.title("Email Spam Detection App")


email_text = st.text_area("Enter an email:")

In [124]:
# Create a function to classify the email
def classify_email(email_text):
    # Preprocess the input email text
    email_text = re.sub(r'<.*?>', '', email_text)
    email_text = re.sub(r'[^a-zA-Z0-9\s]', '', email_text)
    email_text = email_text.lower()
    email_text = tokenize_and_remove_stopwords(email_text)

    # Transform the email text using the saved TF-IDF vectorizer
    email_text_tfidf = tfidf_vectorizer.transform([email_text])

    # Predict if the email is spam or not
    prediction = random_forest_model.predict(email_text_tfidf)

    return prediction

In [125]:
# Add a button to classify the email
if st.button("Classify"):
    if email_text:
        prediction = classify_email(email_text)
        if prediction == 1:
            st.write("This is a spam email.")
        else:
            st.write("This is not a spam email.")
    else:
        st.write("Please enter an email.")


In [129]:
df['v2'][3]

'u dun say early hor ... u c already say ...'

In [128]:
!streamlit run - test.py



Usage: streamlit run [OPTIONS] TARGET [ARGS]...
Try 'streamlit run --help' for help.

Error: Streamlit requires raw Python (.py) files, but the provided file has no extension.
For more information, please see https://docs.streamlit.io
