<a href="https://colab.research.google.com/github/NeSma237/-Email_Spam_Detecto/blob/main/Email_Spam_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install streamlit



In [None]:
# Import necessary libraries
import gzip
import re
import shutil
import string

import joblib
import nltk
import numpy as np
import pandas as pd
import sklearn
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Download NLTK resources
# 'stopwords' backs stopwords.words('english') and 'wordnet' backs the
# WordNetLemmatizer, both used in preprocess_message. 'omw-1.4' is presumably
# required by the WordNet lemmatizer in this NLTK version — TODO confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Define function to clean message text
def clean_message(message):
    """Normalize a raw message into lowercase text without punctuation or digits.

    Parameters
    ----------
    message : any
        Value to clean; it is converted with ``str()`` first, so non-string
        inputs (including ``None``) are accepted.

    Returns
    -------
    str
        The lowercased text with ASCII punctuation and digit runs deleted,
        every whitespace run collapsed to one space, and no leading or
        trailing whitespace.
    """
    text = str(message).lower()

    # Applied in order: punctuation and digits are deleted outright (no space is
    # inserted in their place), then whitespace runs are squeezed to one space.
    substitutions = (
        (f"[{re.escape(string.punctuation)}]", ""),
        (r"\d+", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [None]:
# Define function to preprocess message text
# The lemmatizer and the stop-word set are shared between calls: the function is
# applied to every row of the dataset, so building them per message is wasted work.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both objects before touching the cache so a failure (for example
        # a missing NLTK corpus) never leaves it half-filled.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _PREPROCESS_CACHE['lemmatizer'], _PREPROCESS_CACHE['stop_words']


def preprocess_message(message):
    """Clean a message, drop English stop words and lemmatize what remains.

    Parameters
    ----------
    message : any
        Raw message; it is handed to ``clean_message``, which converts it
        with ``str()``.

    Returns
    -------
    str
        The lemmatized, non-stop-word tokens joined by single spaces.
    """
    lemmatizer, stop_words = _get_preprocess_resources()

    tokens = clean_message(message).split()
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

In [None]:
# Define function to load data from CSV file
def load_data(path):
    """Read the CSV at ``path`` and discard rows that have no ``Message`` value.

    Parameters
    ----------
    path
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without the rows whose ``Message`` is missing.
        Surviving rows keep their original index labels.
    """
    frame = pd.read_csv(path)
    # Drop on the freshly read frame itself; nothing else references it yet.
    frame.dropna(subset=['Message'], inplace=True)
    return frame

In [None]:
# Define function to preprocess the dataset
def preprocess_dataset(data):
    """Add a ``clean_message`` column holding the preprocessed ``Message`` text.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Message`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now carrying the ``clean_message`` column.
    """
    cleaned_text = data['Message'].apply(preprocess_message)
    data['clean_message'] = cleaned_text
    return data

In [None]:
# Define function to vectorize messages using TF-IDF
def vectorize_message(data):
    """Fit a TF-IDF vectorizer on ``clean_message`` and return features and labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the ``clean_message`` text column and the
        ``Spam/Ham`` label column.

    Returns
    -------
    tuple
        ``(features, labels, tfidf)``: the matrix produced by
        ``fit_transform``, the ``Spam/Ham`` column, and the fitted vectorizer.
    """
    # NOTE(review): the vectorizer is fitted on every row passed in. In this
    # notebook that happens before train_test_split, so the held-out messages
    # also shape the vocabulary and IDF weights.
    tfidf = TfidfVectorizer(max_features=5000)  # keep at most 5000 terms
    features = tfidf.fit_transform(data['clean_message'])
    labels = data['Spam/Ham']
    return features, labels, tfidf


In [None]:
# Define function to save the cleaned data to a CSV file
def save_clean_data(X, y, output_path):
    """Write the feature matrix and its labels to ``output_path`` as one CSV.

    Parameters
    ----------
    X
        Feature matrix exposing ``toarray()``; it is converted to a dense
        array before writing.
    y
        Labels exposing ``.values``; stored in a ``Spam/Ham`` column after
        the feature columns.
    output_path
        Destination handed to ``DataFrame.to_csv``.

    Notes
    -----
    Feature columns are labelled with their integer position and the row
    index is not written.
    """
    feature_table = pd.DataFrame(X.toarray())
    # .values attaches the labels by position, ignoring the index of ``y``.
    feature_table['Spam/Ham'] = y.values
    feature_table.to_csv(output_path, index=False)

In [None]:
# Colab-only step: open the browser upload dialog (presumably for
# enron_spam_data.csv, which the main block reads — confirm).
# `uploaded` is not referenced again in the cells visible here.
from google.colab import files
uploaded = files.upload()


In [None]:
# Main execution block
if __name__ == "__main__":
    # Where the raw dataset is read from and where the feature table is written
    input_path = "enron_spam_data.csv"
    output_path = "processed_data.csv"

    # Load the raw messages and add the cleaned-text column in one step
    data = preprocess_dataset(load_data(input_path))

    # Build the TF-IDF features, then persist them together with the labels.
    # X, y and vectorizer are reused by the training and export cells below.
    X, y, vectorizer = vectorize_message(data)
    save_clean_data(X, y, output_path)

    print("Data preprocessing completed. Saved to:", output_path)

Data preprocessing completed. Saved to: processed_data.csv


In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a Random Forest classifier
# 100 trees, with random_state fixed so repeated runs on the same split are
# reproducible. The fitted estimator is kept in `model` for the evaluation and
# export cells.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Score the held-out messages and compare the predictions with the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# 'spam' is the positive class for the three class-sensitive metrics
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label='spam')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the evaluation metrics, one per line
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(label, value)

Accuracy: 0.9850052481631429
Precision: 0.9801186943620178
Recall: 0.9901079136690647
F1-score: 0.9850879809126155


In [None]:
# Persist the trained classifier and the fitted TF-IDF vectorizer so they can be
# reloaded together later (joblib is imported with the other libraries at the
# top of the notebook).
joblib.dump(model, "spam_classifier_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [None]:
# Create app.py, one of the files offered for download further down.
# TODO: the text written here is still a placeholder (Arabic for "copy the code
# above here"). Replace it with the real Streamlit app source, otherwise the
# generated app.py is not valid Python.
# The encoding is given explicitly because the content is non-ASCII and the
# default text encoding depends on the platform locale.
with open("app.py", "w", encoding="utf-8") as f:
    f.write("""<انسخ الكود اللي فوق هنا>""")


In [None]:
# Write the dependency list shipped alongside the pickled model.
# scikit-learn is pinned to the version used for training: a pickled estimator
# is not guaranteed to load (or behave identically) under another release.
requirements = [
    "streamlit",
    f"scikit-learn=={sklearn.__version__}",
    "pandas",
    "nltk",
    "joblib",
]
with open("requirements.txt", "w") as f:
    # One requirement per line, ending with a newline like a normal text file.
    f.write("\n".join(requirements) + "\n")


In [None]:
from google.colab import files

# Hand each generated artifact to the browser as a download, in this order
for artifact_name in (
    "spam_classifier_model.pkl",
    "tfidf_vectorizer.pkl",
    "app.py",
    "requirements.txt",
):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write a gzip-compressed copy of the pickled classifier next to the original
# (gzip and shutil are imported with the other libraries at the top of the
# notebook). copyfileobj copies between the two open file objects, so the
# pickle is not loaded through joblib again.
with open("spam_classifier_model.pkl", 'rb') as f_in, \
        gzip.open("spam_classifier_model.pkl.gz", 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)


In [None]:
# Download the gzip-compressed classifier file (spam_classifier_model.pkl.gz).
# `files` was already imported in earlier cells; importing it again is harmless
# and keeps this cell runnable on its own.
from google.colab import files
files.download("spam_classifier_model.pkl.gz")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>