In [26]:
import re
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
import nltk
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
# Load the raw email dataset; later cells read its 'text' and 'spam' columns.
data = pd.read_csv(filepath_or_buffer='emails.csv')

In [17]:
# Build the English stop-word set used by the text-cleaning step.
# stopwords.words() raises LookupError when the NLTK 'stopwords' corpus is not
# installed locally, so fetch it on demand and retry instead of failing on a
# fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [18]:
def clean_text(text):
    """Normalize one email body for vectorization.

    Steps, in order:
      1. delete every character that is neither an ASCII letter nor whitespace,
      2. lowercase the result,
      3. drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (``''`` if none remain).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    kept_tokens = [
        token
        for token in letters_only.lower().split()
        if token not in stop_words
    ]
    return ' '.join(kept_tokens)

In [19]:
# Row count before dropping rows with NaN values
print(data.shape[0])

# Keep only the rows whose 'text' column is not NaN
data = data.dropna(subset=['text'])

# Row count after dropping rows with NaN values
print(data.shape[0])

5728
5728


In [20]:
# Replace every email body in the 'text' column with its cleaned form
data['text'] = data['text'].map(clean_text)

In [21]:
# Persist the cleaned dataset to disk, leaving the DataFrame index out of the file
data.to_csv(path_or_buf="cleaned_dataset.csv", index=False)

In [22]:
# Reload the cleaned dataset written by the previous cell
data = pd.read_csv('cleaned_dataset.csv')

# An email whose cleaned text is empty is stored as an empty CSV field and is
# read back as NaN, which TfidfVectorizer rejects as an invalid document.
# Restore the empty string so such rows vectorize to an all-zero row instead.
data['text'] = data['text'].fillna('')

# Split each class (spam == 0, then spam == 1) 80/20 on its own so the train
# and test sets both keep the class proportions of the full dataset.
train_parts = []
test_parts = []
for label in range(2):
    class_rows = data[data["spam"].isin([label])]
    class_train, class_test = train_test_split(class_rows, test_size=0.2, random_state=42)
    train_parts.append(class_train)
    test_parts.append(class_test)

# Concatenate once, rather than growing a DataFrame inside the loop
train_df = pd.concat(train_parts, ignore_index=True)
test_df = pd.concat(test_parts, ignore_index=True)

X_train = train_df['text']
X_test = test_df['text']
y_train = train_df['spam']
y_test = test_df['spam']

# Feature extraction using TF-IDF vectorization; the vocabulary is learned
# from the training text only and then reused for the test text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Check the shapes to ensure correct splits and transformations
print("X_train shape:", X_train_tfidf.shape)
print("X_test shape:", X_test_tfidf.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (4582, 5000)
X_test shape: (1146, 5000)
y_train shape: (4582,)
y_test shape: (1146,)


In [23]:
# Create a fresh TF-IDF vectorizer limited to 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary from the training text and vectorize it in one call
X_train_tfidf = vectorizer.fit_transform(X_train)

# Dense DataFrame view of the training features, one column per vocabulary term
train_feature_names = vectorizer.get_feature_names_out()
X_train_tfidf_df = pd.DataFrame(data=X_train_tfidf.toarray(), columns=train_feature_names)

print("\nTF-IDF Features DataFrame:")
print(X_train_tfidf_df.head())


TF-IDF Features DataFrame:
    aa   ab  abacus  abilities  ability      able  abroad  absence  \
0  0.0  0.0     0.0        0.0      0.0  0.029077     0.0      0.0   
1  0.0  0.0     0.0        0.0      0.0  0.000000     0.0      0.0   
2  0.0  0.0     0.0        0.0      0.0  0.062562     0.0      0.0   
3  0.0  0.0     0.0        0.0      0.0  0.000000     0.0      0.0   
4  0.0  0.0     0.0        0.0      0.0  0.000000     0.0      0.0   

   absolutely  abstract  ...  zadorozhny   ze      zero  zhang  zhendong  \
0         0.0       0.0  ...         0.0  0.0  0.000000    0.0       0.0   
1         0.0       0.0  ...         0.0  0.0  0.031637    0.0       0.0   
2         0.0       0.0  ...         0.0  0.0  0.000000    0.0       0.0   
3         0.0       0.0  ...         0.0  0.0  0.000000    0.0       0.0   
4         0.0       0.0  ...         0.0  0.0  0.000000    0.0       0.0   

   zimin  zip  ziplip  zipter  zone  
0    0.0  0.0     0.0     0.0   0.0  
1    0.0  0.0     

In [24]:
# Vectorize the test text with the vectorizer that was fitted on the training text
X_test_tfidf = vectorizer.transform(X_test)

# Dense DataFrame view of the test features, one column per vocabulary term
test_feature_names = vectorizer.get_feature_names_out()
X_test_tfidf_df = pd.DataFrame(data=X_test_tfidf.toarray(), columns=test_feature_names)

print("\nTF-IDF Features DataFrame:")
print(X_test_tfidf_df.head())

# TODO: tune max_features; the optimal number of features is not yet known


TF-IDF Features DataFrame:
    aa   ab  abacus  abilities  ability  able  abroad  absence  absolutely  \
0  0.0  0.0     0.0        0.0      0.0   0.0     0.0      0.0         0.0   
1  0.0  0.0     0.0        0.0      0.0   0.0     0.0      0.0         0.0   
2  0.0  0.0     0.0        0.0      0.0   0.0     0.0      0.0         0.0   
3  0.0  0.0     0.0        0.0      0.0   0.0     0.0      0.0         0.0   
4  0.0  0.0     0.0        0.0      0.0   0.0     0.0      0.0         0.0   

   abstract  ...  zadorozhny   ze  zero  zhang  zhendong  zimin  zip  ziplip  \
0       0.0  ...         0.0  0.0   0.0    0.0       0.0    0.0  0.0     0.0   
1       0.0  ...         0.0  0.0   0.0    0.0       0.0    0.0  0.0     0.0   
2       0.0  ...         0.0  0.0   0.0    0.0       0.0    0.0  0.0     0.0   
3       0.0  ...         0.0  0.0   0.0    0.0       0.0    0.0  0.0     0.0   
4       0.0  ...         0.0  0.0   0.0    0.0       0.0    0.0  0.0     0.0   

   zipter  zone  
0   

In [27]:
# Fit a Multinomial Naive Bayes classifier on the training TF-IDF features
model = MultinomialNB()
model.fit(X=X_train_tfidf, y=y_train)

In [28]:
# Persist the fitted classifier so it can be reused without retraining
joblib.dump(value=model, filename='naive_bayes_model.pkl')

# Persist the fitted vectorizer alongside it; the model's features only make
# sense with the vocabulary this vectorizer learned
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [29]:
# Reload the persisted artifacts. joblib.load unpickles the files, so only
# load files produced by this notebook or another trusted source.
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
loaded_model = joblib.load('naive_bayes_model.pkl')

# Vectorize the test text with the reloaded vectorizer and predict its labels
X_test_tfidf_loaded = loaded_vectorizer.transform(X_test)
y_pred_loaded = loaded_model.predict(X_test_tfidf_loaded)

# Score the reloaded model against the true test labels
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)
report_loaded = classification_report(y_test, y_pred_loaded)

print(f"Accuracy: {accuracy_loaded}", "Classification Report:", report_loaded, sep="\n")

Accuracy: 0.9904013961605584
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       872
           1       0.99      0.97      0.98       274

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.99      1146
weighted avg       0.99      0.99      0.99      1146



In [30]:
# Preprocess the user input
def preprocess_user_input(text):
    """Clean a user-supplied email exactly as the training text was cleaned.

    Delegates to ``clean_text`` (defined in an earlier cell) instead of
    repeating its steps, so inference-time preprocessing cannot drift away
    from the preprocessing the model was trained on.

    Args:
        text: The raw email text as a ``str``.

    Returns:
        The cleaned text: letters only, lowercased, stop words removed.
    """
    return clean_text(text)


# Take user input
user_input = input("Please enter your email: ").strip()

cleaned_input = preprocess_user_input(user_input)

In [31]:
# Reload the classifier and vectorizer that were saved after training
model = joblib.load('naive_bayes_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# transform() takes an iterable of documents, so pass a one-element list.
# NOTE: this rebinds X_test_tfidf / X_test_tfidf_df, which earlier cells used
# for the test split, to the single user-supplied email.
single_document = [cleaned_input]
X_test_tfidf = vectorizer.transform(single_document)

# Dense one-row DataFrame view, one column per vocabulary term
input_feature_names = vectorizer.get_feature_names_out()
X_test_tfidf_df = pd.DataFrame(data=X_test_tfidf.toarray(), columns=input_feature_names)

print("\nTF-IDF Features DataFrame for User Input:")
print(X_test_tfidf_df)


TF-IDF Features DataFrame for User Input:
    aa   ab  abacus  abilities  ability  able  abroad  absence  absolutely  \
0  0.0  0.0     0.0        0.0      0.0   0.0     0.0      0.0         0.0   

   abstract  ...  zadorozhny   ze  zero  zhang  zhendong  zimin  zip  ziplip  \
0       0.0  ...         0.0  0.0   0.0    0.0       0.0    0.0  0.0     0.0   

   zipter  zone  
0     0.0   0.0  

[1 rows x 5000 columns]


In [34]:
# Predict whether the user's email is spam (label 1) or not spam.
# Predict on the sparse TF-IDF matrix rather than on its DataFrame view: the
# model was fitted on a matrix without column names, so passing a DataFrame
# makes scikit-learn warn that X has feature names the model was not fitted with.
predicted_label = model.predict(X_test_tfidf)[0]

user_prediction = 'spam' if predicted_label == 1 else 'not spam'

# Output the prediction
print(f"The email  is: {user_prediction}")

The email  is: not spam


