# **EMAIL SPAM DETECTION USING MACHINE LEARNING**

In [None]:
#import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
#load dataset
# Read spam.csv (path is relative to the notebook's working directory),
# decoding the file as latin-1.
df = pd.read_csv('spam.csv', encoding = 'latin-1')

In [None]:
#data preprocessing- keep only the label and text columns, under readable names
# rename() ignores mapping keys that are not present, so re-running this cell
# after the columns are already called 'label'/'info' is a no-op instead of
# raising KeyError on the missing 'v1'/'v2' columns.
df = df.rename(columns={'v1': 'label', 'v2': 'info'})[['label', 'info']]

In [None]:
#feature extraction- text into numerical vectors
tvect = TfidfVectorizer(stop_words = 'english')
X = tvect.fit_transform(df['info'])
y = (df['label'] == 'spam').astype(int)

In [None]:
#model training- Naive Bayes
nb = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
nb.fit(X_train,y_train)

In [None]:
#predicting spam or not
# Predict a class for every row of the held-out test matrix. The target was
# built as (label == 'spam'), so 1 means spam and 0 means not spam.
y_pred = nb.predict(X_test)

In [None]:
#performance test- accuracy and per-class metrics on the held-out test set
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", acc)
# Header and report are emitted on separate lines by using a newline separator.
print(
    'Classification Report(f1 score, precision, recall and support):',
    classification_report(y_test, y_pred),
    sep='\n',
)

Accuracy: 0.968609865470852
Classification Report(f1 score, precision, recall and support):
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115



## CONCLUSION

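We trained a Multinomial Naive Bayes classifier on TF-IDF features to classify emails as spam or non-spam. In the run shown above it reached about 97% accuracy on the held-out test set. Spam precision was 1.00 but spam recall was only 0.77, so the model almost never flags a legitimate email as spam, yet it misses roughly a quarter of the actual spam.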
