# Importing the Required Libraries and Dataset

### Importing the Required Libraries

In [1]:
import pandas as pd
import numpy as np

### Loading the Dataset

In [2]:
data=pd.read_csv("email.csv")

# Data Pre-Processing

### Displaying the First Five Rows of the Dataset

In [3]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Getting the Info of the Dataset

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Getting the Overall Statistics of the Dataset

In [5]:
data.describe()

Unnamed: 0,Category,Message
count,5573,5573
unique,3,5158
top,ham,"Sorry, I'll call later"
freq,4825,30


### Checking for the Missing Values

In [6]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

### Checking for the Duplicate Values

In [7]:
data.duplicated().any()

True

In [8]:
data=data.drop_duplicates()

In [9]:
data.duplicated().any()

False

### Converting the Categorical Column to Numerical Column

In [10]:
data["Category"].unique()

array(['ham', 'spam', '{"mode":"full"'], dtype=object)

In [11]:
data["Category"] = data["Category"].replace("spam", 0)
data["Category"] = data["Category"].replace("ham", 1)

In [12]:
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
print(data["Category"].unique())

[1 0 '{"mode":"full"']


In [14]:
data = data[data["Category"] != '{"mode":"full"']

In [15]:
print(data["Category"].unique())

[1 0]


In [16]:
data["Category"]=data["Category"].astype(int)

### Separating the Dependent and Independent Variables

In [17]:
x=data["Message"]
y=data["Category"]

### Feature Extraction For the "Message" Column

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [20]:
feature_extraction.fit(x)
x=feature_extraction.transform(x)

### Splitting the Dataset to Training & Testing Dataset

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=3)

### Training the Model(Naive Bayes Classifier) 

In [23]:
from sklearn.naive_bayes import MultinomialNB
classifier_NB=MultinomialNB()
classifier_NB.fit(x_train,y_train)

In [24]:
classifier_NB_predictions_on_testing_data=classifier_NB.predict(x_test)

### Training the Model(Logistic Regression)

In [25]:
from sklearn.linear_model import LogisticRegression
classifier_LR=LogisticRegression()
classifier_LR.fit(x_train,y_train)

In [26]:
classifier_LR_predictions_on_testing_data=classifier_LR.predict(x_test)

### Comparing the Accuracy Scores of the Models

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
accuracy_score_of_classifier_NB=accuracy_score(y_test,classifier_NB_predictions_on_testing_data)

In [29]:
accuracy_score_of_classifier_LR=accuracy_score(y_test,classifier_LR_predictions_on_testing_data)

In [30]:
print("Accuracy Score Of Naive Bayes Classifier:",accuracy_score_of_classifier_NB)
print("Accuracy Score Of Logistic Regression:",accuracy_score_of_classifier_LR)

Accuracy Score Of Naive Bayes Classifier: 0.9554263565891473
Accuracy Score Of Logistic Regression: 0.9341085271317829


In [31]:
### Predicting the Mail Category

In [32]:
Input_Your_Mail=["We are excited to inform you that you have been selected as the lucky winner of a $1,000,000 cash prize in our annual lottery! This is a once-in-a-lifetime opportunity, and we are thrilled to share this incredible news with you."]

In [33]:
input_mail_feature=feature_extraction.transform(Input_Your_Mail)

In [34]:
prediction=classifier_NB.predict(input_mail_feature)

In [35]:
if prediction==0:
    print("Alert! This mail is a Spam")
else:
    print("The mail is Harmless")

Alert! This mail is a Spam


### ---The Project Completed Assigned By the Ezitech Institute---