# **Exercise 1 - Implementation of Naive Bayes Algorithm**

# *A.1. Naive Bayes Classifier for Email Classification.*

**Step -1- Load the Data:**

In [20]:
import pandas as pd
# Read the spam/ham e-mail CSV; its first column is used as the row index.
df = pd.read_csv(
    "/content/spam_ham_dataset.csv",
    index_col=0,
    engine='python',
)
# Number of e-mails (rows); the cleaning loop iterates this many times.
length = df['text'].shape[0]

**Step -2- Text Cleaning with NLTK Library:**

In [22]:
import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Cleaned e-mails: one space-joined string of stemmed, stopword-free tokens
# per row of df['text'], in the same order as the rows.
corpus = []

ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Build the lookup set once. Rebuilding set(all_stopwords) for every word is
# loop-invariant work and dominated the run time of this cell.
stopword_set = set(all_stopwords)

for raw_text in df['text']:
    # Replace every character that is not an ASCII letter (punctuation,
    # digits, line breaks, ...) with a space.
    text = re.sub('[^a-zA-Z]', ' ', raw_text)

    # Lowercase, then tokenize on whitespace.
    tokens = text.lower().split()

    # Drop stopwords first, then stem the remaining tokens.
    text = ' '.join(ps.stem(word) for word in tokens if word not in stopword_set)
    corpus.append(text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Step -3- Re-verifying and further cleaning of the Data:**

In [25]:
# Work on a copy so the frame loaded from the CSV is left untouched.
data_check = df.copy()
data_check['cleanText'] = corpus

# The sample rows all begin with a "Subject:" header, which the cleaning step
# turns into the token 'subject'; strip it so it does not become a feature.
# regex=False pins a literal match regardless of the pandas version's default.
# NOTE(review): this removes the substring 'subject' everywhere in the text,
# not only the leading header token -- confirm that is intended.
data_check['cleanText'] = data_check['cleanText'].str.replace('subject', '', regex=False)

# Only the last expression of a cell is displayed, so preview once, at the end.
data_check.head()

Unnamed: 0,label,text,label_num,cleanText
605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,enron methanol meter follow note gave monday ...
2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,hpl nom januari see attach file hplnol xl hpl...
3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,neon retreat ho ho ho around wonder time year...
4685,spam,"Subject: photoshop , windows , office . cheap ...",1,photoshop window offic cheap main trend abas ...
2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,indian spring deal book teco pvr revenu under...


**Step -4- Constructing the Feature Matrix and Label Vector:**

In [27]:
# Feature input: the cleaned e-mail text, one string per row.
x = data_check['cleanText'].values
# Target vector: the numeric label column.
y = data_check['label_num'].values

**Step -5- Text Representation using Count Vectorization:**

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words representation: one column per vocabulary term, one row per e-mail.
cv = CountVectorizer()
# Read the text straight from data_check instead of from `x` itself, so that
# re-running this cell does not feed the already-vectorized matrix back into
# the vectorizer (the original `x = cv.fit_transform(x)` was not idempotent).
# NOTE(review): .toarray() materializes the full dense count matrix; it is kept
# so the type of `x` is unchanged for later cells, but the sparse result of
# fit_transform would likely use far less memory -- confirm whether dense is needed.
x = cv.fit_transform(data_check['cleanText'].values).toarray()

**Step -6- Final Model Building using Scikit Learn:**

In [29]:
# Necessary Import
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

**Step -7- Final Evaluation:**

In [31]:
# Fit a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the held-out split and summarize the results.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9755154639175257
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      1121
           1       0.95      0.96      0.96       431

    accuracy                           0.98      1552
   macro avg       0.97      0.97      0.97      1552
weighted avg       0.98      0.98      0.98      1552

