In [1]:
# imports 

In [118]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [41]:
# reading the data file

In [42]:
df = pd.read_csv('/kaggle/input/spam-email/spam.csv')
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [43]:
df.shape

(5572, 2)


## Data Preprocessing or Cleaning

In [154]:
# using Stemming 
ps = PorterStemmer()
corpus = []

In [155]:
for i in range(len(df['Message'])):
    words = re.sub('[^a-zA-z]', ' ', df['Message'][i]) # removing all special characters
    words = words.lower()  # lowering all characters
    words = words.split()  # spliting the sentence into words
    # applying stemming using PortStemmer
    words = [ps.stem(word) for word in words if word not in stopwords.words('english')]  
    words = ' '.join(words)
    corpus.append(words)  # appending it to a list

In [157]:
#  using CountVectorizer to convert the message(string) into numerical values 

In [158]:
cv = CountVectorizer()   # bag of words

In [159]:
X = cv.fit_transform(corpus).toarray()

In [160]:
# Encoding the Category(target ) variable using pd.get_dummies()
y = pd.get_dummies(df['Category'], drop_first=True) 

In [161]:
# splitting the data 
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=1)

## Model Creation

In [162]:
lr = LogisticRegression()

In [163]:
lr.fit(X_train, y_train)

  return f(**kwargs)


LogisticRegression()

In [164]:
y_pred = lr.predict(X_test)

In [165]:
matrix = confusion_matrix(y_test, y_pred)
matrix

array([[967,   1],
       [ 15, 132]], dtype=int64)

In [166]:
# 47 False Positive and 2 False Negatives

In [167]:
print(f'The Accuracy Score is {np.round(accuracy_score(y_test, y_pred), 2)}% ') 

The Accuracy Score is 0.99% 


### Using lemmatizer 

In [92]:
lemmatizer = WordNetLemmatizer()
corpus = []

In [93]:
for i in range(len(df['Message'])):
    words = re.sub('[^a-zA-z]', ' ', df['Message'][i]) # removing all special characters
    words = words.lower()  # lowering all characters
    words = words.split()  # spliting the sentence into words
    # applying lemmatization using WordNetLemmatizer
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]  
    words = ' '.join(words)
    corpus.append(words)  # appending it to a list

In [143]:
cv = CountVectorizer()  # bag of words
# for converting Message(string) into Numerical value

In [144]:
X = cv.fit_transform(corpus).toarray()

In [145]:
# Encoding the Category(target ) variable using pd.get_dummies()
y = pd.get_dummies(df['Category'], drop_first=True) 

In [146]:
# splitting the data 
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=1)

## Model Creation

In [147]:
lr = LogisticRegression()

In [148]:
lr.fit(X_train, y_train)

  return f(**kwargs)


LogisticRegression()

In [149]:
y_pred = lr.predict(X_test)

In [150]:
matrix = confusion_matrix(y_test, y_pred)
matrix

array([[967,   1],
       [ 15, 132]], dtype=int64)

In [151]:
# 32 False Positives and 1 False Negatives

In [152]:
print(f'The Accuracy Score is {np.round(accuracy_score(y_test, y_pred), 2)}% ') 

The Accuracy Score is 0.99% 


In [153]:
# Models Accuracy increased by 1% for applying lemmatization 

### Thank You