<a href="https://colab.research.google.com/github/NarnindiMounica/Data_Science/blob/main/Spam_Classifier_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Spam Classification-NLP

In [4]:
# Loading Data into dataframe

import pandas as pd
df = pd.read_csv('/content/sample_data/spam.csv', encoding='latin-1')

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Columns Unnamed: 2, Unnamed: 3, Unnamed: 4 are not required, hence dropping off
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [7]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Changing column names to meaningful words
df.columns = ['label','message']

In [9]:
# Checking for null values
df.isnull().sum()

label      0
message    0
dtype: int64

In [10]:
# checking for duplicate rows

df.duplicated().sum()

403

In [11]:
df = df[~df.duplicated()]

In [12]:
df.shape

(5169, 2)

In [15]:
df.duplicated().sum()

0

In [17]:
# forming dependent and independent variables y and x

In [18]:
df['label'].value_counts()

ham     4516
spam     653
Name: label, dtype: int64

In [21]:
df['label'] = df['label'].map({'ham':0, 'spam':1})
df['label'].value_counts()

0    4516
1     653
Name: label, dtype: int64

In [22]:
y = df[['label']]

In [29]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [30]:
df['num_char'] = df['message'].apply(len)
df['num_words'] = df['message'].apply(lambda x : len(nltk.word_tokenize(x)))
df['num_sen'] = df['message'].apply(lambda x : len(nltk.sent_tokenize(x)))

In [31]:
df.head()

Unnamed: 0,label,message,num_char,num_words,num_sen
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2
1,0,Ok lar... Joking wif u oni...,29,8,2
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2
3,0,U dun say so early hor... U c already then say...,49,13,1
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1


In [48]:
# Pre-processing function
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [49]:
def pre_process(text):
  text = re.sub('[^a-zA-Z]',' ', text)
  text = text.lower()
  words = text.split()
  word = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
  text = ' '.join(word)
  return text

In [50]:
df['pre_processed_text'] = df['message'].apply(pre_process)

In [51]:
df.head(2)

Unnamed: 0,label,message,num_char,num_words,num_sen,pre_processed_text
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2,go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,29,8,2,ok lar joking wif u oni


In [52]:
# Independent variables
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vect = CountVectorizer()
tfidf = TfidfVectorizer()


In [53]:
x_bow = count_vect.fit_transform(df['pre_processed_text'])
x_tfidf = tfidf.fit_transform(df['pre_processed_text'])
x_bow = x_bow.toarray()
x_tfidf = x_tfidf.toarray()

In [57]:
# Train Test Split

from sklearn.model_selection import train_test_split
x_train_bow, x_test_bow, y_train_bow, y_test_bow = train_test_split(x_bow, y, random_state=1, test_size =0.2)
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(x_tfidf, y, random_state=1, test_size =0.2)

In [58]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
model_bow = nb.fit(x_train_bow, y_train_bow)
y_bow_pred = model_bow.predict(x_test_bow)


  y = column_or_1d(y, warn=True)


In [59]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(y_test_bow, y_bow_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       899
           1       0.86      0.95      0.90       135

    accuracy                           0.97      1034
   macro avg       0.93      0.96      0.94      1034
weighted avg       0.97      0.97      0.97      1034



In [60]:
## using TFIDF
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
model_tfidf = nb.fit(x_train_tfidf, y_train_tfidf)
y_tfidf_pred = model_bow.predict(x_test_tfidf)


  y = column_or_1d(y, warn=True)


In [61]:
print(classification_report(y_test_tfidf, y_tfidf_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       899
           1       0.96      0.85      0.90       135

    accuracy                           0.98      1034
   macro avg       0.97      0.92      0.94      1034
weighted avg       0.98      0.98      0.98      1034

