### Spam detection in nlp
I get to understand fundamental NLP concepts such as stemming, lemmatization, stop words, tokenization and more!

In [37]:
# # Final Step - ML Model

# ### Read in text

# In[3]:


import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

In [38]:
# ### Create function to remove punctuation, tokenize, remove stopwords, and stem

# In[4]:


def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text


In [40]:
# ### Apply CountVectorizer

# In[5]:


from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
#print(count_vect.get_feature_names())
print(X_counts)



(5567, 8104)
  (0, 3134)	1
  (0, 2790)	2
  (0, 436)	1
  (0, 7816)	1
  (0, 2120)	1
  (0, 7782)	1
  (0, 2909)	2
  (0, 2288)	1
  (0, 3011)	1
  (0, 7168)	1
  (0, 456)	1
  (0, 4640)	1
  (0, 443)	1
  (0, 7027)	1
  (0, 879)	1
  (0, 5917)	1
  (0, 5829)	1
  (0, 7350)	1
  (0, 5876)	1
  (0, 1228)	1
  (0, 73)	1
  (1, 4931)	1
  (1, 2586)	1
  (1, 7095)	1
  (1, 3332)	1
  :	:
  (5563, 3320)	1
  (5563, 8101)	1
  (5563, 3123)	1
  (5563, 2818)	1
  (5564, 6830)	1
  (5564, 4833)	1
  (5564, 5528)	1
  (5564, 6528)	1
  (5565, 3134)	1
  (5565, 4369)	1
  (5565, 7693)	1
  (5565, 5015)	1
  (5565, 7473)	1
  (5565, 6550)	1
  (5565, 1776)	1
  (5565, 2748)	1
  (5565, 3239)	1
  (5565, 3462)	1
  (5565, 3801)	1
  (5565, 3916)	1
  (5565, 997)	1
  (5565, 1564)	1
  (5566, 4937)	1
  (5566, 7306)	1
  (5566, 6070)	1


In [41]:
# # Seperating Dependent and Independent Variable

# In[7]:


X = X_counts.toarray()
y = data.iloc[:,0].values
print(y)



['spam' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [42]:
# # Encoding The Dependent Variable

# In[9]:


#To deal with categorical data we had to convert it into numbers
from sklearn.preprocessing import LabelEncoder
labelencoder_y=LabelEncoder()
#converting first column into integer values
y=labelencoder_y.fit_transform(y)
print(y)

[1 0 0 ... 0 0 0]


In [43]:
# # Splitting The Dataset

# In[10]:


# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)



In [44]:
# # Using Naive Bayes Classifier

# In[13]:


# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)


In [45]:
# # Predicting Results

# In[14]:


# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [47]:
# # Confusion Matrix

# In[15]:


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[841 118]
 [ 12 143]]


In [48]:
(841+143)/(841+143+118+12)

0.8833034111310593