In [9]:
import pandas as pd 

messages = pd.read_csv('SMSSpamCollection.txt',
        sep='\t',names=["label","message"])

In [10]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Text Cleaning and Pre-processing 

In [11]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [13]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    # Substitutes anythis with an empty space Which is no
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    # Ignore all the stop words 
    review = ' '.join(review)
    corpus.append(review)
    # Join the words into respective sentences and add them into the corpus 

In [14]:
corpus[1:10]

['ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,binary=True)
X = cv.fit_transform(corpus).toarray()

In [16]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values
# Making dummies of the labes and only selecting one column for labelling 

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [18]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [19]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score,classification_report

score=accuracy_score(y_test,y_pred)
print(score)

0.9838565022421525


In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       966
        True       0.94      0.94      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



<h3>Using TFI-DF
<h5>It gives more weight to most occuring words
<h6>-Max features is only used to select top 'n' most occuring words
<h6>-binary=true means you only want presence of words not its frequency in a sentence and it also helps in reduceing dimensions 
<h6>-binary= False means you also want the frequency of the words in each sentence to get more accurate results

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [24]:
#prediction
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)

0.9847533632286996


In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       972
        True       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



In [26]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


-> gensim is a library for topic modeling and document similarity analysis. 


-> simple_preprocess is a function from gensim used for tokenizing a document into a list of lowercase tokens.

-> In tokenize you get the meaning of the sentence because the sentence is divided in such a way that their sequence makes sense

sentences = sent_tokenize(text)  # ["I love programming.", "Do you?"]

tokenized_words = [simple_preprocess(sentence) for sentence in sentences]

[['i', 'love', 'programming'], ['do', 'you']]

In [27]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))
corpus = [' '.join(sentence) for sentence in words]

In [28]:
X = tv.fit_transform(corpus).toarray()

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [5564, 5572]

In [None]:
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(score)

0.9847533632286996
