# using TF_IDF NLP Technique and random forest classifier to determine spam-non spam text

### Read In & Clean Text

In [39]:
pip install nltk



In [41]:
# Read in and clean data
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import string






> Indented block read files



In [42]:
import sys
#for file reading
from google.colab import files
import io

uploaded = files.upload()
messages = pd.read_csv(io.BytesIO(uploaded["spam.csv"]),  encoding='latin-1')
messages.describe()

Saving spam.csv to spam (2).csv


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [8]:
messages.head

<bound method NDFrame.head of         v1  ... Unnamed: 4
0      ham  ...        NaN
1      ham  ...        NaN
2     spam  ...        NaN
3      ham  ...        NaN
4      ham  ...        NaN
...    ...  ...        ...
5567  spam  ...        NaN
5568   ham  ...        NaN
5569   ham  ...        NaN
5570   ham  ...        NaN
5571   ham  ...        NaN

[5572 rows x 5 columns]>

In [44]:
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)

messages.columns = ["label", "text"]
messages['label'] = np.where(messages['label']=='spam', 1, 0)
messages.head(3)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [45]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#stop_words = set(stopwords.words('english'))
stopwords = nltk.corpus.stopwords.words('english')


def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

messages['clean_text'] = messages['text'].apply(lambda x: clean_text(x))
messages.head()

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [47]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(messages['clean_text'],
                                                    messages['label'], test_size=0.2)

In [48]:
# What do the first ten messages in the training set look like?
X_train[:]

1131                            [sorry, ill, call, later]
3546    [rock, yr, chik, get, 100s, filthy, films, xxx...
1134    [u, wake, already, thanx, 4, e, tau, sar, piah...
2378    [hi, mobile, ltgt, added, contact, list, wwwfu...
4080                            [hurry, home, soup, done]
                              ...                        
794     [generally, isnt, one, uncountable, noun, u, d...
386                               [customer, place, call]
699                     [ku, also, dont, msg, reply, msg]
5441    [way, make, sure, u, get, train, worc, foregat...
4061           [hi, dear, saw, dear, happy, battery, low]
Name: clean_text, Length: 4457, dtype: object

In [49]:
# What do the labels look like?
y_train[:10]

1131    0
3546    1
1134    0
2378    0
4080    0
5112    1
2954    0
1773    0
708     1
3538    0
Name: label, dtype: int64

In [50]:
# Let's save the training and test sets to ensure we are using the same data for each model

X_train.to_csv('X_train.csv', index=False, header=True)
X_test.to_csv('X_test.csv', index=False, header=True)
y_train.to_csv('y_train.csv', index=False, header=True)
y_test.to_csv('y_test.csv', index=False, header=True)

In [51]:
X_train.head()

1131                            [sorry, ill, call, later]
3546    [rock, yr, chik, get, 100s, filthy, films, xxx...
1134    [u, wake, already, thanx, 4, e, tau, sar, piah...
2378    [hi, mobile, ltgt, added, contact, list, wwwfu...
4080                            [hurry, home, soup, done]
Name: clean_text, dtype: object

In [52]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [53]:
X_train.head()

Unnamed: 0,clean_text
0,"['sorry', 'ill', 'call', 'later']"
1,"['rock', 'yr', 'chik', 'get', '100s', 'filthy'..."
2,"['u', 'wake', 'already', 'thanx', '4', 'e', 't..."
3,"['hi', 'mobile', 'ltgt', 'added', 'contact', '..."
4,"['hurry', 'home', 'soup', 'done']"


### Create TF-IDF Vectors

In [54]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])


In [55]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'sorry': 6659,
 'ill': 3789,
 'call': 1683,
 'later': 4250,
 'rock': 6116,
 'yr': 8159,
 'chik': 1883,
 'get': 3236,
 '100s': 255,
 'filthy': 2964,
 'films': 2962,
 'xxx': 8068,
 'pics': 5489,
 'phone': 5472,
 'rply': 6148,
 'filth': 2963,
 '69669': 639,
 'saristar': 6225,
 'ltd': 4477,
 'e14': 2624,
 '9yt': 802,
 '08701752560': 72,
 '450p': 553,
 'per': 5426,
 'days': 2300,
 'stop2': 6834,
 'cancel': 1722,
 'wake': 7705,
 'already': 986,
 'thanx': 7152,
 'tau': 7062,
 'sar': 6220,
 'piah': 5483,
 'quite': 5838,
 'nice': 5014,
 'hi': 3573,
 'mobile': 4791,
 'ltgt': 4481,
 'added': 868,
 'contact': 2087,
 'list': 4369,
 'wwwfullonsmscom': 8018,
 'great': 3369,
 'place': 5516,
 'send': 6330,
 'free': 3096,
 'sms': 6590,
 'people': 5425,
 'visit': 7651,
 'fullonsmscom': 3158,
 'hurry': 3739,
 'home': 3624,
 'soup': 6674,
 'done': 2533,
 'december': 2332,
 '11mths': 273,
 'entitled': 2729,
 'update': 7532,
 'latest': 4252,
 'colour': 2008,
 'camera': 1715,
 'vco': 7604,
 '08002986906': 51

In [56]:

# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [57]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [58]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.83 / Accuracy: 0.977
