## NLP Preprocessing

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv

In [2]:
def preprocessing(text):
    #text = text.decode('utf8')
    
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    # stop word removal
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    
    # removal words less than three letters
    tokens = [word for word in tokens if len(word) >=3]
    
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    
    # lemmatize
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(word) for word in tokens]
    
    preprocessed_text = ' ' .join(tokens)
    
    return preprocessed_text

In [3]:
with open('SMSSpamCollection') as sms:
    sms_data = []
    sms_labels = []
    
    csv_reader = csv.reader(sms, delimiter='\t')
    for line in csv_reader:
        # adding the label
        sms_labels.append(line[0])
    
        # adding the data
        sms_data.append(preprocessing(line[1]))

In [4]:
len(sms_labels)

5572

In [5]:
len(sms_data)

5572

## Machine Learning Model

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    sms_data, sms_labels, test_size=0.30, random_state=42)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    min_df=2, 
    ngram_range=(1, 2),  
    stop_words='english',  
    strip_accents='unicode',  
    norm='l2')

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [18]:
print('Train shape: ', X_train.shape)
print('Test shape: ', X_test.shape)

Train shape:  (3900, 5888)
Test shape:  (1672, 5888)


### Decision trees

In [24]:
from sklearn import tree

clf_tree = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_tree_predict = clf_tree.predict(X_test.toarray())

print('Classification report: ')
print(classification_report(y_test, y_tree_predict))
print('Confusion matrix: ')
print(confusion_matrix(y_test, y_tree_predict))

Classification report: 
             precision    recall  f1-score   support

        ham       0.97      0.99      0.98      1448
       spam       0.92      0.82      0.87       224

avg / total       0.97      0.97      0.97      1672

Confusion matrix: 
[[1433   15]
 [  41  183]]


### Random forest algorithm

In [27]:
from sklearn.ensemble import RandomForestClassifier

clf_rfc = RandomForestClassifier(n_estimators=10).fit(X_train.toarray(), y_train)
y_rfc_predict = clf_rfc.predict(X_test.toarray())

print('Classification report: ')
print(classification_report(y_test, y_rfc_predict))
print('Confusion matrix: ')
print(confusion_matrix(y_test, y_rfc_predict))

Classification report: 
             precision    recall  f1-score   support

        ham       0.98      1.00      0.99      1448
       spam       0.99      0.85      0.92       224

avg / total       0.98      0.98      0.98      1672

Confusion matrix: 
[[1446    2]
 [  33  191]]


### Support vector machines

In [25]:
from sklearn.svm import LinearSVC

clf_svc = LinearSVC()
clf_svc.fit(X_train, y_train)
y_predict = clf_svc.predict(X_test)

print('Classification report: ')
print(classification_report(y_test, y_predict))
print('Confusion matrix: ')
print(confusion_matrix(y_test, y_predict))

Classification report: 
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99      1448
       spam       0.99      0.93      0.96       224

avg / total       0.99      0.99      0.99      1672

Confusion matrix: 
[[1445    3]
 [  16  208]]


## Tensorflow multi-layer neural network

In [84]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def one_hot_transform(label_data):
    l_enc = LabelEncoder()
    l_enc.fit(label_data)
    one_hot_label = OneHotEncoder(sparse=False).fit_transform(l_enc.transform(label_data).reshape(-1,1))
    return one_hot_label

new_y_train = one_hot_transform(y_train)
new_y_test = one_hot_transform(y_test)

In [85]:
print(X_train.shape)
print(X_test.shape)

print(new_y_train.shape)
print(new_y_test.shape)

(3900, 5888)
(1672, 5888)
(3900, 2)
(1672, 2)


(3900, 5888)

In [107]:
import tensorflow as tf
import numpy as np

train_x,train_y,test_x,test_y = X_train.toarray(), new_y_train, X_test.toarray(), new_y_test

n_nodes_hl1 = 1500
n_nodes_hl2 = 1500
n_nodes_hl3 = 1500

n_classes = 2
batch_size = 100
hm_epochs = 10

x = tf.placeholder('float')
y = tf.placeholder('float')

hidden_1_layer = {'f_fum':n_nodes_hl1,
                  'weight':tf.Variable(tf.random_normal([len(train_x[0]), n_nodes_hl1])),
                  'bias':tf.Variable(tf.random_normal([n_nodes_hl1]))}

hidden_2_layer = {'f_fum':n_nodes_hl2,
                  'weight':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                  'bias':tf.Variable(tf.random_normal([n_nodes_hl2]))}

hidden_3_layer = {'f_fum':n_nodes_hl3,
                  'weight':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                  'bias':tf.Variable(tf.random_normal([n_nodes_hl3]))}

output_layer = {'f_fum':None,
                'weight':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                'bias':tf.Variable(tf.random_normal([n_classes])),}


# Nothing changes
def neural_network_model(data):

    l1 = tf.add(tf.matmul(data,hidden_1_layer['weight']), hidden_1_layer['bias'])
    l1 = tf.nn.relu(l1)

    l2 = tf.add(tf.matmul(l1,hidden_2_layer['weight']), hidden_2_layer['bias'])
    l2 = tf.nn.relu(l2)

    l3 = tf.add(tf.matmul(l2,hidden_3_layer['weight']), hidden_3_layer['bias'])
    l3 = tf.nn.relu(l3)

    output = tf.matmul(l3,output_layer['weight']) + output_layer['bias']

    return output

def train_neural_network(x):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y) )
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(hm_epochs):
            epoch_loss = 0
            i=0
            while i < len(train_x):
                start = i
                end = i+batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]

                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
                epoch_loss += c
                i+=batch_size
                
            print('Epoch', epoch+1, 'completed out of',hm_epochs,'loss:',epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        print('Accuracy:',accuracy.eval({x:test_x, y:test_y}))
  
train_neural_network(x)

Epoch 1 completed out of 10 loss: 192468.23510742188
Epoch 2 completed out of 10 loss: 27505.369210004807
Epoch 3 completed out of 10 loss: 11215.385077953339
Epoch 4 completed out of 10 loss: 3937.876480460167
Epoch 5 completed out of 10 loss: 1823.24502658844
Epoch 6 completed out of 10 loss: 304.07083892822266
Epoch 7 completed out of 10 loss: 230.83144998550415
Epoch 8 completed out of 10 loss: 158.57072269916534
Epoch 9 completed out of 10 loss: 129.63890075683594
Epoch 10 completed out of 10 loss: 97.39441734552383
Accuracy: 0.95873207
