In [9]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import string
import requests
import io
import nltk
from zipfile import ZipFile
from sklearn.feature_extraction.text import TfidfVectorizer 
# TensorFlow session shared by the later cells, which use sess.run(...) to
# initialise variables, run training steps and evaluate loss/accuracy.
sess = tf.Session()
%matplotlib inline

In [10]:
# Tunable settings used by the later cells.
batch_size = 200       # number of rows sampled for each training step
max_features = 1000    # vocabulary cap for the TF-IDF matrix / width of the weight matrix

In [11]:
# Load the SMS Spam Collection. A local CSV copy is used when present;
# otherwise the zip archive is downloaded, parsed and cached as CSV.
save_file_name = os.path.join('smsspamcollection','SMSSpamCollection.csv')
if os.path.isfile(save_file_name):
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields and Windows line endings round-trip correctly.
    with open(save_file_name, 'r', newline='') as temp_output_file:
        reader = csv.reader(temp_output_file)
        # Skip empty rows (e.g. from a cache written without newline='');
        # they would otherwise break the x[1] lookup below.
        text_data = [row for row in reader if row]
else:
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    # Fail loudly on an HTTP error instead of trying to unzip an error page.
    r.raise_for_status()
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('SMSSpamCollection')
    #Format Data
    text_data = file.decode()
    # Drop every non-ASCII character.
    text_data = text_data.encode('ascii',errors='ignore')
    text_data = text_data.decode().split('\n')
    # Each non-empty line is "<label>\t<message>".
    text_data = [x.split('\t') for x in text_data if len(x)>=1]

    #And write to csv
    # The cache directory does not exist on a first run; create it so the
    # open() below cannot fail with FileNotFoundError.
    os.makedirs(os.path.dirname(save_file_name), exist_ok=True)
    with open(save_file_name, 'w', newline='') as temp_output_file:
        writer = csv.writer(temp_output_file)
        writer.writerows(text_data)
# Column 0 holds the label, column 1 the message text.
texts = [x[1] for x in text_data]
target = [x[0] for x in text_data]
#Relabel 'spam' as 1 'ham' as 0
target = [1 if x=='spam' else 0 for x in target]

In [12]:
# Normalise the messages to shrink the vocabulary: lower-case everything,
# drop punctuation and digits, then collapse runs of whitespace.

# One translation table deletes punctuation and digits in a single pass.
_strip_table = str.maketrans('', '', string.punctuation + string.digits)
texts = [' '.join(x.lower().translate(_strip_table).split()) for x in texts]

In [8]:
def tokenizer(text):
    """Split a sentence into its words using nltk's word tokenizer.

    scikit-learn's TF-IDF vectorizer needs to be told how to break a
    sentence into tokens; this function is passed to it for that purpose.
    """
    return nltk.word_tokenize(text)
# Build the TF-IDF matrix of the messages, keeping at most `max_features`
# terms and dropping English stop words.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer,
    stop_words='english',
    max_features=max_features,
)
sparse_tfidf_texts = tfidf.fit_transform(texts)

In [13]:
# Split the data set into an 80% train set and a 20% test set.
n_samples = sparse_tfidf_texts.shape[0]
# Random sample without replacement; note the indices are NOT sorted.
train_indices = np.random.choice(n_samples, round(0.8*n_samples), replace=False)
# Every index that was not drawn for training (sorted, integer dtype).
test_indices = np.setdiff1d(np.arange(n_samples), train_indices)
texts_train = sparse_tfidf_texts[train_indices]
texts_test = sparse_tfidf_texts[test_indices]
# Select the labels with the same index arrays as the feature rows so that
# row k of texts_train/texts_test always pairs with element k of
# target_train/target_test. Walking `target` in ascending order would pair
# shuffled feature rows with labels of other messages.
target_array = np.array(target)
target_train = target_array[train_indices]
target_test = target_array[test_indices]

In [14]:
# Logistic-regression parameters: one weight per TF-IDF feature plus a bias,
# both drawn from a random normal initialiser.
A = tf.Variable(tf.random_normal([max_features, 1]))
b = tf.Variable(tf.random_normal([1, 1]))

# Placeholders fed at run time: a batch of feature rows and a column of labels.
x_data = tf.placeholder(tf.float32, [None, max_features])
y_target = tf.placeholder(tf.float32, [None, 1])

In [15]:
# Linear model output (logits). The sigmoid of logistic regression is applied
# inside the loss function rather than here.
model_output = tf.matmul(x_data, A) + b

# Mean sigmoid cross-entropy between the logits and the 0/1 targets.
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=y_target,
    logits=model_output,
)
loss = tf.reduce_mean(per_example_loss)

In [16]:
# Prediction and accuracy ops, so train/test accuracy can be monitored while
# the model is training.
predicted_probability = tf.sigmoid(model_output)
# Rounding the probability gives the predicted 0/1 class.
prediction = tf.round(predicted_probability)
# 1.0 where the prediction equals the target, 0.0 elsewhere.
predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
accuracy = tf.reduce_mean(predictions_correct)

In [17]:
# Plain gradient descent (learning rate 0.0025) minimising the loss.
my_opt = tf.train.GradientDescentOptimizer(learning_rate=0.0025)
train_step = my_opt.minimize(loss)

# Initialize all graph variables in the session.
init = tf.global_variables_initializer()
sess.run(init)

In [18]:
#We now train our model over 10,000 generations and record the test/train loss and accuracy every 100 generations and
#print out the status every 500 generations.
train_loss = []
test_loss = []
train_acc = []
test_acc = []
i_data = []

# The test set never changes, so densify it and shape its labels into a
# column once instead of on every evaluation.
test_feed = {x_data: texts_test.todense(), y_target: np.transpose([target_test])}

for i in range(10000):
    # Sample a random training batch (with replacement) and take one step.
    rand_index = np.random.choice(texts_train.shape[0],size=batch_size)
    rand_x = texts_train[rand_index].todense()
    # Labels as a (batch_size, 1) column to match the y_target placeholder.
    rand_y = np.transpose([target_train[rand_index]])
    train_feed = {x_data:rand_x, y_target:rand_y}
    sess.run(train_step, feed_dict=train_feed)

    #Only record loss and accuracy every 100 generations
    if (i+1)%100 == 0:
        i_data.append(i+1)

        # Fetch loss and accuracy together: one forward pass per data set.
        train_loss_temp, train_acc_temp = sess.run([loss, accuracy], feed_dict=train_feed)
        test_loss_temp, test_acc_temp = sess.run([loss, accuracy], feed_dict=test_feed)

        train_loss.append(train_loss_temp)
        test_loss.append(test_loss_temp)
        train_acc.append(train_acc_temp)
        test_acc.append(test_acc_temp)

    # 500 is a multiple of 100, so the *_temp values printed here were
    # refreshed by the recording branch above in this same iteration.
    if (i+1)%500 == 0:
        acc_and_loss = [i+1, train_loss_temp, test_loss_temp, train_acc_temp, test_acc_temp]
        acc_and_loss = [np.round(x,2) for x in acc_and_loss]
        print('Generation # {}. Train Loss (Test Loss): {:.2f} ({:.2f}). Train Acc (Test Acc): {:.2f} ({:.2f})'.format(*acc_and_loss))

Generation # 500. Train Loss (Test Loss): 0.69 (0.70). Train Acc (Test Acc): 0.57 (0.57)
Generation # 1000. Train Loss (Test Loss): 0.61 (0.61). Train Acc (Test Acc): 0.70 (0.68)
Generation # 1500. Train Loss (Test Loss): 0.51 (0.56). Train Acc (Test Acc): 0.76 (0.74)
Generation # 2000. Train Loss (Test Loss): 0.51 (0.52). Train Acc (Test Acc): 0.77 (0.77)
Generation # 2500. Train Loss (Test Loss): 0.56 (0.50). Train Acc (Test Acc): 0.78 (0.80)
Generation # 3000. Train Loss (Test Loss): 0.44 (0.49). Train Acc (Test Acc): 0.82 (0.81)
Generation # 3500. Train Loss (Test Loss): 0.45 (0.48). Train Acc (Test Acc): 0.86 (0.82)
Generation # 4000. Train Loss (Test Loss): 0.44 (0.48). Train Acc (Test Acc): 0.82 (0.82)
Generation # 4500. Train Loss (Test Loss): 0.46 (0.47). Train Acc (Test Acc): 0.82 (0.83)
Generation # 5000. Train Loss (Test Loss): 0.49 (0.47). Train Acc (Test Acc): 0.84 (0.84)
Generation # 5500. Train Loss (Test Loss): 0.47 (0.47). Train Acc (Test Acc): 0.84 (0.84)
Generation 