In [3]:

from __future__ import division
import tensorflow as tf
import numpy as np
import tarfile
import os
import matplotlib.pyplot as plt
import time

In [4]:
def csv_to_numpy_array(filePath, delimiter):
    """Parse a delimited text source into a float32 NumPy array.

    Args:
        filePath: source handed straight to ``np.genfromtxt`` (the notebook
            passes paths to CSV files).
        delimiter: field separator used in the source.

    Returns:
        NumPy array of dtype float32 holding the parsed values.
    """
    parse_options = dict(delimiter=delimiter, dtype='float32')
    return np.genfromtxt(filePath, **parse_options)

def import_data():
    """Load the train/test feature and label matrices from ``data/``.

    If no ``data`` directory exists in the current working directory,
    ``data.tar.gz`` is extracted there first.

    Returns:
        Tuple ``(trainX, trainY, testX, testY)`` of float32 NumPy arrays read
        from ``data/trainX.csv``, ``data/trainY.csv``, ``data/testX.csv`` and
        ``data/testY.csv``.
    """
    if not os.path.isdir("data"):
        # Untar directory of data if we haven't already.
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts member paths; only use it on an
        # archive you produced yourself.
        with tarfile.open("data.tar.gz") as tarObject:
            tarObject.extractall()
        print("Extracted tar to current directory")

    print("loading training data")
    trainX = csv_to_numpy_array("data/trainX.csv", delimiter=",")
    trainY = csv_to_numpy_array("data/trainY.csv", delimiter=",")
    print("loading test data")
    testX = csv_to_numpy_array("data/testX.csv", delimiter=",")
    testY = csv_to_numpy_array("data/testY.csv", delimiter=",")
    return trainX,trainY,testX,testY

# Load the four matrices once; later cells read them as module-level names.
trainX,trainY,testX,testY = import_data()

loading training data
loading test data


In [5]:
def accuracy(predictions, labels):
  """Percentage of rows whose arg-max column agrees between the two arrays.

  Args:
    predictions: 2-D array of per-class scores, one row per example.
    labels: 2-D array of the same layout holding the reference labels.

  Returns:
    Percentage (0-100) of rows where the arg-max columns match.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

In [6]:
# Sanity check: (number of training emails, number of features per email).
trainX.shape

(2250, 3752)

In [7]:

# Widths of the three fully connected hidden layers.
num_hidden_nodes1 = 2000
num_hidden_nodes2 = 1000
num_hidden_nodes3 = 256
# Probability of keeping a unit in each dropout layer (training path only).
keep_prob = 0.5
# numFeatures = the number of words extracted from each email
numFeatures = trainX.shape[1]
# numLabels = number of classes we are predicting (here just 2: Spam or Ham)
numLabels = trainY.shape[1]

graph = tf.Graph()
with graph.as_default():

  # Input data. The full train/test matrices are embedded in the graph as
  # constants, so every optimizer step runs on the whole training set.
  tf_train_dataset = tf.constant(trainX)
  tf_train_labels = tf.constant(trainY)
  tf_test_dataset = tf.constant(testX)

  # Single mail input.
  tf_mail = tf.placeholder(tf.float32, shape=(1, numFeatures))

  # Variables. Each weight matrix is drawn from a truncated normal with
  # stddev = sqrt(2 / fan_in); biases start at zero. The explicit names
  # "v1".."v8" are the keys used in the checkpoint, so they must not change.
  weights1 = tf.Variable(tf.truncated_normal([numFeatures, num_hidden_nodes1],
        stddev=np.sqrt(2.0 / (numFeatures))),name="v1")
  biases1 = tf.Variable(tf.zeros([num_hidden_nodes1]),name="v2")
  weights2 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes1, num_hidden_nodes2], stddev=np.sqrt(2.0 / num_hidden_nodes1)),name="v3")
  biases2 = tf.Variable(tf.zeros([num_hidden_nodes2]),name="v4")
  weights3 = tf.Variable(
       tf.truncated_normal([num_hidden_nodes2, num_hidden_nodes3], stddev=np.sqrt(2.0 / num_hidden_nodes2)),name="v5")
  biases3 = tf.Variable(tf.zeros([num_hidden_nodes3]),name="v6")
  weights4 = tf.Variable(
    tf.truncated_normal([num_hidden_nodes3, numLabels], stddev=np.sqrt(2.0 / num_hidden_nodes3)),name="v7")
  biases4 = tf.Variable(tf.zeros([numLabels]),name="v8")

  # Add ops to save and restore all the variables.
  # Keep this before the optimizer is built: a Saver created with no
  # arguments covers the variables that exist at this point, i.e. only the
  # eight model variables above and not the optimizer's own state.
  saver = tf.train.Saver()

  def _forward(inputs, apply_dropout=False):
    """Run `inputs` through the three ReLU layers and return the output logits.

    With apply_dropout=True, dropout (keep_prob) follows every hidden layer;
    this is used for the training path only.
    """
    hidden = inputs
    for weights, biases in ((weights1, biases1),
                            (weights2, biases2),
                            (weights3, biases3)):
      hidden = tf.nn.relu(tf.matmul(hidden, weights) + biases)
      if apply_dropout:
        hidden = tf.nn.dropout(hidden, keep_prob)
    return tf.matmul(hidden, weights4) + biases4

  # Training computation.
  logits = _forward(tf_train_dataset, apply_dropout=True)
  # NOTE(review): positional (logits, targets) call form; TensorFlow 1.0+
  # requires keyword arguments (labels=..., logits=...) here.
  loss = tf.reduce_mean(
     tf.nn.sigmoid_cross_entropy_with_logits(logits, tf_train_labels))

  # Optimizer.
  optimizer = tf.train.AdamOptimizer(learning_rate=0.1,
                                              beta1=0.9, beta2=0.999,
                                              epsilon=1e-08).minimize(loss)

  # Predictions for the training, test data, and single mail.
  # train_prediction is built from the dropout logits, so the training
  # accuracy derived from it is measured with dropout active.
  train_prediction = tf.nn.sigmoid(logits)
  test_prediction = tf.nn.sigmoid(_forward(tf_test_dataset))
  prediction_mail = tf.nn.sigmoid(_forward(tf_mail))

In [6]:
# Number of full-batch optimizer steps to run.
num_steps = 151
start = time.time()

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction])
    if (step % 10 == 0):
      print("Loss at step %d: %f" % (step, l))
      # Accuracy is only reported every 10th step, so only compute it here.
      print("Accuracy: %.1f%%" % accuracy(predictions, trainY))

  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), testY))

  # Save the variables to disk.
  save_path = saver.save(session, "./model.ckpt")
  print("Model saved in file: %s" % save_path)

# The measured span also covers the test evaluation and the checkpoint save.
end = time.time()
duration = end - start
print("time consumed in training: %f seconds" % duration)

Initialized
Loss at step 0: 0.694168
Accuracy: 34.6%
Loss at step 10: 0.571108
Accuracy: 62.0%
Loss at step 20: 0.241275
Accuracy: 95.5%
Loss at step 30: 0.128373
Accuracy: 98.8%
Loss at step 40: 0.025614
Accuracy: 99.2%
Loss at step 50: 0.030443
Accuracy: 99.6%
Loss at step 60: 0.007038
Accuracy: 99.9%
Loss at step 70: 0.011091
Accuracy: 99.7%
Loss at step 80: 0.003612
Accuracy: 99.8%
Loss at step 90: 0.005343
Accuracy: 99.8%
Loss at step 100: 0.002600
Accuracy: 100.0%
Loss at step 110: 0.002260
Accuracy: 100.0%
Loss at step 120: 0.003292
Accuracy: 99.9%
Loss at step 130: 0.007362
Accuracy: 99.8%
Loss at step 140: 0.002222
Accuracy: 99.9%
Loss at step 150: 0.006215
Accuracy: 99.9%
Test accuracy: 99.6%
Model saved in file: ./model.ckpt
time consumed in training: 983.865948 seconds


In [9]:
import cPickle
from collections import Counter
# Load Bag of words that were in training data
# NOTE(review): unpickling can execute arbitrary code; only load a
# BagOfWords.p file that you produced yourself.
with open('BagOfWords.p', 'rb') as bag_file:
    # The context manager closes the file handle once the pickle is read.
    BagOfWords = cPickle.load(bag_file)
features = set(BagOfWords)
# Map each distinct word to a column index of the feature vector.
# NOTE(review): the indices follow set iteration order, which is not a
# stable ordering in general. They must match the column order used to
# build data/trainX.csv -- confirm against the preprocessing code.
featureDict = {feature:i for i,feature in enumerate(features)}


In [10]:
def get_feature_vector(email_text, featureDict):
    '''
    Build the 1 x len(featureDict) count vector for one email.

    email_text is split on whitespace; each token found in featureDict
    contributes its occurrence count to the column featureDict[token].
    Tokens that are not in featureDict are ignored.
    '''
    vector = np.zeros((1, len(featureDict)), dtype=float)
    token_counts = Counter(email_text.split())
    for token in token_counts:
        if token not in featureDict:
            continue
        vector[0, featureDict[token]] = token_counts[token]
    return vector


In [11]:
def regularize_vectors(featurevector):
    '''
    Scale the first row of featurevector so its entries become relative
    frequencies.

    Input:
      featurevector: 2-D array whose first row holds the per-feature counts
      of a single email
    Returns:
      featurevector: the same array object, with row 0 divided in place by
      the sum of that row (plus 1e-5, which avoids division by zero for an
      all-zero row)
    '''
    counts = featurevector[0, :]
    scale = 1 / (np.sum(counts, axis=0) + 1e-5)
    featurevector[0, :] = np.multiply(counts, scale)
    return featurevector


## Enter your username and password
If your email address is a.b@gmail.com, enter only `a.b`.

Before running the cell below, you need to enable IMAP in your Gmail settings, as described [HERE](https://support.google.com/mail/answer/7126229?hl=en&visit_id=1-636175756291815919-2042916920&rd=1).

The first time I tried, Gmail rejected the login and sent a message to my inbox reporting the blocked sign-in and asking whether to allow it.

You then just need to allow "Less secure apps" from [HERE](https://www.google.com/settings/security/lesssecureapps?rfn=27&rfnc=1&et=0&asae=2&anexp=ire-f3).

![Gmail](picture.png)

In [59]:
# Prompt for Gmail credentials and open an authenticated IMAP-over-SSL session.
import email, getpass, imaplib
# Only the part before "@gmail.com" is entered (the success message appends the domain).
user = raw_input("Enter your GMail username --> ")
# Read the password via getpass rather than hardcoding it in the notebook.
pwd = getpass.getpass("Enter your password --> ")
# `m` is the live IMAP connection reused by the classification cells below.
m = imaplib.IMAP4_SSL("imap.gmail.com")
m.login(user, pwd)
print('OK, %s@gmail.com  authenticated (Success)' % user)

Enter your GMail username --> mo.ab.elkasaby
Enter your password --> ········
OK, mo.ab.elkasaby@gmail.com  authenticated (Success)


In [60]:
def get_text(mail,i):
    """Return the plain-text body of the i-th message in the selected mailbox.

    Args:
        mail: IMAP connection on which a mailbox has already been selected;
            only its ``uid`` method is used.
        i: index into the list of message UIDs returned by the search
            (negative values count from the end). An out-of-range index
            raises IndexError.

    Returns:
        The decoded payload of the last ``text/plain`` part as a ``str``,
        or an empty string when the message has no ``text/plain`` part.
    """
    # search and return uids instead
    _status, data = mail.uid('search', None, "ALL")
    message_uid = data[0].split()[i] # unique ids wrt label selected
    # fetch the email body (RFC822) for the given ID
    _status, email_data = mail.uid('fetch', message_uid, '(RFC822)')
    raw_email = email_data[0][1]

    # Parse the raw bytes directly instead of decoding them as UTF-8 first:
    # mail is not guaranteed to be valid UTF-8. Python 3 provides
    # message_from_bytes; on Python 2 the raw data is already a byte `str`.
    parse_message = getattr(email, 'message_from_bytes', email.message_from_string)
    email_message = parse_message(raw_email)

    # Default for messages without any text/plain part (e.g. HTML-only mail).
    body = ""
    # this will loop through all the available multiparts in mail
    for part in email_message.walk():
        if part.get_content_type() != "text/plain": # ignore attachments/html
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        if not isinstance(payload, str):
            # Python 3: payload is bytes; decode it with the charset the part
            # declares rather than returning the repr of a bytes object.
            charset = part.get_content_charset() or 'utf-8'
            try:
                payload = payload.decode(charset, 'replace')
            except LookupError:
                # Unknown or misspelled charset label in the message.
                payload = payload.decode('utf-8', 'replace')
        body = payload

    return str(body)

In [61]:

with tf.Session(graph=graph) as session:
    # Restore variables from disk. The checkpoint does not change between
    # emails, so it is loaded once instead of once per loop iteration.
    saver.restore(session, "./model.ckpt")
    #print("Model restored.")
    m.list() # Lists all labels in GMail
    m.select('Inbox') # Connected to 'Inbox' or '[Gmail]/Spam'
    # Classify the first 7 messages of the mailbox (indices 0..6 of the UID
    # list); get_text raises IndexError if the mailbox holds fewer.
    for i in range(0,7,1):
        email_text = get_text(m,i)
        print(email_text[:9])
        email_test = get_feature_vector(email_text,featureDict)
        email_test = regularize_vectors(email_test)
        # Do some work with the model
        feed_dict={tf_mail:email_test}
        emailpred = session.run(prediction_mail,feed_dict=feed_dict)
        #check on the first column of the single row
        print(emailpred[0])
        if emailpred[0][0] > emailpred[0][1] :
            print("Spam")
        else:
            print("Not Spam")

eng.vultu
[  1.00000000e+00   5.35245217e-23]
Spam
�� ����� 
[ 0.  1.]
Not Spam
�� ����� 
[ 0.  1.]
Not Spam
شاهد 
[  1.34646243e-07   9.99999762e-01]
Not Spam
������ ��
[ 0.  1.]
Not Spam
Add Moham
[  1.21050980e-05   9.99984980e-01]
Not Spam
mohamed� 
[ 0.  1.]
Not Spam


In [67]:
m.list() # Lists all labels in GMail
m.select('[Gmail]/Spam') # Connected to 'Inbox' or '[Gmail]/Spam'
# Open one session and restore the checkpoint once, instead of creating a
# new session and reloading the weights for every email.
with tf.Session(graph=graph) as session:
    # Restore variables from disk.
    saver.restore(session, "./model.ckpt")
    #print("Model restored.")
    # Indices -1 and -2: the last two entries of the UID list for this folder.
    for i in range(-1,-3,-1):
        email_text = get_text(m,i)
        #print(email_text)
        email_test = get_feature_vector(email_text,featureDict)
        email_test = regularize_vectors(email_test)
        # Do some work with the model
        feed_dict={tf_mail:email_test}
        emailpred = session.run(prediction_mail,feed_dict=feed_dict)
        #check on the first column of the single row
        print(emailpred[0])
        if emailpred[0][0] > emailpred[0][1] :
            print("Spam")
        else:
            print("Not Spam")

[  1.00000000e+00   6.59820776e-09]
Spam
[ 1.  0.]
Spam


## Logic used
The labels use one-hot encoding: for every email exactly one column is 1 and the other is 0.

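Each prediction consists of two columns, [Spam, Not Spam]. Each column is the sigmoid output for that class, i.e. a score between 0 and 1; the two scores are computed independently, so they are not forced to sum to 1. If the first column is greater than the second, the email is classified as 'Spam'; otherwise it is 'Not Spam'.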

In [51]:
# Show the one-hot label of the first test email.
testY[0]

array([ 1.,  0.], dtype=float32)