In [1]:
!rm -rf metastore_db/
from pyspark.sql import SQLContext, Row
sqlCtx = SQLContext(sc)

In [2]:
def tokenize(s):
    import re
    stopwords = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])
    word_regex = '^[a-z][a-z\'-]+[a-z]$'
    s = s.lower()
    arr = s.split()
    terms = []
    for term in arr:
        if re.match(word_regex, term) != None and len(term) > 3 and term not in stopwords:
            terms.append(term)
    return terms


In [3]:
test_strings = ['the quick brown fox jumps over the brown fence.',
              'the boy paints a tall fence brown!',
              'basketball players are tall.',
              'quick basketball players jump high']

In [4]:
tokens = sc.parallelize(test_strings).map(tokenize)

In [5]:
tokens.collect()

[['quick', 'brown', 'jumps', 'brown'],
 ['paints', 'tall', 'fence'],
 ['basketball', 'players'],
 ['quick', 'basketball', 'players', 'jump', 'high']]

In [6]:
vocab = tokens.flatMap(lambda words: words).distinct()
vocab.collect()

['quick',
 'fence',
 'players',
 'jump',
 'high',
 'jumps',
 'paints',
 'basketball',
 'tall',
 'brown']

In [7]:
from collections import Counter
import numpy as np

#sc.broadcast shares an immutable object throughout the cluster
broadcastVocab = sc.broadcast(vocab.collect())

def bow_vectorize(tokens):
    word_counts = Counter(tokens)
    vector = [word_counts[v] if v in word_counts else 0 for v in broadcastVocab.value]
    return np.array(vector)

In [8]:
# create a Bag of Words representation of our document collection
#TODO: MORE EFFICIENT IS TO USE SPARSE REPRESENTATION
bow = tokens.map(bow_vectorize).cache()
bow.collect()

[array([1, 0, 0, 0, 0, 1, 0, 0, 0, 2]),
 array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0]),
 array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0]),
 array([1, 0, 1, 1, 1, 0, 0, 1, 0, 0])]

In [9]:
broadcastVocab.value

['quick',
 'fence',
 'players',
 'jump',
 'high',
 'jumps',
 'paints',
 'basketball',
 'tall',
 'brown']

In [10]:
# TF is the number of occurences of each term in each document
term_freq = tokens.map(lambda terms: Counter(terms))
print(term_freq.collect())

[Counter({'brown': 2, 'quick': 1, 'jumps': 1}), Counter({'fence': 1, 'paints': 1, 'tall': 1}), Counter({'players': 1, 'basketball': 1}), Counter({'quick': 1, 'players': 1, 'high': 1, 'basketball': 1, 'jump': 1})]


In [11]:
# DF counts the number of documents in which each term occurs
doc_freq = term_freq.flatMap(lambda counts: counts.keys()).map(lambda keys: (keys, 1)).reduceByKey(lambda a, b: a + b)
doc_freq.collect()

[('quick', 2),
 ('fence', 1),
 ('players', 2),
 ('jump', 1),
 ('high', 1),
 ('jumps', 1),
 ('paints', 1),
 ('basketball', 2),
 ('tall', 1),
 ('brown', 1)]

In [12]:
total_docs = term_freq.count()
total_docs

4

In [13]:
# IDF (inverse document frequency) is the importance of each term in the corpus
# words that are in every document aren't useful; words that are rare are more interesting

# IDF(term) = log(number of docs / (1 + number of docs with term))

import math

idf = doc_freq.map(lambda tup: (tup[0], math.log(float(total_docs)/ (1 + tup[1])))).collect()
idf

[('quick', 0.28768207245178085),
 ('fence', 0.6931471805599453),
 ('players', 0.28768207245178085),
 ('jump', 0.6931471805599453),
 ('high', 0.6931471805599453),
 ('jumps', 0.6931471805599453),
 ('paints', 0.6931471805599453),
 ('basketball', 0.28768207245178085),
 ('tall', 0.6931471805599453),
 ('brown', 0.6931471805599453)]

In [14]:
broadcast_idf = sc.broadcast(idf)

In [15]:
# the TFIDF weight of a term t in a doc d is the product of the frequency of the term in the document,
# and the inverse document frequency (idf) of the term in the corpus

# tfidf(d, t) = tf(d, t) * idf(t)

def tfidf_vectorize(tokens):
    word_counts = Counter(tokens)
    doc_length = sum(word_counts.values())
    
    vector = [ word_counts.get(word[0], 0) * word[1] / float(doc_length) for word in broadcast_idf.value ]
    return np.array(vector)

In [16]:
# compute tfidf for all terms and all documents

tfidf = tokens.map(tfidf_vectorize)
tfidf.collect()

[array([ 0.07192052,  0.        ,  0.        ,  0.        ,  0.        ,
         0.1732868 ,  0.        ,  0.        ,  0.        ,  0.34657359]),
 array([ 0.        ,  0.23104906,  0.        ,  0.        ,  0.        ,
         0.        ,  0.23104906,  0.        ,  0.23104906,  0.        ]),
 array([ 0.        ,  0.        ,  0.14384104,  0.        ,  0.        ,
         0.        ,  0.        ,  0.14384104,  0.        ,  0.        ]),
 array([ 0.05753641,  0.        ,  0.05753641,  0.13862944,  0.13862944,
         0.        ,  0.        ,  0.05753641,  0.        ,  0.        ])]

In [17]:
### Now let's try to cluster these vectors

In [18]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

In [19]:
# two clusters
clusters = KMeans.train(tfidf, 2, maxIterations=10, initializationMode="random") #runs=10 was deprecated

In [20]:
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

In [21]:
WSSSE = tfidf.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

Within Set Sum of Squared Error = 0.6551087117005443


In [22]:
clusters.centers

[array([ 0.        ,  0.23104906,  0.        ,  0.        ,  0.        ,
         0.        ,  0.23104906,  0.        ,  0.23104906,  0.        ]),
 array([ 0.04315231,  0.        ,  0.06712582,  0.04620981,  0.04620981,
         0.05776227,  0.        ,  0.06712582,  0.        ,  0.11552453])]

In [23]:
# map the clusters back onto text
# do these clusters make sense?
top_n = 3
print([idf[idx][0] for idx in [np.argsort(x)[::-1][:top_n] for x in clusters.centers][0]])
print([idf[idx][0] for idx in [np.argsort(x)[::-1][:top_n] for x in clusters.centers][1]])

['tall', 'paints', 'fence']
['brown', 'basketball', 'players']


## Now repeat this with a real dataset

In [24]:
# read as dataframe 
!wget https://dsr-data.s3.amazonaws.com/enron/enron.json
email = sqlCtx.read.json('enron.json')

--2017-01-25 12:03:49--  https://dsr-data.s3.amazonaws.com/enron/enron.json
Resolving dsr-data.s3.amazonaws.com (dsr-data.s3.amazonaws.com)... 52.219.73.14
Connecting to dsr-data.s3.amazonaws.com (dsr-data.s3.amazonaws.com)|52.219.73.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15774921 (15M) [application/octet-stream]
Saving to: ‘enron.json.15’


2017-01-25 12:03:52 (6,02 MB/s) - ‘enron.json.15’ saved [15774921/15774921]



[Difference between Spark RDD's take(1) and first()](http://stackoverflow.com/questions/37495039/difference-between-spark-rdds-take1-and-first)

In [25]:
email.take(1)

[Row(_id=Row($oid='52af48b5d55148fa0c199643'), bcc=[], cc=[], ctype='text/plain; charset=us-ascii', date='2000-01-12 08:24:00-08:00', fname='1.', folder='_sent', fpath='enron_mail_20110402/maildir/lay-k/_sent/1.', mid='18133935.1075840283210.JavaMail.evans@thyme', recipients=['sherri.reinartz@enron.com'], replyto=None, sender='rosalee.fleming@enron.com', subject='Re: EXECUTIVE COMMITTEE MEETINGS - MONDAY, JANUARY 17', text='Ken will attend both meetings.\n\nRosie\n\n\n\nSherri Reinartz\n01/12/2000 03:30 PM\n\n\nTo: James M Bannantine/ENRON_DEVELOPMENT@ENRON_DEVELOPMENT, Cliff \nBaxter/HOU/ECT@ECT, Sanjay Bhatnagar/ENRON_DEVELOPMENT@ENRON_DEVELOPMENT, \nRick Buy/HOU/ECT@ECT, Richard Causey/Corp/Enron@ENRON, Diomedes \nChristodoulou/ENRON_DEVELOPMENT@ENRON_DEVELOPMENT, James V Derrick@Enron, \nAndrew S Fastow/HOU/ECT@ECT, Peggy Fowler/Enron@Gateway, Mark \nFrevert/LON/ECT@ECT, Kevin P Hannon/HOU/ECT@ECT, Ken Harrison/Enron@Gateway, \nDavid Haug/ENRON_DEVELOPMENT@ENRON_DEVELOPMENT, Joe Hi

.rdd returns the content as an pyspark.rdd of Row

In [26]:
# tokenize documents
# it would probably improve results if we included the subject line
tokenized_rdd = email.select('text').rdd \
  .map(lambda row: row.text) \
  .map(lambda text: text.replace('\n', ' ').replace('\r', ' ')) \
  .map(lambda text: tokenize(text)) \
  .filter(lambda text: len(text) > 0)

### Continue on your own

In [27]:
def tokenize(s):
    import re
    stopwords = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])
    word_regex = '^[a-z][a-z\'-]+[a-z]$'
    s = s.lower()
    arr = s.split()
    terms = []
    for term in arr:
        if re.match(word_regex, term) != None and len(term) > 3 and term not in stopwords:
            terms.append(term)
    return terms

In [28]:

tokenized_rdd.count()

5866

In [29]:
tokens1 = sc.parallelize(tokenized_rdd.collect())
tokens1.collect()

[['attend',
  'rosie',
  'sherri',
  'reinartz',
  'james',
  'cliff',
  'sanjay',
  'rick',
  'richard',
  'diomedes',
  'james',
  'andrew',
  'peggy',
  'mark',
  'kevin',
  'david',
  'stanley',
  'kurt',
  'larry',
  'steven',
  'mark',
  'kenneth',
  'rebecca',
  'mike',
  'rebecca',
  'jeffrey',
  'mark',
  'cindy',
  'kenneth',
  'jeffrey',
  'john',
  'jeff',
  'joseph',
  'greg',
  'thomas',
  'brenda',
  'marcia',
  'susan',
  'stacy',
  'beena',
  'karen',
  'sharron',
  'molly',
  'rosane',
  'stephanie',
  'bridget',
  'shelby',
  'shelby',
  'mary',
  'nicki',
  'carol',
  'dolly',
  'elaine',
  'nancy',
  'cindy',
  'sherryl',
  'mary',
  'maureen',
  'joannie',
  'rosalee',
  'vanessa',
  'marsha',
  'cathy',
  'loretta',
  'dolores',
  'karen',
  'dorothy',
  'christina',
  'lauren',
  'sherri',
  'katherine',
  'judy',
  'bobbie',
  'rodney',
  'demonica',
  'vanessa',
  'suzanne',
  'keith',
  'executive',
  'committee',
  'meetings',
  'january',
  'please',
  'res

In [30]:
vocab1 = tokens1.flatMap(lambda words: words).distinct()
vocab1.collect()

['bessemer',
 'hi-tech',
 'tools',
 'subject',
 'factset',
 'beecher',
 'donates',
 'carney',
 'roberts',
 'exploiting',
 "shouldn't",
 'fared',
 'rechargeable',
 'boston-based',
 'sits',
 'often-times',
 'newsday',
 'distinction',
 'daugenti',
 'wavered',
 'siliconindia',
 "night's",
 'straw',
 'bergsieker',
 'petrochemicals',
 "containment's",
 'afford',
 'trinity',
 'competitor',
 'jamaica',
 'accommodation',
 'alright',
 "feinberg's",
 'segue',
 'widespread',
 'supercomputer',
 'trinkets',
 'enjoyable',
 'vitamin',
 'invest',
 'quarter',
 'saves',
 'thereby',
 'duval',
 'admiral',
 'glued',
 'floated',
 'president-research',
 'rain',
 'travesty',
 'addresss',
 'michael',
 'elephants-giraffes-hippos-rhinos-zebras',
 'machine',
 'ebms',
 'ceotexas',
 'zyprexa',
 'viduals',
 'christi',
 'store',
 'yandle',
 'hillen',
 'brabham',
 'attewnding',
 'rather',
 'interpersonal',
 'sailing',
 'all-new',
 'applauding',
 'triggering',
 'twelfth',
 'wilson',
 'hardware',
 'haven',
 'wings',
 'pr

In [None]:
from collections import Counter
import numpy as np

#sc.broadcast shares an immutable object throughout the cluster
broadcastVocab1 = sc.broadcast(vocab1.collect())

def bow_vectorize1(tokens1):
    word_counts = Counter(tokens1)
    vector = [word_counts[v] if v in word_counts else 0 for v in broadcastVocab1.value]
    return np.array(vector)

In [None]:
# create a Bag of Words representation of our document collection
#TODO: MORE EFFICIENT IS TO USE SPARSE REPRESENTATION
tokens1.map(bow_vectorize1).collect()


Traceback (most recent call last):
  File "/home/ramya/anaconda3/lib/python3.5/socketserver.py", line 313, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/ramya/anaconda3/lib/python3.5/socketserver.py", line 341, in process_request
    self.finish_request(request, client_address)
  File "/home/ramya/anaconda3/lib/python3.5/socketserver.py", line 354, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/ramya/anaconda3/lib/python3.5/socketserver.py", line 681, in __init__
    self.handle()
  File "/home/ramya/spark/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/home/ramya/spark/python/pyspark/serializers.py", line 545, in read_int
    raise EOFError
EOFError
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:35616)
Traceback (most recent call last):
  File "/home/ramya/anaconda3/lib/python3.5/site-packages/IP

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 33284)
----------------------------------------


In [None]:
# TF is the number of occurences of each term in each document
term_freq = tokens.map(lambda terms: Counter(terms))
print(term_freq.collect())