# HackOnData.com
### Instructions
Perform the following operations to the text:

   - Remove punctuation, convert text to lower case, and strip leading and trailing spaces. Then:
     - Find the top 10 most popular words
     - Find the top 10 most popular characters
   - In addition to removing punctuation (see previous question), remove the words that are included in the stop-word list at http://tacit.usc.edu/resources/stopwords_eng.txt. Then:
     - Find the top 10 most popular words after removing the words above
     - Find the words and counts for strings starting with "dra". Does it give you an idea of other sorts of text preprocessing that you can perform?
   - Find the most common words used before the word "dragon". Example: in the phrase "the red dragon", "red" is the word before "dragon". Make sure to exclude the list of words of the previous question.

In [2]:
# S3 connection settings. The three slots are substituted, in this order, into
# the 's3n://<key>:<pwd>@<bucket>/<file>' URI used to load the resources.
CONNECTION = (
  '*',                                         # AWS_KEY
  '*',                                         # AWS_PWD
  '*',                                         # BUCKET
)
# Names of the input files to load.
RESOURCES = (
  'Dragons+and+Dragon+Lore.txt',               # RAW_CONTENT_FILENAME
  'stopwords_eng.txt',                         # STOPWORDS
)

# Prefix used to select words by their leading characters.
PATTERN = 'dra'
# Matches a run of word characters that is immediately followed by a single
# whitespace character and the text "dragon"; the lookahead keeps "dragon"
# itself out of the match.  Written as a raw string so that \w and \s reach
# the regex engine verbatim instead of relying on Python passing unknown
# string escapes through unchanged (the resulting value is identical).
REGEX = r'[\w]+(?=\sdragon)'

# Number of rows requested by each "top N" report.
WATERFALL_LIMIT = 10
# Load every entry of RESOURCES from S3 as an RDD of text lines (minPartitions=4).
# The unpacking order follows RESOURCES: raw text first, stop words second.
# NOTE(review): the key and password from CONNECTION are embedded in the URI.
raw_text, raw_stopwords = (
  sc.textFile('s3n://%s:%s@%s/%s' % (CONNECTION + (resource,)), minPartitions=4)
  for resource in RESOURCES)

In [3]:
import operator
import re
import string

# Declarations
# Characters treated as punctuation and dropped from every line during cleaning.
punctuation = string.punctuation # the characters: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

def _char_count(tpl):
  """
  Expand one (word, frequency) pair into weighted per-character counts.

  Input:  tuple of (word, frequency of that word in the text)
  Output: one (character, count) pair per distinct character of the word,
          where count is the number of times the character occurs in the
          word multiplied by the word's frequency.
  """
  token, occurrences = tpl
  per_char = {}
  for ch in token:
    if ch in per_char:
      per_char[ch] += 1
    else:
      per_char[ch] = 1
  # Scale the in-word counts by how often the whole word was seen.
  return [(ch, hits * occurrences) for ch, hits in per_char.items()]

def _stats(data, *args):
  """
  Render ranked pairs as a tab-separated text table and return it as a string.

  Not a general-purpose formatter: exactly three column titles must be
  passed in args (the header format string has three slots), and every
  element of data must expose its label at index 0 and its value at index 1.
  Rows are numbered from 1 in iteration order.
  """
  header = '|\t%s\t|\t%s\t|\t%s\t|' % args
  separator = '|%s|' % ('-'*55)
  rows = [
    '|\t%s\t\t%s\t\t%s\t\t|' % (rank, item[0], item[1])
    for rank, item in enumerate(data, 1)]
  # The rows are joined on their own first, so an empty data set still
  # produces a newline after the separator line.
  return '\n'.join((header, separator, '\n'.join(rows)))

def pattern_search(line):
  """
  Return a tuple of (match, 1) pairs, one for every case-insensitive
  occurrence of REGEX in line; the tuple is empty when nothing matches.
  """
  return tuple((hit, 1) for hit in re.findall(REGEX, line, flags=re.IGNORECASE))

In [4]:
# Normalise the raw lines: drop punctuation, trim, discard blanks, lowercase.
orig_text = (
  raw_text
  .map(lambda line: ''.join(ch for ch in line if ch not in punctuation))
  .map(lambda line: line.strip() or None)      # blank lines become None ...
  .filter(None)                                # ... and are removed here
  .map(lambda line: line.lower()))

# One element per whitespace-separated word -- this RDD is reused later on.
split_by_words = orig_text.flatMap(lambda line: line.split())


def _reducer(rdd):
  """Reusable transformation: turn an RDD of items into (item, count) pairs."""
  return rdd.map(lambda item: (item, 1)).reduceByKey(operator.add)


word_tuples = _reducer(split_by_words)

# Words ordered from most to least frequent.
words_by_frequency = word_tuples.sortBy(lambda pair: pair[1], ascending=False)

# Expand every (word, count) pair into weighted character counts, total them
# per character, and order the characters from most to least frequent.
characters_by_frequency = (
  words_by_frequency
  .flatMap(_char_count)
  .reduceByKey(operator.add)
  .sortBy(lambda pair: pair[1], ascending=False))

In [5]:

# Each print call receives exactly one argument, so the printed text is the
# same under Python 2 (parenthesised expression) and Python 3 (function call).

# 1. most frequent words
print('| Most frequent words %s' % ('-'*35))
print(_stats(
  words_by_frequency.take(WATERFALL_LIMIT),
  'POS', 'WORD', 'FREQUENCY'))


# 2. most frequent characters
print('\n| Most frequent characters: %s' % ('-'*29))
print(_stats(
  characters_by_frequency.take(WATERFALL_LIMIT),
  'POS', 'CHAR', 'FREQUENCY'))

In [6]:
# Remove every word that also appears as a line of the stop-word RDD, then
# count what is left and order it from most to least frequent.
# NOTE(review): stop words are compared verbatim -- presumably the file holds
# one lowercase word per line with no surrounding whitespace; verify.
cleaned_subset = split_by_words.subtract(raw_stopwords)
cleaned_subset_by_frequency = _reducer(cleaned_subset).sortBy(lambda tpl: tpl[1], False)

# 3. most frequent words without stopwords
# Single-argument print calls give the same output on Python 2 and Python 3.
print('| Most frequent words * NO STOPWORDS * %s' % ('-'*18))
print(_stats(
  cleaned_subset_by_frequency.take(WATERFALL_LIMIT),
  'POS', 'WORD', 'FREQUENCY'))

In [7]:
# 4. words starting with PATTERN (by frequency), taken from the
# stop-word-free counts and shown as a two-column table.
dras = cleaned_subset_by_frequency.filter(
  lambda word_count: word_count[0].startswith(PATTERN))
display(dras.toDF(schema=['Word', 'Frequency']))

In [8]:
# Stop words are collected to the driver as a set so the lambda below can
# test membership against it.
stops = set(raw_stopwords.collect())
# Every punctuation character except the full stop, which is kept so that it
# can serve as the sentence delimiter further down.
punctuation_minus_stop = ''.join(set(string.punctuation) - set('.'))

c_text = (
  raw_text
  .map(lambda line: ''.join(ch for ch in line if ch not in punctuation_minus_stop))  # drop all punctuation except full stops
  .map(lambda line: line.lower())
  .flatMap(lambda line: line.split('.'))                         # one element per '.'-separated fragment
  .map(lambda chunk: chunk.strip() or None)                      # trim; blank fragments become None
  .filter(None)                                                  # remove empty fragments
  .map(lambda chunk: ' '.join(word for word in chunk.split() if word not in stops)))  # drop stop words

# 5. Find the most frequent words before "dragon"
word_before_dragons = (
  c_text
  .flatMap(pattern_search)                     # (word, 1) for every match in a fragment
  .filter(None)                                # keeps truthy elements only
  .reduceByKey(operator.add)                   # total the matches per word
  .sortBy(lambda pair: pair[1], ascending=False))

display(word_before_dragons.toDF(schema=['Word', 'Frequency']))