<a href="https://colab.research.google.com/github/Hyenni/BDAI-Training/blob/master/sparkLAB2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process Data Files with Apache Spark

In [0]:
# Stub code to copy into Spark Shell

import xml.etree.ElementTree as ElementTree

# Optional: set the logging level to WARN to reduce distracting INFO messages.
# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext provided by the Spark shell — confirm before running elsewhere.
sc.setLogLevel("WARN")  

# Given a string containing XML, parse the string, and 
# return an iterator of activation XML records (Elements) contained in the string

def getActivations(s):
    """Parse an XML string and return its 'activation' elements.

    Args:
        s: A string containing an XML document.

    Returns:
        A list of every Element tagged 'activation' in the document, in
        document order (the root itself is included if it carries that tag).

    Raises:
        xml.etree.ElementTree.ParseError: if `s` is not well-formed XML.
    """
    filetree = ElementTree.fromstring(s)
    # Element.getiterator() is deprecated and was removed in Python 3.9;
    # Element.iter() is its replacement and exists since Python 2.7.
    # Materialising the result keeps the list return of the old call.
    return list(filetree.iter('activation'))
    
# Given an activation record (XML Element), return the model name
def getModel(activation):
    """Return the text of the <model> child of an activation Element."""
    model_element = activation.find('model')
    return model_element.text

# Given an activation record (XML Element), return the account number 
def getAccount(activation):
    """Return the text of the <account-number> child of an activation Element."""
    account_element = activation.find('account-number')
    return account_element.text


In [0]:
# Read XML files into an RDD of (filename, file content) pairs
files = "/loudacre/activations"
activationFiles = sc.wholeTextFiles(files)

# Parse each file (as a string) into a collection of activation XML records.
# The pair is indexed rather than unpacked in the lambda signature, because
# tuple parameters are Python 2-only syntax (removed by PEP 3113).
activationRecords = activationFiles.flatMap(lambda file_pair: getActivations(file_pair[1]))


### Return activation objects

In [0]:
# Fetch the first two (filename, content) pairs. This is not the cell's last
# expression, so the notebook does not display its result.
activationFiles.take(2)
# Fetch the first two parsed activation records; this is the value the cell displays.
activationRecords.take(2)

[<Element 'activation' at 0x7f197819b210>,
 <Element 'activation' at 0x7f1978185510>]

In [0]:
# Build an RDD of "account-number:model-name" strings, one per activation record
models = activationRecords.map(
    lambda activation: getAccount(activation) + ":" + getModel(activation)
)

In [0]:
# Write the "account-number:model-name" strings out as text under this path.
# NOTE(review): Spark normally refuses to write into an existing output
# directory — confirm the path is absent before re-running this cell.
models.saveAsTextFile("/loudacre/account-models")

In [0]:
# Preview the first two "account-number:model-name" strings.
models.take(2)

['9763:MeeToo 1.0', '426:Titanic 1000']

In [0]:
# Pair each activation record's account number with its model name
models2 = activationRecords.map(
    lambda activation: (getAccount(activation), getModel(activation))
)

In [0]:
# Print the first five (account, model) pairs as "account-number:model-name".
# print is called with parentheses around a single argument, so the line
# produces the same output on Python 2 and also runs on Python 3.
for (account, model) in models2.take(5):
    print(account + ":" + model)

9763:MeeToo 1.0
426:Titanic 1000
383:Sorrento F00L
404:MeeToo 1.0
393:iFruit 1


# Bonus

In [0]:
# Optional: Set logging level to WARN to reduce distracting info messages
sc.setLogLevel("WARN")  

# Load the device status data file as an RDD of text lines.
# NOTE(review): `sc` is assumed to be the SparkContext supplied by the Spark shell — confirm.
devstatus = sc.textFile("/loudacre/devicestatus.txt")


In [0]:
# Preview the first three raw lines; the sample output shows that the field
# delimiter differs between lines (',' on one, '|' on others).
devstatus.take(3)

[u'2014-03-15:10:10:20,Sorrento F41L,8cc3b47e-bd01-4482-b500-28f2342679af,7,24,39,enabled,disabled,connected,55,67,12,33.6894754264,-117.543308253',
 u'2014-03-15:10:10:20|MeeToo 1.0|ef8c7564-0a1a-4650-a655-c8bbd5f8f943|0|31|63|70|39|27|enabled|enabled|enabled|37.4321088904|-121.485029632',
 u'2014-03-15:10:10:20|MeeToo 1.0|23eba027-b95a-4729-9a4b-a3cca51c5548|0|20|21|86|54|34|enabled|enabled|enabled|39.4378908349|-120.938978486']

In [0]:
# Keep only lines longer than 20 characters, split each line on its own
# delimiter (the 20th character of that line), and drop any record that does
# not split into exactly 14 fields
cleanstatus = (
    devstatus
    .filter(lambda raw: len(raw) > 20)
    .map(lambda raw: raw.split(raw[19:20]))
    .filter(lambda fields: len(fields) == 14)
)

# Project each record to (date, manufacturer, device ID, latitude, longitude);
# the manufacturer is the first space-separated word of the model field
devicedata = cleanstatus.map(
    lambda fields: (fields[0], fields[1].split(' ')[0], fields[2], fields[12], fields[13])
)

devicedata.take(2)

[(u'2014-03-15:10:10:20',
  u'Sorrento',
  u'8cc3b47e-bd01-4482-b500-28f2342679af',
  u'33.6894754264',
  u'-117.543308253'),
 (u'2014-03-15:10:10:20',
  u'MeeToo',
  u'ef8c7564-0a1a-4650-a655-c8bbd5f8f943',
  u'37.4321088904',
  u'-121.485029632')]

In [0]:
# Render each record tuple as one comma-delimited line and write the lines out
# as text under the ETL output path
(
    devicedata
    .map(lambda fields: ','.join(fields))
    .saveAsTextFile("/loudacre/devicestatus_etl")
)
