# ISA 414 - Managing Big Data
## Lecture 23 – Spark (Part I)

### **Note: add `%md` at the top of a cell in Databricks to create a markdown cell**

#### Auxiliary functions

In [None]:
from pyspark.sql.functions import regexp_replace, trim, col, lower

# removing punctuation; the resulting column is called 'sentence'
def removePunctuation(column):
    """Build a cleaned-up version of a text column, aliased as 'sentence'.

    The expression applies three steps, in this order:
      1. delete every run of characters matched by the pattern below, i.e.
         anything that is neither whitespace nor a word character, plus
         underscores;
      2. convert the remaining text to lower case;
      3. trim the result.

    Args:
        column: the Spark Column containing the raw text.

    Returns:
        A Column expression named 'sentence'.
    """
    # Raw string: '\s' and '\w' are regex character classes, not Python
    # escapes. In a plain literal they are invalid escape sequences, which
    # newer Python versions warn about. The pattern itself is unchanged.
    return trim(lower(regexp_replace(column, r'([^\s\w_]|_)+', ''))).alias('sentence')

#### Loading data

In [None]:
# location of the input text file; edit this path if the file lives elsewhere
fileName = "/FileStore/tables/mobydick.txt"

# read.text() yields a data frame with a single column called 'value';
# that column is passed through removePunctuation() and selected as the result
mobyDickDF = (
    sqlContext.read
    .text(fileName)
    .select(removePunctuation(col('value')))
)

# display the first 15 rows in full (no truncation of long values)
mobyDickDF.show(15, truncate=False)

#### Retrieving Words from Lines

In [None]:
from pyspark.sql.functions import split, explode

# splitting each sentence on runs of whitespace; the resulting column is called 'split'
# (raw string: '\s' is a regex class, and in a plain literal it is an invalid
# Python escape sequence that newer interpreters warn about)
bookWordsSplitDF = (mobyDickDF.select(split(mobyDickDF.sentence, r'\s+').alias('split')))

# explode() takes a column of lists and creates one row for each list element;
# the resulting column is called 'word'
wordsSingleDF = (bookWordsSplitDF.select(explode(bookWordsSplitDF.split).alias('word')))

# remove 'empty' words (rows whose word is the empty string)
bookWordsDF = wordsSingleDF.where(wordsSingleDF.word != '')

# print the first 15 words
bookWordsDF.show(15)

#### Counting Words

In [None]:
# one row per distinct word, with the number of occurrences in a 'count' column
WordsAndCountsDF = bookWordsDF.groupBy('word').count()

# most frequent words first (descending by count), shown with show()'s default row limit
(
    WordsAndCountsDF
    .orderBy("count", ascending=False)
    .show()
)