For help, look here:
https://spark.apache.org/docs/latest/rdd-programming-guide.html

In [0]:
# Check out the pre-loaded dataset
display(dbutils.fs.ls('dbfs:/databricks-datasets/'))

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,0


## 1. Word Count

In [0]:
# Create a rdd (sc = SparkContext)
rdd = spark.read.text("dbfs:/databricks-datasets/SPARK_README.md")

In [0]:
# Read 20 lines 
rdd.take(20)

Out[3]: [Row(value='# Apache Spark'),
 Row(value=''),
 Row(value='Spark is a fast and general cluster computing system for Big Data. It provides'),
 Row(value='high-level APIs in Scala, Java, Python, and R, and an optimized engine that'),
 Row(value='supports general computation graphs for data analysis. It also supports a'),
 Row(value='rich set of higher-level tools including Spark SQL for SQL and DataFrames,'),
 Row(value='MLlib for machine learning, GraphX for graph processing,'),
 Row(value='and Spark Streaming for stream processing.'),
 Row(value=''),
 Row(value='<http://spark.apache.org/>'),
 Row(value=''),
 Row(value=''),
 Row(value='## Online Documentation'),
 Row(value=''),
 Row(value='You can find the latest Spark documentation, including a programming'),
 Row(value='guide, on the [project web page](http://spark.apache.org/documentation.html)'),
 Row(value='and [project wiki](https://cwiki.apache.org/confluence/display/SPARK).'),
 Row(value='This README file only contains ba

In [0]:
%python
words = rdd.selectExpr("explode(split(value, ' ')) as word")

for w in words.collect():
  print(w)

Row(word='#')
Row(word='Apache')
Row(word='Spark')
Row(word='')
Row(word='Spark')
Row(word='is')
Row(word='a')
Row(word='fast')
Row(word='and')
Row(word='general')
Row(word='cluster')
Row(word='computing')
Row(word='system')
Row(word='for')
Row(word='Big')
Row(word='Data.')
Row(word='It')
Row(word='provides')
Row(word='high-level')
Row(word='APIs')
Row(word='in')
Row(word='Scala,')
Row(word='Java,')
Row(word='Python,')
Row(word='and')
Row(word='R,')
Row(word='and')
Row(word='an')
Row(word='optimized')
Row(word='engine')
Row(word='that')
Row(word='supports')
Row(word='general')
Row(word='computation')
Row(word='graphs')
Row(word='for')
Row(word='data')
Row(word='analysis.')
Row(word='It')
Row(word='also')
Row(word='supports')
Row(word='a')
Row(word='rich')
Row(word='set')
Row(word='of')
Row(word='higher-level')
Row(word='tools')
Row(word='including')
Row(word='Spark')
Row(word='SQL')
Row(word='for')
Row(word='SQL')
Row(word='and')
Row(word='DataFrames,')
Row(word='MLlib')
Row(word='for'

In [0]:
from pyspark.sql.functions import split, explode, col

words_df = rdd.select(explode(split(col("value"), " ")).alias("word"))

for w in words_df.collect():
    print(w['word'])

#
Apache
Spark

Spark
is
a
fast
and
general
cluster
computing
system
for
Big
Data.
It
provides
high-level
APIs
in
Scala,
Java,
Python,
and
R,
and
an
optimized
engine
that
supports
general
computation
graphs
for
data
analysis.
It
also
supports
a
rich
set
of
higher-level
tools
including
Spark
SQL
for
SQL
and
DataFrames,
MLlib
for
machine
learning,
GraphX
for
graph
processing,
and
Spark
Streaming
for
stream
processing.

<http://spark.apache.org/>


##
Online
Documentation

You
can
find
the
latest
Spark
documentation,
including
a
programming
guide,
on
the
[project
web
page](http://spark.apache.org/documentation.html)
and
[project
wiki](https://cwiki.apache.org/confluence/display/SPARK).
This
README
file
only
contains
basic
setup
instructions.

##
Building
Spark

Spark
is
built
using
[Apache
Maven](http://maven.apache.org/).
To
build
Spark
and
its
example
programs,
run:





build/mvn
-DskipTests
clean
package

(You
do
not
need
to
do
this
if
you
downloaded
a
pre-built
package.)
More
detaile

In [0]:
# 2. change all capital letters to lower case
from pyspark.sql.functions import explode, split, col, lower
words_df = rdd.select(split(lower(col("value")), ' ').alias("word"))

for w in words_df.collect():
    print(w['word'])

['#', 'apache', 'spark']
['']
['spark', 'is', 'a', 'fast', 'and', 'general', 'cluster', 'computing', 'system', 'for', 'big', 'data.', 'it', 'provides']
['high-level', 'apis', 'in', 'scala,', 'java,', 'python,', 'and', 'r,', 'and', 'an', 'optimized', 'engine', 'that']
['supports', 'general', 'computation', 'graphs', 'for', 'data', 'analysis.', 'it', 'also', 'supports', 'a']
['rich', 'set', 'of', 'higher-level', 'tools', 'including', 'spark', 'sql', 'for', 'sql', 'and', 'dataframes,']
['mllib', 'for', 'machine', 'learning,', 'graphx', 'for', 'graph', 'processing,']
['and', 'spark', 'streaming', 'for', 'stream', 'processing.']
['']
['<http://spark.apache.org/>']
['']
['']
['##', 'online', 'documentation']
['']
['you', 'can', 'find', 'the', 'latest', 'spark', 'documentation,', 'including', 'a', 'programming']
['guide,', 'on', 'the', '[project', 'web', 'page](http://spark.apache.org/documentation.html)']
['and', '[project', 'wiki](https://cwiki.apache.org/confluence/display/spark).']
['this

In [0]:
# 3. eliminate stopwords 
words_df = rdd.select(explode(split(lower(col("value")), ' ')).alias("word"))

# 3. eliminate stopwords
stop_words = ['and', 'to', 'in', 'at', 'the', 'an']
filtered_words_df = words_df.filter(~col("word").isin(stop_words))

for w in filtered_words_df.collect():
    print(w['word'])
stop_words = ['and', 'to', 'in', 'at', 'the', 'an']

#
apache
spark

spark
is
a
fast
general
cluster
computing
system
for
big
data.
it
provides
high-level
apis
scala,
java,
python,
r,
optimized
engine
that
supports
general
computation
graphs
for
data
analysis.
it
also
supports
a
rich
set
of
higher-level
tools
including
spark
sql
for
sql
dataframes,
mllib
for
machine
learning,
graphx
for
graph
processing,
spark
streaming
for
stream
processing.

<http://spark.apache.org/>


##
online
documentation

you
can
find
latest
spark
documentation,
including
a
programming
guide,
on
[project
web
page](http://spark.apache.org/documentation.html)
[project
wiki](https://cwiki.apache.org/confluence/display/spark).
this
readme
file
only
contains
basic
setup
instructions.

##
building
spark

spark
is
built
using
[apache
maven](http://maven.apache.org/).
build
spark
its
example
programs,
run:





build/mvn
-dskiptests
clean
package

(you
do
not
need
do
this
if
you
downloaded
a
pre-built
package.)
more
detailed
documentation
is
available
from
project
site,


In [0]:
# 4. sort in alphabetical order
sorted_words_df = filtered_words_df.orderBy("word")

for w in sorted_words_df.collect():
    print(w['word'])




































































"local"
"local[n]"
"yarn"
#
##
##
##
##
##
##
##
##
(you
-dskiptests
./bin/pyspark
./bin/run-example
./bin/run-example
./bin/spark-shell
./dev/run-tests
1000).count()
1000:
1000:
<class>
<http://spark.apache.org/>
>>>
["building
["specifying
[apache
[building
[configuration
[params]`.
[project
[project
[run
`./bin/run-example
`examples`
`examples`
a
a
a
a
a
a
a
a
a
abbreviated
about
against
also
also
also
also
alternatively,
analysis.
apache
apis
are
available
basic
be
be
because
big
build
build
build
build/mvn
building
building
building
built
built,
can
can
can
can
can
can
changed
class
class
clean
cluster
cluster
cluster.
comes
command,
command,
computation
computing
configuration
configure
contains
core
data
data.
dataframes,
detailed
detailed
different
directory.
distribution
distributions.
do
do
documentation
documentation
documentation
documentation
documentation,
downloaded
easiest
engine
environment
example
exam

In [0]:
# 5. sort from most to least frequent word
word_counts_df = filtered_words_df.groupBy("word").count()

# Sort from most to least frequent word
sorted_word_counts_df = word_counts_df.orderBy(("count"))

for w in sorted_word_counts_df.collect():
    print(w['word'], w['count'])

graphs 1
pi 1
abbreviated 1
overview 1
rich 1
url, 1
name 1
stream 1
run: 1
not 1
guide](http://spark.apache.org/docs/latest/configuration.html) 1
./dev/run-tests 1
will 1
[run 1
because 1
must 1
master=spark://host:7077 1
variable 1
core 1
graphx 1
more 1
[configuration 1
protocols 1
java, 1
site, 1
systems. 1
[building 1
configure 1
alternatively, 1
system 1
provides 1
pre-built 1
directory. 1
apis 1
data. 1
wiki](https://cwiki.apache.org/confluence/display/spark). 1
library 1
contains 1
programming 1
downloaded 1
1000).count() 1
comes 1
machine 1
params 1
n 1
given. 1
same 1
page](http://spark.apache.org/documentation.html) 1
using: 1
fast 1
streaming 1
your 1
optimized 1
graph 1
package 1
master 1
project 1
other 1
learning, 1
when 1
submit 1
version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version) 1
scala> 1
print 1
testing 1
different 1
spark. 1
configuration 1
apache 1
data 1
-dskiptests 1
processing. 1
maven](http://maven.apache.org/). 1


In [0]:
from pyspark.sql.functions import regexp_replace, col

# 6.** remove punctuations 
cleaned_words_df = filtered_words_df.withColumn("word", regexp_replace(col("word"), "[^\w\s]", ""))

for w in cleaned_words_df.collect():
    print(w['word'])


apache
spark

spark
is
a
fast
general
cluster
computing
system
for
big
data
it
provides
highlevel
apis
scala
java
python
r
optimized
engine
that
supports
general
computation
graphs
for
data
analysis
it
also
supports
a
rich
set
of
higherlevel
tools
including
spark
sql
for
sql
dataframes
mllib
for
machine
learning
graphx
for
graph
processing
spark
streaming
for
stream
processing

httpsparkapacheorg



online
documentation

you
can
find
latest
spark
documentation
including
a
programming
guide
on
project
web
pagehttpsparkapacheorgdocumentationhtml
project
wikihttpscwikiapacheorgconfluencedisplayspark
this
readme
file
only
contains
basic
setup
instructions


building
spark

spark
is
built
using
apache
mavenhttpmavenapacheorg
build
spark
its
example
programs
run





buildmvn
dskiptests
clean
package

you
do
not
need
do
this
if
you
downloaded
a
prebuilt
package
more
detailed
documentation
is
available
from
project
site
building
sparkhttpsparkapacheorgdocslatestbuildingsparkhtml


interactiv

## 2. What does it do?

In [0]:
%python
# Create a DataFrame from the data
data = [("Denny", 31), ("Jules", 30), ("TD", 35), ("Brooke", 25)]
columns = ["name", "age"]
dataDF = spark.createDataFrame(data, columns)

# Calculate the average age
from pyspark.sql import functions as F

agesDF = dataDF.groupBy("name").agg(F.avg("age").alias("average_age"))

display(agesDF)

name,average_age
Denny,31.0
Jules,30.0
TD,35.0
Brooke,25.0
