In [0]:
import os
import sys
# Point both PySpark interpreter variables at the Python running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [0]:
from pyspark.sql import SparkSession
# Obtain the session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("Map and FlatMap")
    .getOrCreate()
)

#### Operations over RDD

##### Create RDD

In [0]:
# Sample lines of text fed to the RDD examples.
data = [
    "Project Gutenberg’s",
    "Alice’s Adventures",
    "in Wonderland",
    "Project Gutenberg’s Adventures",
    "in Wonderland",
    "Project Gutenberg’s",
]

In [0]:
# Turn the in-memory list into an RDD.
rdd = spark.sparkContext.parallelize(data)

#### map()

In [0]:
# map() emits exactly one output element per input element; here each line
# is paired with its upper-cased form.
rdd2 = rdd.map(lambda line: (line, line.upper()))
for pair in rdd2.collect():
    print(pair)

('Project Gutenberg’s', 'PROJECT GUTENBERG’S')
('Alice’s Adventures', 'ALICE’S ADVENTURES')
('in Wonderland', 'IN WONDERLAND')
('Project Gutenberg’s Adventures', 'PROJECT GUTENBERG’S ADVENTURES')
('in Wonderland', 'IN WONDERLAND')
('Project Gutenberg’s', 'PROJECT GUTENBERG’S')


In [0]:
# Same one-to-one mapping, keeping only the upper-cased line.
rdd2 = rdd.map(lambda line: line.upper())
for upper_line in rdd2.collect():
    print(upper_line)

PROJECT GUTENBERG’S
ALICE’S ADVENTURES
IN WONDERLAND
PROJECT GUTENBERG’S ADVENTURES
IN WONDERLAND
PROJECT GUTENBERG’S


#### flatMap()

In [0]:
# flatMap() flattens the per-line word lists, so the result is one element
# per word rather than one list per line.
rdd2 = rdd.flatMap(lambda line: line.split(" "))
for word in rdd2.collect():
    print(word)

Project
Gutenberg’s
Alice’s
Adventures
in
Wonderland
Project
Gutenberg’s
Adventures
in
Wonderland
Project
Gutenberg’s


#### Operations over DataFrame

In [0]:
# Small employee sample; each tuple lines up with `columns`.
columns = ["First_name", "Last_name", "gender", "Salary"]
data = [
    ("James", "Smith", "M", 30),
    ("Anna", "Jones", "F", 41),
    ("Robert", "Williams", "M", 60),
]

df = spark.createDataFrame(data, schema=columns)
df.printSchema()

root
 |-- First_name: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: long (nullable = true)



##### map()

In [0]:
# Map over the DataFrame's rows by position: join the two name fields with a
# space, keep gender, and double the salary.
rdd2 = df.rdd.map(
    lambda row: (row[0] + " " + row[1], row[2], row[3] * 2)
)
df2 = rdd2.toDF(["full_name", "gender", "salary"])
df2.show(truncate=False)

+---------------+------+------+
|full_name      |gender|salary|
+---------------+------+------+
|James Smith    |M     |60    |
|Anna Jones     |F     |82    |
|Robert Williams|M     |120   |
+---------------+------+------+



In [0]:
# A DataFrame exposes its rows through the `rdd` attribute. `rdd2` is neither
# a DataFrame attribute nor one of its columns, so `df.rdd2` fails.
rdd3 = df.rdd

In [0]:
# Each collected row can be indexed by column name.
for row in rdd3.collect():
    print(row["First_name"])

James
Anna
Robert


In [0]:
def funct(x):
    """Turn one employee row into a ``(name, gender, sal)`` tuple.

    Args:
        x: Object exposing ``First_name``, ``Last_name``, ``gender`` and
            ``Salary`` attributes.

    Returns:
        A 3-tuple of: first and last name concatenated with no separator,
        the gender lower-cased, and the salary multiplied by two.
    """
    return (
        x.First_name + x.Last_name,
        x.gender.lower(),
        x.Salary * 2,
    )

In [0]:
# Apply funct to every row; the function can be passed directly.
rdd2 = df.rdd.map(funct)

In [0]:
# Name the tuple fields to get a DataFrame back from the mapped RDD.
df2 = rdd2.toDF(["full_name", "gender", "sal"])
df2.show()

+--------------+------+---+
|     full_name|gender|sal|
+--------------+------+---+
|    JamesSmith|     m| 60|
|     AnnaJones|     f| 82|
|RobertWilliams|     m|120|
+--------------+------+---+



In [0]:
# Display the mapped tuples as a plain Python list.
rdd2.collect()

Out[17]: [('JamesSmith', 'm', 60), ('AnnaJones', 'f', 82), ('RobertWilliams', 'm', 120)]

##### flatMap() equivalent: explode()

In [0]:
# Rows with an array column and a map column, including None and empty
# values in both.
arrayData = [
    ("James", ["Java", "Scala"], {"hair": "black", "eye": "brown"}),
    ("Michael", ["Spark", "Java", None], {"hair": "brown", "eye": None}),
    ("Robert", ["CSharp", ""], {"hair": "red", "eye": ""}),
    ("Washington", None, None),
    ("Jefferson", ["1", "2"], {}),
]
df = spark.createDataFrame(
    data=arrayData,
    schema=["name", "knownLanguages", "properties"],
)

In [0]:
from pyspark.sql.functions import explode
# Exploding the map column yields one (key, value) row per map entry.
df2 = df.select(df["name"], explode(df["properties"]))
# Print the schema of the source frame (df, not df2).
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [0]:
# Source rows, untruncated so the array and map columns are shown in full.
df.show(truncate=False)
# Exploded view: one row per map entry; rows whose map is null or empty
# do not appear.
df2.show()

+----------+-------------------+-----------------------------+
|name      |knownLanguages     |properties                   |
+----------+-------------------+-----------------------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|
|Michael   |[Spark, Java, null]|{eye -> null, hair -> brown} |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |
|Washington|null               |null                         |
|Jefferson |[1, 2]             |{}                           |
+----------+-------------------+-----------------------------+

+-------+----+-----+
|   name| key|value|
+-------+----+-----+
|  James| eye|brown|
|  James|hair|black|
|Michael| eye| null|
|Michael|hair|brown|
| Robert| eye|     |
| Robert|hair|  red|
+-------+----+-----+

