In [1]:
import pyspark
from pyspark.sql import SparkSession


In [2]:

# Build a SparkSession named "MyApp" on the local[*] master, reusing an
# already-active session if one exists.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)


In [3]:
# SparkContext of the session above; the cells below call parallelize() and
# textFile() on it to create RDDs.
sc = spark.sparkContext

In [4]:
# from pyspark import SparkContext
# sc = SparkContext()

In [5]:
# Driver-side list holding the integers 0 through 8.
data = [*range(9)]

In [6]:
data


[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [7]:
rdd1 = sc.parallelize(data)

In [8]:
rdd1

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287

In [9]:
rdd1.getNumPartitions()

4

In [39]:
rdd1.persist()

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287

In [10]:
rdd1.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [13]:
def sq(x):
    """Return ``x`` raised to the second power."""
    squared = pow(x, 2)
    return squared

In [14]:
rdd2 = rdd1.map(sq)

In [16]:
rdd2.collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64]

In [17]:
rdd2.take(3)

[0, 1, 4]

In [19]:
rdd2.count()

9

In [40]:
new_data = list(range(100))
rdd_new = sc.parallelize(new_data)

In [42]:
# Persist rdd1 *before* rdd2/rdd3/rdd4 are derived from it. PySpark fuses
# consecutive Python map() stages at the moment the child RDD is created, and
# only breaks that pipeline when the parent is already marked as cached.
# Calling persist() afterwards leaves the children recomputing from rdd_new.
rdd1 = rdd_new.map(lambda x: x + 10).persist()

In [43]:
rdd2 = rdd1.map(lambda x : x*2 )

In [44]:
rdd3 = rdd1.map(lambda x:x**2)

In [46]:
rdd4 = rdd1.map(lambda x:x/2)

In [47]:
rdd1.persist()

PythonRDD[15] at RDD at PythonRDD.scala:53

In [49]:
rdd2.take(10)

[20, 22, 24, 26, 28, 30, 32, 34, 36, 38]

In [29]:
%%writefile example.txt
first line
second line
third line
fourth line

Overwriting example.txt


In [30]:
rdd_text = sc.textFile('example.txt')

In [31]:
rdd_text

example.txt MapPartitionsRDD[10] at textFile at NativeMethodAccessorImpl.java:0

In [32]:
rdd_text.first()

'first line'

In [34]:
rdd_text.take(3)

['first line', 'second line', 'third line']

In [50]:
rdd_text2 = rdd_text.map(lambda x :x.split())

In [51]:
rdd_text2

PythonRDD[18] at RDD at PythonRDD.scala:53

In [53]:
rdd_text2.collect()

[['first', 'line'], ['second', 'line'], ['third', 'line'], ['fourth', 'line']]

## Narrow Transformations vs. Wide Transformations

## Narrow transformations are those in which each input partition contributes to only one output partition.

## Wide transformations — those that specify wide dependencies — have input partitions contributing to many output partitions.
## You will often hear this referred to as a shuffle, in which Spark exchanges partitions across the cluster.

In [57]:
%%writefile example2.txt
first
second line 
the third line 
then a fourth line 


Writing example2.txt


In [60]:
rdd1 = sc.textFile('example2.txt')

In [61]:
rdd1.collect()

['first', 'second line ', 'the third line ', 'then a fourth line ']

In [62]:
rdd2 = rdd1.map(lambda l : l.split())

In [63]:
rdd2.collect()

[['first'],
 ['second', 'line'],
 ['the', 'third', 'line'],
 ['then', 'a', 'fourth', 'line']]

In [65]:
# Split every line on whitespace and flatten the pieces into a single RDD of
# words (map() with the same function keeps one list per line instead).
rdd3 = rdd1.flatMap(lambda line: line.split())

In [66]:
rdd3.collect()

['first',
 'second',
 'line',
 'the',
 'third',
 'line',
 'then',
 'a',
 'fourth',
 'line']

In [68]:
rdd = sc.parallelize(list(range(15)))

In [69]:
rdd2=rdd.filter(lambda x : x > 5)

In [70]:
rdd2.collect()

[6, 7, 8, 9, 10, 11, 12, 13, 14]

In [71]:
## WIDE OPERATION
# This cell only builds the input RDD of names; the grouping itself happens in
# the groupBy cell that follows.

rdd = sc.parallelize(['Omar', 'MAF', 'Ahmed', 'Amira', 'Mohamed', 'Omnia'])

In [72]:
rdd.collect()

['Omar', 'MAF', 'Ahmed', 'Amira', 'Mohamed', 'Omnia']

In [73]:
# Group the names by their first character.
rdd2 = rdd.groupBy(lambda name: name[0])

In [75]:
rdd2.collect()

[('O', <pyspark.resultiterable.ResultIterable at 0x7fad0f82f4f0>),
 ('M', <pyspark.resultiterable.ResultIterable at 0x7fad0f82e6e0>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x7fad0f82ee00>)]

In [76]:
lst = rdd2.collect()

In [77]:
lst[0]

('O', <pyspark.resultiterable.ResultIterable at 0x7fad0fb9fd00>)

In [79]:
# Materialise each group's iterable as a plain list so its members are visible.
[(key, list(members)) for key, members in lst]

[('O', ['Omar', 'Omnia']),
 ('M', ['MAF', 'Mohamed']),
 ('A', ['Ahmed', 'Amira'])]

In [80]:
# Key/value pairs with repeated keys; input for the groupByKey cells below.
l = [
    ('A', 10),
    ('B', 15),
    ('A', 20),
    ('B', 30),
]

In [82]:
rdd = sc.parallelize(l)

In [83]:
rdd.collect()

[('A', 10), ('B', 15), ('A', 20), ('B', 30)]

In [84]:
rdd2 = rdd.groupByKey()

In [87]:
ll = rdd2.collect()

In [88]:
# Turn each key's grouped values into a plain list for display.
[(key, list(values)) for key, values in ll]

[('A', [10, 20]), ('B', [15, 30])]