In [1]:
import pyspark


In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running locally on all available cores
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .master("local[*]")
    .getOrCreate()
)

# Low-level entry point used by the RDD examples in the following cells
sc = spark.sparkContext

In [3]:
# Distribute the integers 1 through 7 as an RDD
rdd = sc.parallelize(list(range(1, 8)))

In [4]:
# Cube every element
rdd2 = rdd.map(lambda n: n ** 3)

In [5]:
rdd2.take(3)

[1, 8, 27]

In [6]:
%%writefile exmaple.txt
first line
second line
third line
fourth line


Overwriting exmaple.txt


In [33]:
# Load the text file written by the %%writefile cell as an RDD with one element per line
# (the file name's spelling intentionally matches that cell)
rdd_file = sc.textFile('exmaple.txt')

In [34]:
rdd_file.count()

4

In [35]:
rdd_file.collect()

['first line', 'second line', 'third line', 'fourth line']

In [36]:
# Keep only the lines that contain the word 'second'
rdd_file_2 = rdd_file.filter(lambda text: 'second' in text)

In [37]:
rdd_file_2.collect()

['second line']

In [38]:
# Rebind rdd to a fresh RDD holding the integers 1 through 7
rdd = sc.parallelize(list(range(1, 8)))

In [39]:
# Square every element
rdd2 = rdd.map(lambda n: n * n)

In [40]:
rdd2.collect()

[1, 4, 9, 16, 25, 36, 49]

In [41]:
# Cube each squared value; this rdd3 is never collected in the visible cells
# before the name is reassigned further down
rdd3 = rdd2.map(lambda x : x**3)

In [42]:
# reduce is an action: it returns a plain Python value (here the sum of the squares),
# so despite its name rdd4 is a number, not an RDD
rdd4 = rdd2.reduce(lambda a,b : a +b)

In [43]:
rdd4

140

In [44]:
# Keep only the squares strictly greater than 5
rdd5 = rdd2.filter(lambda n: 5 < n)

In [45]:
rdd5.collect()

[9, 16, 25, 36, 49]

In [46]:
# Integers 0..99 kept as a plain Python list, then distributed as an RDD
new_data = [*range(100)]
rdd_new = sc.parallelize(new_data)

In [47]:
# Square every element of rdd_new
rdd1 = rdd_new.map(lambda n: n ** 2)

In [48]:
# Double each square
rdd3 = rdd1.map(lambda sq: sq * 2)

In [49]:
# Halve each square (true division, so the results are floats)
rdd4 = rdd1.map(lambda sq: sq / 2)

In [50]:
# Mark rdd4 to be kept once it has been computed so later actions can reuse it;
# persist() returns the RDD itself, which is what the cell echoes
rdd4.persist()

PythonRDD[23] at RDD at PythonRDD.scala:53

In [51]:
rdd4.is_cached

True

In [52]:
rdd4.take(5)

[0.0, 0.5, 2.0, 4.5, 8.0]

In [53]:
# map keeps one output element per line: each line becomes a list of its words
rdd_txt = rdd_file.map(lambda line: line.split())

In [54]:
rdd_txt.collect()

[['first', 'line'], ['second', 'line'], ['third', 'line'], ['fourth', 'line']]

In [56]:
# flatMap flattens the per-line word lists into a single RDD of words
rdd_txt2 = rdd_file.flatMap(lambda line: line.split())

In [57]:
rdd_txt2.collect()

['first', 'line', 'second', 'line', 'third', 'line', 'fourth', 'line']

In [58]:
rdd_file.collect()

['first line', 'second line', 'third line', 'fourth line']

In [70]:
# Sample strings used to demonstrate groupBy on the first character
# NOTE(review): the last entry is informal free text - consider neutral sample data before sharing
rdd = sc.parallelize(['MAF', 'Ghamry', 'Mody', 'Sw','Dans' ,'Wensh', 'Boshra', 'Kosomk ya Kareem ya 3arbi'])

In [71]:
rdd.collect()

['MAF',
 'Ghamry',
 'Mody',
 'Sw',
 'Dans',
 'Wensh',
 'Boshra',
 'Kosomk ya Kareem ya 3arbi']

In [72]:
# Group the strings by their first character
rdd2 = rdd.groupBy(lambda word: word[0])

In [73]:
lst = rdd2.collect()

In [80]:
# Look the group up by its key instead of by position: the order in which
# collect() returns the groups is not guaranteed, so a hard-coded index is fragile
list(dict(lst)['K'])

['Kosomk ya Kareem ya 3arbi']

In [82]:
# Materialize each group's iterable so the members are visible in the output
[(key, list(members)) for key, members in lst]

[('B', ['Boshra']),
 ('G', ['Ghamry']),
 ('S', ['Sw']),
 ('D', ['Dans']),
 ('M', ['MAF', 'Mody']),
 ('K', ['Kosomk ya Kareem ya 3arbi']),
 ('W', ['Wensh'])]

In [89]:
# (key, value) pairs; keys 'A' and 'B' each appear twice
l = [
    ('A', 10),
    ('B', 10),
    ('B', 90),
    ('S', 50),
    ('A', 30),
]

In [90]:
rdd= sc.parallelize(l)

In [91]:
# Gather all values that share a key into one iterable per key
rdd2 = rdd.groupByKey()

In [95]:
ll = rdd2.collect()

In [97]:
# Turn each key's grouped iterable into a list for display
[(key, list(values)) for key, values in ll]

[('B', [10, 90]), ('S', [50]), ('A', [10, 30])]

## Spark DataFrames

In [98]:
# (name, age) records; several names occur more than once
dataRDD = sc.parallelize([
    ("Brooke", 20),
    ("Denny", 31),
    ("Jules", 30),
    ("TD", 35),
    ("Brooke", 25),
    ("Jules", 40),
    ("Denny", 51),
])

In [99]:
rdd2 = dataRDD.groupByKey()

In [101]:
list1=rdd2.collect()

In [102]:
# Show the ages collected for each name
[(name, list(ages)) for name, ages in list1]

[('Brooke', [20, 25]), ('Denny', [31, 51]), ('TD', [35]), ('Jules', [30, 40])]

In [118]:
# Average age per name: pair[0] is the name, pair[1] the grouped ages
rdd3 = rdd2.map(lambda pair: (pair[0], sum(pair[1]) / len(pair[1])))

In [119]:
rdd3.collect()

[('Brooke', 22.5), ('Denny', 41.0), ('TD', 35.0), ('Jules', 35.0)]

#### As shown above, transformations on a raw RDD are painful: to calculate the average for each key we had to operate on the grouped data by hand. This becomes even more painful when the transformation is large or the data is big (e.g. has many columns).

In [120]:
# Same per-name average as above, written as a single chained pipeline
rdd_avg = (
    dataRDD
    .groupByKey()
    .map(lambda pair: (pair[0], sum(pair[1]) / len(pair[1])))
)

In [121]:
rdd_avg.collect()

[('Brooke', 22.5), ('Denny', 41.0), ('TD', 35.0), ('Jules', 35.0)]

In [122]:
from pyspark.sql.functions import avg
import pyspark.sql.functions as fn

In [123]:
# (name, age) rows used to build the first DataFrame
data = [
    ("Brooke", 20),
    ("Denny", 31),
    ("Jules", 30),
    ("TD", 35),
    ("Brooke", 25),
    ("Jules", 40),
    ("Denny", 51),
]

In [124]:
# Build a DataFrame from the rows; only column names are given, types are inferred
df = spark.createDataFrame(data, schema=['name', 'age'])

In [125]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [126]:
df.show()

+------+---+
|  name|age|
+------+---+
|Brooke| 20|
| Denny| 31|
| Jules| 30|
|    TD| 35|
|Brooke| 25|
| Jules| 40|
| Denny| 51|
+------+---+



In [130]:
# Read the age of the first row; first() fetches a single Row instead of
# collecting the entire DataFrame to the driver just to index element 0
df.first().age

20

In [136]:
# NOTE(review): .alias() here is called on the DataFrame returned by avg(), so it names
# the DataFrame rather than the column - the output column is still 'avg(age)'.
# To rename the column, alias the aggregate expression inside agg() instead.
df2 = df.groupBy('name').avg('age').alias('avg_age')

In [137]:
df2.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Denny|    41.0|
| Jules|    35.0|
|    TD|    35.0|
+------+--------+



In [141]:
# Alias the aggregate column itself so the result column is named avg_age
df3 = (
    df.groupBy('name')
    .agg(fn.avg('age').alias('avg_age'))
)

In [142]:
df3.show()

+------+-------+
|  name|avg_age|
+------+-------+
|Brooke|   22.5|
| Denny|   41.0|
| Jules|   35.0|
|    TD|   35.0|
+------+-------+



In [144]:
# Read a JSON file (path relative to the working directory) into a DataFrame;
# no schema is passed, so the column types are inferred from the data
df4 = spark.read.json('people.json')

In [145]:
df4.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [146]:
df4.age

Column<'age'>

In [147]:
df4.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [148]:
fn.col('Ahmed')

Column<'Ahmed'>

In [149]:
df4.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [150]:
# NOTE(review): the wildcard import pulls every public name of pyspark.sql.functions into the
# notebook namespace, including sum/min/max/abs/round, which shadow the Python builtins.
# Re-running an earlier cell that calls the builtin sum() after this import will break.
# The cells below use col and expr from it; prefer importing those names explicitly.
from pyspark.sql.functions import * 


In [156]:
# Project name and age, then keep rows whose age exceeds 10 (rows with a NULL age drop out)
df4.select('name', 'age').where(df4.age > 10).show()

+------+---+
|  name|age|
+------+---+
|  Andy| 30|
|Justin| 19|
+------+---+



In [158]:
# Add a derived column built from a Column expression
df5 = df4.select(
    'name',
    'age',
    (col('age') * 2).alias('Double Age'),
)

In [159]:
df5.show()

+-------+----+----------+
|   name| age|Double Age|
+-------+----+----------+
|Michael|NULL|      NULL|
|   Andy|  30|        60|
| Justin|  19|        38|
+-------+----+----------+



In [161]:
# Same derived column, this time written as a SQL expression string
df4.select(
    'name',
    'age',
    expr('age * 2').alias('Double Age'),
).show()

+-------+----+----------+
|   name| age|Double Age|
+-------+----+----------+
|Michael|NULL|      NULL|
|   Andy|  30|        60|
| Justin|  19|        38|
+-------+----+----------+



In [165]:
# Small colour/fruit data set; column types are inferred from the values
df5 = spark.createDataFrame(
    [
        ['red', 'banana', 1, 10],
        ['blue', 'banana', 2, 20],
        ['red', 'carrot', 3, 30],
        ['blue', 'grape', 4, 40],
        ['red', 'carrot', 5, 50],
        ['black', 'carrot', 6, 60],
        ['red', 'banana', 7, 70],
        ['red', 'grape', 8, 80],
    ],
    schema=['color', 'fruit', 'v1', 'v2'],
)
df5.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [178]:
# Per-colour averages of both value columns, with readable column names
df5.groupBy('color').agg(
    avg('v1').alias('avg_v1'),
    avg('v2').alias('avg_v2'),
).show()

+-----+------+------+
|color|avg_v1|avg_v2|
+-----+------+------+
|  red|   4.8|  48.0|
| blue|   3.0|  30.0|
|black|   6.0|  60.0|
+-----+------+------+



In [180]:
df5.groupBy('color','fruit').avg().show()

+-----+------+-------+-------+
|color| fruit|avg(v1)|avg(v2)|
+-----+------+-------+-------+
|  red|banana|    4.0|   40.0|
| blue|banana|    2.0|   20.0|
|  red|carrot|    4.0|   40.0|
| blue| grape|    4.0|   40.0|
|black|carrot|    6.0|   60.0|
|  red| grape|    8.0|   80.0|
+-----+------+-------+-------+



In [181]:
# Average every numeric column per (color, fruit) pair, ordered by color
(
    df5.groupBy('color', 'fruit')
    .avg()
    .sort('color')
    .show()
)

+-----+------+-------+-------+
|color| fruit|avg(v1)|avg(v2)|
+-----+------+-------+-------+
|black|carrot|    6.0|   60.0|
| blue|banana|    2.0|   20.0|
| blue| grape|    4.0|   40.0|
|  red|banana|    4.0|   40.0|
|  red|carrot|    4.0|   40.0|
|  red| grape|    8.0|   80.0|
+-----+------+-------+-------+



In [189]:
# Import every type the schema uses explicitly: StringType and IntegerType were
# previously available only through leftover kernel state, not through this cell
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Explicit schema: column name, data type, and whether NULLs are allowed
myschema = StructType([
    StructField('NewColor', StringType(), False),
    StructField('NewFruit', StringType(), False),
    StructField('Prop1', IntegerType()),  # nullable is omitted and shows as true in printSchema
    StructField('Prop2', IntegerType(), True),
])

In [190]:
# Same rows as the colour/fruit example, now typed and named by the explicit StructType schema
df = spark.createDataFrame(
    [
        ['red', 'banana', 1, 10],
        ['blue', 'banana', 2, 20],
        ['red', 'carrot', 3, 30],
        ['blue', 'grape', 4, 40],
        ['red', 'carrot', 5, 50],
        ['black', 'carrot', 6, 60],
        ['red', 'banana', 7, 70],
        ['red', 'grape', 8, 80],
    ],
    schema=myschema,
)
df.show()

+--------+--------+-----+-----+
|NewColor|NewFruit|Prop1|Prop2|
+--------+--------+-----+-----+
|     red|  banana|    1|   10|
|    blue|  banana|    2|   20|
|     red|  carrot|    3|   30|
|    blue|   grape|    4|   40|
|     red|  carrot|    5|   50|
|   black|  carrot|    6|   60|
|     red|  banana|    7|   70|
|     red|   grape|    8|   80|
+--------+--------+-----+-----+



In [191]:
df.printSchema()

root
 |-- NewColor: string (nullable = false)
 |-- NewFruit: string (nullable = false)
 |-- Prop1: integer (nullable = true)
 |-- Prop2: integer (nullable = true)



In [192]:
# The schema can also be given as a SQL-style DDL string: "<column> <type>, ..."
sql_schema = 'NewColor String, NewFruit String, Prop1 integer, Prop2 Integer'
df_sql = spark.createDataFrame(
    [
        ['red', 'banana', 1, 10],
        ['blue', 'banana', 2, 20],
        ['red', 'carrot', 3, 30],
        ['blue', 'grape', 4, 40],
        ['red', 'carrot', 5, 50],
        ['black', 'carrot', 6, 60],
        ['red', 'banana', 7, 70],
        ['red', 'grape', 8, 80],
    ],
    schema=sql_schema,
)
df_sql.show()


+--------+--------+-----+-----+
|NewColor|NewFruit|Prop1|Prop2|
+--------+--------+-----+-----+
|     red|  banana|    1|   10|
|    blue|  banana|    2|   20|
|     red|  carrot|    3|   30|
|    blue|   grape|    4|   40|
|     red|  carrot|    5|   50|
|   black|  carrot|    6|   60|
|     red|  banana|    7|   70|
|     red|   grape|    8|   80|
+--------+--------+-----+-----+



In [194]:
df_sql.printSchema()

root
 |-- NewColor: string (nullable = true)
 |-- NewFruit: string (nullable = true)
 |-- Prop1: integer (nullable = true)
 |-- Prop2: integer (nullable = true)

