<a href="https://colab.research.google.com/github/Melvinmcrn/DataScience/blob/master/PySpark/2_Pyspark_Basic_RDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

import findspark

# Tell findspark/Spark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

# Must run before the pyspark import below: pyspark is not pip-installed in
# this notebook, it is taken from the distribution under SPARK_HOME.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session; getOrCreate() reuses a session that already exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Pyspark_Basic_RDD

In [0]:
#1 - import module
from pyspark import SparkContext

In [5]:
#2 - Create SparkContext
# getOrCreate() returns the SparkContext that is already running in this
# process if there is one (otherwise it starts a new one), so re-running
# this cell does not try to create a second context.
sc = SparkContext.getOrCreate()
# Bare last expression: the notebook displays the context's summary.
sc

In [6]:
import multiprocessing

# Number of CPUs reported for this machine (2 in the recorded run).
# The session above uses master("local[*]"), which presumably sizes Spark's
# local worker threads from this count - TODO confirm against the Spark docs.
multiprocessing.cpu_count()

2

In [0]:
# TODO: inspect the partition count with rdd.getNumPartitions() once rdd has been created (step #4 below).

In [0]:
#3 - Helper: print the first `num` elements of an RDD
def printRDD(data,num):
    """Print the first ``num`` elements of ``data``, one element per line.

    Args:
        data: Any object exposing ``take(n)`` (an RDD in this notebook).
        num: How many leading elements to fetch and print.

    Returns:
        None. The elements are written to standard output.
    """
    # take() brings only the requested elements back, not the whole data set.
    head_rows = data.take(num)
    for row in head_rows:
        print(row)

In [9]:
#4 - Load the iris data set into an RDD (one element per line of the file)
# "iris.csv" is resolved relative to the working directory and is not fetched
# by the setup cells, so it has to be present before this cell runs.
# cache() marks the RDD to be kept after it is first computed, since every
# later cell derives from it.
rdd = sc.textFile("iris.csv").cache()

# Column layout of each line:
#   1. sepal length in cm
#   2. sepal width in cm
#   3. petal length in cm
#   4. petal width in cm
#   5. class label: Iris-setosa, Iris-versicolor or Iris-virginica

printRDD(rdd, 5)

5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa


In [10]:
#5 - map: apply a function to each element, one output element per input
# Every CSV line becomes a list of its comma-separated fields (still strings).
mapped_rdd = rdd.map(lambda row: row.split(","))

printRDD(mapped_rdd, 5)

[u'5.1', u'3.5', u'1.4', u'0.2', u'Iris-setosa']
[u'4.9', u'3.0', u'1.4', u'0.2', u'Iris-setosa']
[u'4.7', u'3.2', u'1.3', u'0.2', u'Iris-setosa']
[u'4.6', u'3.1', u'1.5', u'0.2', u'Iris-setosa']
[u'5.0', u'3.6', u'1.4', u'0.2', u'Iris-setosa']


In [11]:
#6 - flatMap: like map, but the per-element results are flattened
# Each comma-separated field becomes its own RDD element instead of the
# fields staying grouped in one list per line.
flatMaped_rdd = rdd.flatMap(lambda row: row.split(","))

# 25 elements = the 5 fields of the first 5 lines.
printRDD(flatMaped_rdd, 25)

5.1
3.5
1.4
0.2
Iris-setosa
4.9
3.0
1.4
0.2
Iris-setosa
4.7
3.2
1.3
0.2
Iris-setosa
4.6
3.1
1.5
0.2
Iris-setosa
5.0
3.6
1.4
0.2
Iris-setosa


In [12]:
#7 - zipWithUniqueId: pair every element with a unique numeric id
# Result elements are (original_line, id) tuples. The ids are unique but not
# consecutive: the recorded run shows 0, 2, 4, 6, 8 for the first five rows.
zipedWithUniqueId_rdd = rdd.zipWithUniqueId()

# count() returns the number of elements in the RDD.
print("zipedWithUniqueId_rdd count : " + str(zipedWithUniqueId_rdd.count()))
printRDD(zipedWithUniqueId_rdd,5)

zipedWithUniqueId_rdd count : 150
(u'5.1,3.5,1.4,0.2,Iris-setosa', 0)
(u'4.9,3.0,1.4,0.2,Iris-setosa', 2)
(u'4.7,3.2,1.3,0.2,Iris-setosa', 4)
(u'4.6,3.1,1.5,0.2,Iris-setosa', 6)
(u'5.0,3.6,1.4,0.2,Iris-setosa', 8)


In [13]:
#8 - sample: draw a random subset of the RDD
# fraction is the expected share of elements to keep, not an exact size
# (the recorded run kept 64 of 150 for fraction=0.5). A fixed seed is passed
# so the draw can be repeated.
sampled_rdd = zipedWithUniqueId_rdd.sample(withReplacement=False, fraction=0.5, seed=50)

# Compare the size of the source RDD with the size of the sample.
print("rdd count : " + str(zipedWithUniqueId_rdd.count()))
print("sampled_rdd count : " + str(sampled_rdd.count()))
printRDD(sampled_rdd,5)

rdd count : 150
sampled_rdd count : 64
(u'5.1,3.5,1.4,0.2,Iris-setosa', 0)
(u'4.6,3.1,1.5,0.2,Iris-setosa', 6)
(u'5.0,3.6,1.4,0.2,Iris-setosa', 8)
(u'4.6,3.4,1.4,0.3,Iris-setosa', 12)
(u'5.4,3.9,1.3,0.4,Iris-setosa', 32)


In [14]:
#9 - union and intersection
# Two samples of the same RDD drawn with different seeds, so they overlap.
sampled1_rdd = zipedWithUniqueId_rdd.sample(withReplacement=False, fraction=0.5, seed=25)
sampled2_rdd = zipedWithUniqueId_rdd.sample(withReplacement=False, fraction=0.5, seed=50)
# union() keeps duplicates: its size is the sum of both inputs
# (78 + 64 = 142 in the recorded run).
unioned_rdd = sampled1_rdd.union(sampled2_rdd)
# intersection() keeps the elements that occur in both samples.
intersected_rdd = sampled1_rdd.intersection(sampled2_rdd)

print("sampled1_rdd count : " + str(sampled1_rdd.count()))
print("sampled2_rdd count : " + str(sampled2_rdd.count()))
print("unioned_rdd count : " + str(unioned_rdd.count()))
print("intersected_rdd count : " + str(intersected_rdd.count()))

sampled1_rdd count : 78
sampled2_rdd count : 64
unioned_rdd count : 142
intersected_rdd count : 36


In [15]:
#10 - distinct: remove duplicate elements
# The class label is the last field of every split row.
label_rdd = mapped_rdd.map(lambda fields: fields[-1])
printRDD(label_rdd, 5)

print("\n")

# Only the distinct labels are collected, so the list brought back to the
# driver is tiny (three labels in the recorded run).
distinct_label_rdd = label_rdd.distinct()
label_list = distinct_label_rdd.collect()
print(label_list)

Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa


[u'Iris-virginica', u'Iris-setosa', u'Iris-versicolor']


In [16]:
#11 - zip two RDDs together
# Features are every field of a row except the last one (the label).
feature_rdd = mapped_rdd.map(lambda line : line[0:-1])
printRDD(feature_rdd,5)

print("\n")

# zip() pairs the i-th element of one RDD with the i-th element of the other.
# Both RDDs are plain map()s of mapped_rdd, so they line up element for element.
zip_rdd = feature_rdd.zip(label_rdd)
printRDD(zip_rdd,5)

print("\n")

# Flatten each (features, label) pair back into a single list.
# The pair is indexed instead of being unpacked in the lambda signature:
# tuple parameters (`lambda (a, b): ...`) are a SyntaxError on Python 3.
zip_rdd = zip_rdd.map(lambda pair : pair[0] + [pair[1]])
printRDD(zip_rdd,5)

[u'5.1', u'3.5', u'1.4', u'0.2']
[u'4.9', u'3.0', u'1.4', u'0.2']
[u'4.7', u'3.2', u'1.3', u'0.2']
[u'4.6', u'3.1', u'1.5', u'0.2']
[u'5.0', u'3.6', u'1.4', u'0.2']


([u'5.1', u'3.5', u'1.4', u'0.2'], u'Iris-setosa')
([u'4.9', u'3.0', u'1.4', u'0.2'], u'Iris-setosa')
([u'4.7', u'3.2', u'1.3', u'0.2'], u'Iris-setosa')
([u'4.6', u'3.1', u'1.5', u'0.2'], u'Iris-setosa')
([u'5.0', u'3.6', u'1.4', u'0.2'], u'Iris-setosa')


[u'5.1', u'3.5', u'1.4', u'0.2', u'Iris-setosa']
[u'4.9', u'3.0', u'1.4', u'0.2', u'Iris-setosa']
[u'4.7', u'3.2', u'1.3', u'0.2', u'Iris-setosa']
[u'4.6', u'3.1', u'1.5', u'0.2', u'Iris-setosa']
[u'5.0', u'3.6', u'1.4', u'0.2', u'Iris-setosa']


In [17]:
#12 - collect: bring every element of the RDD back to the driver as a list
data_list = rdd.collect()
# The whole data set is returned at once, so this is only reasonable for small
# RDDs; with big data prefer take()/top() or write the result out instead.
print("data_list size : " + str(len(data_list)))
for data in data_list:
    print(data)

data_list size : 150
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa

In [18]:
#13 - take: fetch only the first n elements
data_list = rdd.take(5)
# Unlike collect(), only the requested 5 elements are returned, in the order
# they appear in the RDD (the first five lines of the file here).
print("data_list size : " + str(len(data_list)))
for data in data_list:
    print(data)

data_list size : 5
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa


In [19]:
#14 - top: fetch the n largest elements, in descending order
data_list = rdd.top(5)
# The elements here are raw text lines, so "largest" means string ordering,
# not numeric ordering of any column (the recorded run starts with "7.9,...").
print("data_list size : " + str(len(data_list)))
for data in data_list:
    print(data)

data_list size : 5
7.9,3.8,6.4,2.0,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica


In [20]:
#15 - countByKey
# Build (label, [float features]) pairs: the label is the last field of a row
# and the remaining fields are parsed from strings to floats.
# The conversion is done in a single map() without tuple-parameter lambdas
# (`lambda (label, features): ...`), which are a SyntaxError on Python 3.
label_features_rdd = (mapped_rdd
                      .map(lambda line : (line[-1],[float(feature) for feature in line[0:-1]]))
                     )
printRDD(label_features_rdd,5)

print("\n")

# countByKey() counts the elements per key and returns the result to the
# driver as a dict-like {label: count}. It only looks at the key of each pair,
# so the RDD does not need to be re-mapped to (label, 1) first.
label_count_dict = label_features_rdd.countByKey()
print(label_count_dict)

(u'Iris-setosa', [5.1, 3.5, 1.4, 0.2])
(u'Iris-setosa', [4.9, 3.0, 1.4, 0.2])
(u'Iris-setosa', [4.7, 3.2, 1.3, 0.2])
(u'Iris-setosa', [4.6, 3.1, 1.5, 0.2])
(u'Iris-setosa', [5.0, 3.6, 1.4, 0.2])


defaultdict(<type 'int'>, {u'Iris-virginica': 50, u'Iris-setosa': 50, u'Iris-versicolor': 50})


In [21]:
#16 - per-label average: sum the feature vectors per key, then divide by the count
def _average_features(label_and_sums):
    """Turn a (label, summed_features) pair into (label, averaged_features).

    The pair is unpacked inside the body because tuple parameters in a
    function or lambda signature are a SyntaxError on Python 3.
    """
    label, sum_features = label_and_sums
    # label_count_dict is the plain {label: count} mapping built in step #15.
    count = label_count_dict[label]
    return (label, [feature / count for feature in sum_features])


label_avg_features_rdd = (label_features_rdd
                         # element-wise sum of two feature vectors with the same label
                         .reduceByKey(lambda features1,features2 : [sum(x) for x in zip(features1, features2)])
                         .map(_average_features)
                        )

printRDD(label_avg_features_rdd,5)

(u'Iris-virginica', [6.587999999999998, 2.9739999999999998, 5.552, 2.026])
(u'Iris-setosa', [5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999])
(u'Iris-versicolor', [5.936, 2.77, 4.26, 1.3260000000000003])


In [22]:
#17 - filter: keep only the elements for which the predicate is true
# Each element is a (label, avg_features) pair; the label is read by index
# because tuple-parameter lambdas are a SyntaxError on Python 3.
iris_virginica_rdd = label_avg_features_rdd.filter(lambda pair : pair[0] == "Iris-virginica")
printRDD(iris_virginica_rdd,5)

(u'Iris-virginica', [6.587999999999998, 2.9739999999999998, 5.552, 2.026])
