In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
# Mount Google Drive inside the Colab VM so the course files are reachable
# through the local filesystem.
from google.colab import drive
drive.mount("/content/gdrive")
# Work from the course folder: later cells use paths relative to it
# (e.g. 'Resources_day_2/5000_points.txt').
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_1'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_1


In [3]:
# Create (or reuse) the single SparkContext used by the rest of the notebook.
#
# Calling SparkContext(...) directly raises an error when this cell is executed
# a second time in the same kernel, because only one context may be active.
# getOrCreate returns the already-running context instead, which keeps the cell
# re-runnable.
import pyspark
from pyspark import SparkConf, SparkContext

spark_conf = SparkConf().setMaster("local").setAppName("New Spark Context")
sc = SparkContext.getOrCreate(conf=spark_conf)

In [23]:
# Build an RDD from an in-memory Python list of strings.
sample_words = ['Spark', 'is', 'a', 'framework', 'for', 'Big Data processing']
rdd1 = sc.parallelize(sample_words)

# Show the object's type and how many partitions Spark split the data into.
print(type(rdd1), rdd1.getNumPartitions(), sep='\n')

<class 'pyspark.rdd.RDD'>
1


In [22]:
# Load a text file (one RDD element per line), asking for at least 3 partitions.
points_path = 'Resources_day_2/5000_points.txt'
rdd2 = sc.textFile(points_path, minPartitions=3)

# Type, first five lines, and the resulting partition count, one per line.
print(type(rdd2), rdd2.take(5), rdd2.getNumPartitions(), sep='\n')

<class 'pyspark.rdd.RDD'>
['664159\t550946', '665845\t557965', '597173\t575538', '618600\t551446', '635690\t608046']
3


In [24]:
# Read a text file straight from a remote HDFS location (requires network
# access to that host). No minPartitions is requested here.
rdd3 = sc.textFile('hdfs://bigdata.laptrinhpython.net:19000/t8.shakespeare.txt')
print(type(rdd3))
# take(5) brings only the first five lines back to the driver.
print(rdd3.take(5))
print(rdd3.getNumPartitions())

<class 'pyspark.rdd.RDD'>
['This is the 100th Etext file presented by Project Gutenberg, and', 'is presented in cooperation with World Library, Inc., from their', 'Library of the Future and Shakespeare CDROMS.  Project Gutenberg', 'often releases Etexts that are NOT placed in the Public Domain!!', '']
1


In [26]:
# Print the first six lines of the file, one per output line.
# The loop variable is a plain string, so it is called `line`; naming it `rdd`
# would leave a misleading global `rdd` bound to a str once the loop ends.
for line in rdd3.take(6):
    print(line)

This is the 100th Etext file presented by Project Gutenberg, and
is presented in cooperation with World Library, Inc., from their
Library of the Future and Shakespeare CDROMS.  Project Gutenberg
often releases Etexts that are NOT placed in the Public Domain!!

Shakespeare


In [27]:
# map(): apply a function to every element, producing exactly one output each.
rdd4 = sc.parallelize(list(range(1, 10)))
rdd4_map = rdd4.map(lambda value: value ** 2)  # square each number
rdd4_map.collect()

[1, 4, 9, 16, 25, 36, 49, 64, 81]

In [29]:
# filter(): keep only the elements for which the predicate is true.
rdd4_filter = rdd4.filter(lambda value: 3 < value)
# Peek at the first three surviving elements.
rdd4_filter.take(3)

[4, 5, 6]

In [30]:
# flatMap(): each phrase yields several words, and the results are flattened
# into a single RDD of words.
phrases = ['Data Science', 'Machine Learning', 'Big Data']
rdd5 = sc.parallelize(phrases)
rdd5_flatmap = rdd5.flatMap(lambda phrase: phrase.split())
rdd5_flatmap.collect()

['Data', 'Science', 'Machine', 'Learning', 'Big', 'Data']

In [31]:
# Same split, but with map(): one output element (a list of words) per phrase,
# so the result is a list of lists rather than a flat list of words.
rdd5_map = rdd5.map(lambda phrase: phrase.split())
rdd5_map.collect()

[['Data', 'Science'], ['Machine', 'Learning'], ['Big', 'Data']]

In [32]:
# union(): concatenate two RDDs; duplicates are kept.
left_values = [1, 2, 4, 4, 5]
right_values = [1, 2, 3, 4, 5, 7]
rdd5 = sc.parallelize(left_values)
rdd6 = sc.parallelize(right_values)
rdd5_6_union = rdd5.union(rdd6)
rdd5_6_union.collect()

[1, 2, 4, 4, 5, 1, 2, 3, 4, 5, 7]

In [54]:
# Sum the elements common to both RDDs by folding them pairwise with reduce().
common_values = rdd5.intersection(rdd6)
common_values.reduce(lambda total, value: total + value)

12

In [36]:
# intersection(): elements present in both RDDs. The result holds each common
# value once (4 occurs twice in rdd5) and is not returned in sorted order.
rdd5.intersection(rdd6).collect()

[2, 4, 1, 5]

In [56]:
# RDD holding the integers 0..99, built from a Python range.
number_rdd = sc.parallelize(range(100))
# Number of partitions Spark chose for this RDD.
number_rdd.getNumPartitions()

1

In [58]:
# Write the RDD as text into the 'number' directory (one part file per
# partition). saveAsTextFile refuses to write into an existing directory, so
# the previous output is removed first to keep this cell re-runnable.
# NOTE: assumes the output lands on the local filesystem relative to the
# current working directory, as the directory listing in the next cell suggests.
import shutil

shutil.rmtree('number', ignore_errors=True)
number_rdd.saveAsTextFile('number')

In [59]:
# List the working directory to confirm that the 'number' output was created.
!ls

Day_1_demo.ipynb  demo.ipynb  number	       Resources_day_2
Day_2_demo.ipynb  Docs	      Resources_day_1


In [60]:
# A pair RDD is an RDD whose elements are (key, value) tuples;
# here the pairs are (student id, student name).
my_tuple = [
    ('SV001', 'Tran Van An'),
    ('SV002', 'Nguyen Van Anh'),
    ('SV003', 'Le Thi Cuc'),
]
pairRDD_tuple = sc.parallelize(my_tuple)
pairRDD_tuple.collect()

[('SV001', 'Tran Van An'),
 ('SV002', 'Nguyen Van Anh'),
 ('SV003', 'Le Thi Cuc')]

In [67]:
# reduceByKey(): merge all values that share a key, here by adding the marks
# of each student.
rdd = sc.parallelize([
    ('SV01', 8),
    ('SV02', 10),
    ('SV01', 10),
    ('SV02', 9),
    ('SV03', 10),
])
rdd_reduced = rdd.reduceByKey(lambda left, right: left + right)
for student_id, total in rdd_reduced.collect():
    print('{} co diem tong la {}'.format(student_id, total))

SV01 co diem tong la 18
SV02 co diem tong la 19
SV03 co diem tong la 10


In [68]:
# sortBy(): order the (student, total) pairs by their total, lowest first.
rdd_sorted = rdd_reduced.sortBy(keyfunc=lambda pair: pair[1], ascending=True)
for student_id, total in rdd_sorted.collect():
    print('{} co diem tong la {}'.format(student_id, total))

SV03 co diem tong la 10
SV01 co diem tong la 18
SV02 co diem tong la 19


In [72]:
# groupByKey(): gather every value of a key into one iterable per key.
rdd_group = rdd.groupByKey()
# Turn each group's iterable into a list before collecting so it prints
# readably.
for masv, marks in rdd_group.mapValues(list).collect():
    print(masv, marks)

SV01 [8, 10]
SV02 [10, 9]
SV03 [10]


In [75]:
# join(): inner join of two pair RDDs on their keys. A key missing from either
# side is dropped: SV03 has no English mark, so it is absent from the result.
# NOTE(review): `math` is also the name of a standard-library module; this
# variable would shadow it if the module were imported in this notebook.
math = sc.parallelize([('SV01', 8), ('SV02', 7), ('SV03', 9)])
english = sc.parallelize([('SV01', 2), ('SV02', 4)])
# Each result element has the form (key, (math_mark, english_mark)).
rdd = math.join(english)
rdd.collect()

[('SV01', (8, 2)), ('SV02', (7, 4))]

In [77]:
# countByKey(): an action that returns, on the driver, how many elements exist
# for each key.
rdd = sc.parallelize([
    ('SV01', 8),
    ('SV02', 7),
    ('SV03', 9),
    ('SV01', 2),
    ('SV02', 4),
])
key_counts = rdd.countByKey()
for key in key_counts:
    print(key, key_counts[key])

SV01 2
SV02 2
SV03 1


In [78]:
# collectAsMap(): return the pair RDD to the driver as a Python dict.
# A dict holds one value per key, so for repeated keys only one value survives
# (here the later one: SV01 -> 2, SV02 -> 4).
rdd.collectAsMap()

{'SV01': 2, 'SV02': 4, 'SV03': 9}