### Import the required libraries, then create a SparkContext

In [1]:
#Install PySpark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 2.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=0496eeb885f8c08f9dd203ff6e6c1493eda58bc21934dd467ccaf8ffedc7587f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
# Set up Java (PySpark requires Java).
# Installs the headless OpenJDK 8 package; stdout is redirected to /dev/null
# so the install log does not clutter the cell output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null


In [4]:
from pyspark import SparkContext, SparkConf

# Build the configuration step by step: application name "Colab",
# master "local[*]" (run Spark locally inside this machine).
conf = SparkConf()
conf.setAppName("Colab")
conf.setMaster("local[*]")

# getOrCreate returns the existing context if one is already running,
# which keeps this cell safe to re-run.
sc = SparkContext.getOrCreate(conf=conf)


### Create and display an RDD from the following list

In [None]:
# Named `data` (the name the next cell uses for the same pairs) instead of
# `list`, which would shadow the built-in list type for the whole session.
data = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25), ('J-Hope', 25), ('Suga', 26), ('Jin', 27)]

In [7]:
# (name, value) pairs, one per line.
data = [
    ('JK', 22),
    ('V', 24),
    ('Jimin', 24),
    ('RM', 25),
    ('J-Hope', 25),
    ('Suga', 26),
    ('Jin', 27),
]

# Turn the local list into an RDD, then collect it back to display it.
rdd = sc.parallelize(data)
rdd.collect()


[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

### Create a sample1.txt file to contain the text shown below.

In [None]:
# Show the text that sample1.txt is supposed to contain.
print('''
Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostranepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,''')


Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Collatio igitur ista tenihil iuvat. 
Honesta oratio, Socratica, Platonis etiam. 
Primum in nostranepotestate est, quid meminerimus? 
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum? 
Si quidem, inquit, tollerem,


In [13]:
# The eight lines of the sample text, in file order.
sample_lines = [
    'Utilitatis causa amicitia est quaesita.',
    'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
    'Collatio igitur ista tenihil iuvat.',
    'Honesta oratio, Socratica, Platonis etiam.',
    'Primum in nostranepotestate est, quid meminerimus?',
    'Duo Reges: constructio interrete.',
    'Quid, sietiam iucunda memoria est praeteritorum malorum?',
    'Si quidem, inquit, tollerem,',
]

# Write the lines separated by newlines, with no newline after the last one.
# The encoding is given explicitly so the bytes on disk do not depend on the
# platform's default locale encoding.
with open('sample1.txt', 'w', encoding='utf-8') as fh:
    fh.write('\n'.join(sample_lines))

### Read the sample1.txt file into an RDD and display the first 4 elements

In [15]:
# Load the file as an RDD with one element per line, then peek at the first four.
sample_path = 'sample1.txt'
text_rdd = sc.textFile(sample_path)
text_rdd.take(4)

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
 'Collatio igitur ista tenihil iuvat.',
 'Honesta oratio, Socratica, Platonis etiam.']

### Count the total number of rows in the RDD

In [16]:
# Number of elements in the RDD (one per line of the file). Left as the
# cell's last expression so Jupyter displays it without a print() wrapper.
text_rdd.count()

8


### Create a function that converts the data to lower case and splits it into words

In [17]:
def to_lower_split(x):
    """Lower-case ``x`` and break it into whitespace-separated tokens.

    Punctuation stays attached to the tokens. An empty or all-whitespace
    string yields an empty list.
    """
    lowered = x.lower()
    tokens = lowered.split()
    return tokens

# to_lower_split returns a list of words per line; flatMap flattens those
# per-line lists into a single RDD of words.
lower_split_rdd = text_rdd.flatMap(to_lower_split)
lower_split_rdd.collect()

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'collatio',
 'igitur',
 'ista',
 'tenihil',
 'iuvat.',
 'honesta',
 'oratio,',
 'socratica,',
 'platonis',
 'etiam.',
 'primum',
 'in',
 'nostranepotestate',
 'est,',
 'quid',
 'meminerimus?',
 'duo',
 'reges:',
 'constructio',
 'interrete.',
 'quid,',
 'sietiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'si',
 'quidem,',
 'inquit,',
 'tollerem,']

### Remove the stopwords from the previous text, i.e. filter out every word that appears in the stopword list.

In [18]:
# Words to be dropped from the tokenized text.
stopwords = ['a','all','the','as','is','am','an','and',
             'be','been','from','had','I','I’d','why','with']
# Hint: you may need to use flatMap

In [19]:
# The tokens were lower-cased when they were split, so the stopwords are
# lower-cased as well before comparing; otherwise entries such as 'I' could
# never match. A set is used for fast membership checks.
stopword_set = {word.lower() for word in stopwords}
filtered_rdd = lower_split_rdd.filter(lambda word: word not in stopword_set)
filtered_rdd.collect()

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'collatio',
 'igitur',
 'ista',
 'tenihil',
 'iuvat.',
 'honesta',
 'oratio,',
 'socratica,',
 'platonis',
 'etiam.',
 'primum',
 'in',
 'nostranepotestate',
 'est,',
 'quid',
 'meminerimus?',
 'duo',
 'reges:',
 'constructio',
 'interrete.',
 'quid,',
 'sietiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'si',
 'quidem,',
 'inquit,',
 'tollerem,']

### Find the words starting with ‘c’

In [20]:
# Keep only the tokens whose first character is a lower-case 'c'.
c_words_rdd = filtered_rdd.filter(lambda word: word.startswith('c'))
c_words_rdd.collect()

['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [21]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25)
        , ('J-Hope', 25), ('Suga', 26), ('Jin', 27)
       , ('J-Hope', 12), ('Suga', 25), ('Jin', 34)
       , ('JK', 32), ('V', 44), ('Jimin',14), ('RM', 35)]
# Hint: use reduceByKey

In [22]:
rdd = sc.parallelize(list)

In [23]:
# Add up all values that share the same key.
reduced_rdd = rdd.reduceByKey(lambda total, value: total + value)
reduced_rdd.collect()

[('Suga', 51),
 ('Jin', 61),
 ('JK', 54),
 ('V', 68),
 ('Jimin', 38),
 ('RM', 60),
 ('J-Hope', 37)]

### Create some key-value pair RDDs

In [27]:
# Two small pair RDDs that share the keys 'a' and 'b'; 'c' appears only in rdd2.
rdd1 = sc.parallelize([('a',2),('b',3)])
rdd2 = sc.parallelize([('a',9),('b',7),('c',10)])

In [28]:
# Display the contents of rdd1.
rdd1.collect()

[('a', 2), ('b', 3)]

In [29]:
# Display the contents of rdd2.
rdd2.collect()

[('a', 9), ('b', 7), ('c', 10)]

### Perform Join operation on the RDDs (rdd1,rdd2)

In [30]:
# Join by key: only keys present in both RDDs are kept, so 'c' (found only
# in rdd2) does not appear in the result.
joined_rdd = rdd1.join(rdd2)

In [31]:
# Each result pairs a key with a (value from rdd1, value from rdd2) tuple.
joined_rdd.collect()

[('b', (3, 7)), ('a', (2, 9))]