### Import the required libraries, then create a SparkContext

In [None]:
import pyspark

In [None]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
# Build the configuration once: run locally on all available cores.
sconf = SparkConf().setMaster('local[*]').setAppName('RDD')

# Only one SparkContext may be active at a time, so constructing it twice
# (once from keyword arguments, once from the conf) raises ValueError on the
# second call. getOrCreate() returns the running context, or starts a new one
# from sconf when none exists, which also makes this cell safe to re-run.
sc = SparkContext.getOrCreate(conf=sconf)

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('RDD').master('local[*]').getOrCreate()

# The context lives on the session object; a bare `sparkContext` name is
# undefined and would raise NameError.
sc = spark.sparkContext

### Create and display an RDD from the following list

In [None]:
# Sample (name, value) tuples used as the source data for the first RDD
ls = [
    ('JK', 22),
    ('V', 24),
    ('Jimin', 24),
    ('RM', 25),
    ('J-Hope', 25),
    ('Suga', 26),
    ('Jin', 27),
]

In [None]:
# Turn the local Python list `ls` into an RDD through the SparkContext
rdd = sc.parallelize(ls)

# Contents of RDD: collect() returns all elements as a Python list
print(rdd.collect())

[('JK', 22), ('V', 24), ('Jimin', 24), ('RM', 25), ('J-Hope', 25), ('Suga', 26), ('Jin', 27)]


### Create a sample1.txt file to contain the text shown below.

In [None]:
# Text of sample1.txt, one list entry per line of the file
sample_lines = [
    'Utilitatis causa amicitia est quaesita.',
    'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
    'Collatio igitur ista tenihil iuvat.',
    'Honesta oratio, Socratica, Platonis etiam.',
    'Primum in nostranepotestate est, quid meminerimus?',
    'Duo Reges: constructio interrete.',
    'Quid, sietiam iucunda memoria est praeteritorum malorum?',
    'Si quidem, inquit, tollerem,',
]

# Joining with '\n' leaves the final line without a trailing newline
with open("sample1.txt", "w") as f:
    f.write('\n'.join(sample_lines))

In [None]:
# Echo the reference text. The triple-quoted literal begins with a newline,
# so the printed output starts with an empty line; several lines of the
# literal also end with a trailing space.
print('''
Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Collatio igitur ista tenihil iuvat. 
Honesta oratio, Socratica, Platonis etiam. 
Primum in nostranepotestate est, quid meminerimus? 
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum? 
Si quidem, inquit, tollerem,''')


Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Collatio igitur ista tenihil iuvat. 
Honesta oratio, Socratica, Platonis etiam. 
Primum in nostranepotestate est, quid meminerimus? 
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum? 
Si quidem, inquit, tollerem,


### Read the sample1.txt file into an RDD and display the first 4 elements

In [None]:
# Load sample1.txt as an RDD of strings, one element per line of the file
rdd1 = sc.textFile("sample1.txt")
# Display the first 4 elements of the RDD (take() returns them as a list;
# as the last expression of the cell, the notebook shows the value)
rdd1.take(4)

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
 'Collatio igitur ista tenihil iuvat.',
 'Honesta oratio, Socratica, Platonis etiam.']

### Count the total number of rows in RDD

In [None]:
# Count the total number of rows in the RDD (one row per line of the file)
rdd1_count = rdd1.count()
# Bare expression so the notebook displays the count
rdd1_count

8

### Create a function to convert the data to lower case and split it into words

In [None]:
def process_data(data_rdd):
    """Lower-case every element of ``data_rdd`` and split it into words.

    ``data_rdd`` is expected to hold strings. Each one is lower-cased, then
    split on whitespace with ``str.split()``; the resulting words are
    flattened into a single RDD, which is returned. Punctuation attached to
    a word is kept as part of that word.
    """
    return (
        data_rdd
        .map(lambda line: line.lower())        # lowercase
        .flatMap(lambda line: line.split())    # split and flatten
    )

In [None]:
# Apply the process_data() function to the RDD
processed_rdd = process_data(rdd1)
print(processed_rdd.collect())

['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.', 'lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.', 'collatio', 'igitur', 'ista', 'tenihil', 'iuvat.', 'honesta', 'oratio,', 'socratica,', 'platonis', 'etiam.', 'primum', 'in', 'nostranepotestate', 'est,', 'quid', 'meminerimus?', 'duo', 'reges:', 'constructio', 'interrete.', 'quid,', 'sietiam', 'iucunda', 'memoria', 'est', 'praeteritorum', 'malorum?', 'si', 'quidem,', 'inquit,', 'tollerem,']


### Remove the stopwords from the previously processed text

In [None]:
# Words to be dropped from the processed text
stopwords = [
    'a', 'all', 'the', 'as', 'is', 'am', 'an', 'and',
    'be', 'been', 'from', 'had', 'I', 'I’d', 'why', 'with',
]
# Hint: you may need to use flatMap

In [None]:
def remove_stopwords(data_rdd, stop_words=None):
    """Return ``data_rdd`` without the words that are stopwords.

    Matching is case-insensitive: both the stopwords and the words in the
    RDD are lower-cased before comparison. Without this, capitalised
    stopwords such as 'I' could never match tokens that were already
    lower-cased by ``process_data``.

    Parameters
    ----------
    data_rdd
        An RDD of strings (anything exposing ``filter(predicate)`` works).
    stop_words
        Optional iterable of words to remove. When omitted, the
        notebook-level ``stopwords`` list is used.

    Returns
    -------
    The result of ``data_rdd.filter(...)``, containing only the words that
    are not stopwords.
    """
    if stop_words is None:
        stop_words = stopwords

    # Build the lookup once, outside the per-element predicate, so every
    # membership test is a constant-time set lookup rather than a list scan.
    blocked = frozenset(word.lower() for word in stop_words)

    # Remove the stopwords from the data
    return data_rdd.filter(lambda word: word.lower() not in blocked)

# Load the text file as an RDD of lines
my_rdd = sc.textFile("sample1.txt")

# Process Data: lower-case each line and split it into words
my_processed_rdd = process_data(my_rdd)

# Remove the stopwords from processed data
my_filtered_rdd = remove_stopwords(my_processed_rdd)

# Display the first 10 words in the filtered RDD
first_10_words = my_filtered_rdd.take(10)
print(first_10_words)

['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.', 'lorem', 'ipsum', 'dolor', 'sit', 'amet,']


### Find the words starting with ‘c’

In [None]:
# Rebuild the cleaned word RDD from the file so this cell runs on its own
my_rdd = sc.textFile("sample1.txt")
my_processed_rdd = process_data(my_rdd)
my_filtered_rdd = remove_stopwords(my_processed_rdd)

# Keep only the words whose first letter is 'c'
my_c_words_rdd = my_filtered_rdd.filter(lambda word: word.startswith('c'))

print(my_c_words_rdd.collect())

['causa', 'consectetur', 'collatio', 'constructio']


### Reduce the data by key and sum it (use the data from the following list)

In [None]:
# (key, value) tuples with repeated keys, to be summed per key.
# NOTE: the name `list` shadows the built-in `list` for the rest of the
# notebook; it is kept because the following cell refers to it by this name.
list = [
    ('JK', 22), ('V', 24), ('Jimin', 24), ('RM', 25),
    ('J-Hope', 25), ('Suga', 26), ('Jin', 27),
    ('J-Hope', 12), ('Suga', 25), ('Jin', 34),
    ('JK', 32), ('V', 44), ('Jimin', 14), ('RM', 35),
]
# Hint: use reduceByKey

In [None]:
# Distribute the (key, value) tuples as an RDD
my_rdd = sc.parallelize(list)

# Add together all values that share the same key
my_sum_rdd = my_rdd.reduceByKey(lambda total, value: total + value)

# Show the per-key sums
print(my_sum_rdd.collect())

[('JK', 54), ('J-Hope', 37), ('Suga', 51), ('V', 68), ('RM', 60), ('Jin', 61), ('Jimin', 38)]


### Create some key-value pair RDDs

In [None]:
# Two small RDDs of (key, value) tuples. Note that `rdd1` is rebound here;
# it no longer refers to the RDD read from sample1.txt.
rdd1 = sc.parallelize([('a',2),('b',3)])
# rdd2 shares keys 'a' and 'b' with rdd1 and has one extra key, 'c'
rdd2 = sc.parallelize([('a',9),('b',7),('c',10)])

In [None]:
# Show the (key, value) tuples held by rdd1
print(rdd1.collect())

[('a', 2), ('b', 3)]


In [None]:
# Show the (key, value) tuples held by rdd2
print(rdd2.collect())

[('a', 9), ('b', 7), ('c', 10)]


### Perform Join operation on the RDDs (rdd1,rdd2)

In [None]:
# Perform a join operation on the RDDs
rdd_join = rdd1.join(rdd2)

print(rdd_join.collect())