### Import the required libraries, then create a SparkContext

In [None]:
!pip install pyspark



In [None]:
import shutil

from pyspark import SparkContext
from pyspark.sql import SparkSession
# Build (or reuse) a Spark session running with master "local";
# the Spark UI port is set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

In [None]:
# Show the session object as the cell output.
spark

In [None]:
# Handle to the SparkContext owned by the session; used below to create RDDs.
sc = spark.sparkContext

### Create and display an RDD from the following list





In [None]:
# (string, integer) pairs used to build the first RDD.
data = [
    ('JK', 22),
    ('V', 24),
    ('Jimin', 24),
    ('RM', 25),
    ('J-Hope', 25),
    ('Suga', 26),
    ('Jin', 27),
]

In [None]:
# Turn the local Python list into an RDD through the SparkContext.
rdd = sc.parallelize(data)

# Evaluating the bare name shows only the RDD's repr, not its elements.
rdd

ParallelCollectionRDD[26] at readRDDFromFile at PythonRDD.scala:289

In [None]:
# collect() returns all elements of the RDD as a Python list.
rdd.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

### Create a sample1.txt file to contain the text shown below.

In [None]:
# Preview of the sample text; this cell only prints it, nothing is written to disk.
print('''
Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostranepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,''')


Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostranepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,


In [None]:
%%writefile sample.txt
Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostranepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,

In [None]:
# The whole passage is one multi-line string, so the RDD created below holds a
# single element; its embedded line breaks become separate lines in the saved text.
data = ['''Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostranepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,''']

rdd2 = sc.parallelize(data)

# saveAsTextFile() raises if the target path already exists, so remove the
# output of a previous run first to keep this cell re-runnable.
# NOTE(review): assumes "sample1.txt" resolves to the local filesystem
# (the session uses master "local") — confirm if a different default FS is configured.
shutil.rmtree("sample1.txt", ignore_errors=True)

# Spark writes "sample1.txt" as a directory of part files, not as a single file.
rdd2.saveAsTextFile("sample1.txt")

### Read the sample1.txt file into an RDD and display the first 4 elements

In [None]:
# textFile() produces one RDD element per line of text read from the path.
rdd2 = sc.textFile("sample1.txt")

In [None]:
# First four elements (lines) of the RDD.
rdd2.take(4)

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
 'Collatio igitur ista tenihil iuvat.',
 'Honesta oratio, Socratica, Platonis etiam.']

### Count the total number of rows in RDD

In [None]:
# Number of elements in the RDD, i.e. the number of lines read from sample1.txt.
rdd2.count()

8

### Create a function that converts the data to lower case and splits it into words

In [None]:
def preprocess_rdd(rdd):
    """Lower-case each line of ``rdd`` and split it on whitespace.

    Args:
        rdd: An object exposing ``map`` whose elements are strings.

    Returns:
        The result of ``rdd.map``: one list of lower-cased tokens per input
        line. Punctuation stays attached to the tokens.
    """
    return rdd.map(lambda text: text.lower().split())

# Apply the preprocessing to the lines read from sample1.txt.
rdd3 = preprocess_rdd(rdd2)

# Each element is now a list of lower-cased tokens for one line.
rdd3.take(5)

[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio', 'igitur', 'ista', 'tenihil', 'iuvat.'],
 ['honesta', 'oratio,', 'socratica,', 'platonis', 'etiam.'],
 ['primum', 'in', 'nostranepotestate', 'est,', 'quid', 'meminerimus?']]

### Remove the stopwords (listed below) from the previous text

In [None]:
# Every entry must be lower case: removeStopwords() lower-cases each token
# before looking it up here, so an upper-case entry such as 'I' could never match.
stopwords = ['a','all','the','as','is','am','an','and',
             'be','been','from','had','i','i’d','why','with']

In [None]:
def removeStopwords(line):
    """Split ``line`` into lower-cased tokens and drop those listed in ``stopwords``.

    The membership test uses the lower-cased token, so only lower-case
    entries of the module-level ``stopwords`` collection can match.

    Args:
        line: A string of text.

    Returns:
        A list of the remaining lower-cased, whitespace-separated tokens.
    """
    tokens = line.lower().split()
    return [token for token in tokens if token not in stopwords]

# flatMap flattens the per-line word lists into a single RDD of words.
rdd4 = rdd2.flatMap(removeStopwords)

In [None]:
# Show every remaining word (the sample text is small enough to collect in full).
rdd4.collect()

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'collatio',
 'igitur',
 'ista',
 'tenihil',
 'iuvat.',
 'honesta',
 'oratio,',
 'socratica,',
 'platonis',
 'etiam.',
 'primum',
 'in',
 'nostranepotestate',
 'est,',
 'quid',
 'meminerimus?',
 'duo',
 'reges:',
 'constructio',
 'interrete.',
 'quid,',
 'sietiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'si',
 'quidem,',
 'inquit,',
 'tollerem,']

### Find the words starting with ‘c’

In [None]:
# Keep only the words whose first character is a lower-case 'c'
# (the words in rdd4 were already lower-cased by removeStopwords).
rdd5 = rdd4.filter(lambda word: word.startswith('c'))

rdd5.take(4)

['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [None]:
# (string, integer) pairs in which every key appears twice.
data2 = [
    ('JK', 22), ('V', 24), ('Jimin', 24), ('RM', 25),
    ('J-Hope', 25), ('Suga', 26), ('Jin', 27),
    ('J-Hope', 12), ('Suga', 25), ('Jin', 34),
    ('JK', 32), ('V', 44), ('Jimin', 14), ('RM', 35),
]

In [None]:
# Build a pair RDD from the (key, value) tuples.
rdd6 = sc.parallelize(data2)

# Merge all values that share a key by adding them together.
reduced_rdd = rdd6.reduceByKey(lambda x, y: x + y)

# Bring the per-key sums back to the driver as a list of (key, sum) tuples.
result = reduced_rdd.collect()

print(result)

[('JK', 54), ('V', 68), ('Jimin', 38), ('RM', 60), ('J-Hope', 37), ('Suga', 51), ('Jin', 61)]


### Create some key-value pair RDDs

In [None]:
# Two small pair RDDs sharing the keys 'a' and 'b'; 'c' exists only in rdd2.
# NOTE: this rebinds rdd2, which earlier held the lines of sample1.txt. Cells above
# that read rdd2 without reloading it (e.g. the stop-word step) would see this
# pair RDD if re-run after this cell.
rdd1 = sc.parallelize([('a',2),('b',3)])
rdd2 = sc.parallelize([('a',9),('b',7),('c',10)])

In [None]:
# keyBy(f) turns each element x into the pair (f(x), x).
# new_rdd1 is keyed by a tuple rebuilt from both fields of the element;
# new_rdd2 is keyed by the numeric second field alone.
new_rdd1 = rdd1.keyBy(lambda x: (x[0],x[1]))
new_rdd2 = rdd2.keyBy(lambda x: x[1])

print(new_rdd1.collect())
print(new_rdd2.collect())

[(('a', 2), ('a', 2)), (('b', 3), ('b', 3))]
[(9, ('a', 9)), (7, ('b', 7)), (10, ('c', 10))]


### Perform Join operation on the RDDs (rdd1,rdd2)

In [None]:
# Join on the key: only keys present in both RDDs appear in the result, so 'c' is dropped.
# Each result element is (key, (value_from_rdd1, value_from_rdd2)); the order of the
# collected pairs may differ from the input order (see the output below).
joined_rdd = rdd1.join(rdd2)

print(joined_rdd.collect())

[('b', (3, 7)), ('a', (2, 9))]
