### Import the required libraries, then create a SparkContext

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=84075236904219919de5d055c2104b225e33f082ad586ed79f79cc4eb2021b82
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [4]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that the later cells access through `spark`
spark = (
    SparkSession.builder
    .appName("H_W_DM_RDD")
    .getOrCreate()
)

### Create and display an RDD from the following list

In [5]:
# (name, number) pairs used to build the first RDD.
# NOTE(review): the name `list` shadows Python's built-in list type for the rest of the
# session; it is kept here because the next cell references it by this name.
list = [
    ('JK', 22),
    ('V', 24),
    ('Jimin', 24),
    ('RM', 25),
    ('J-Hope', 25),
    ('Suga', 26),
    ('Jin', 27),
]

In [6]:
# Distribute the local list of pairs as an RDD
rdd = spark.sparkContext.parallelize(list)

# The section asks to create *and display* the RDD: collect() brings the elements back to
# the driver, and as the cell's last expression the resulting list is shown as output.
rdd.collect()


### Create a sample1.txt file to contain the text shown below.

In [16]:
# Define the text content
text_content = '''Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostra nepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, si etiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,'''

# Create or overwrite the sample1.txt file and write the text to it.
# The encoding is passed explicitly so the bytes on disk do not depend on the platform's
# locale default; Spark's textFile expects UTF-8 input.
with open('sample1.txt', 'w', encoding='utf-8') as file:
    file.write(text_content)

print("sample1.txt has been created with the provided text.")


sample1.txt has been created with the provided text.


In [None]:
# Prints the exercise text as a multi-line string literal.
# NOTE(review): lines 5 and 7 read 'nostranepotestate' / 'sietiam' here, while the text
# written to sample1.txt has 'nostra nepotestate' / 'si etiam' — confirm which is intended.
print('''
Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Collatio igitur ista tenihil iuvat.
Honesta oratio, Socratica, Platonis etiam.
Primum in nostranepotestate est, quid meminerimus?
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum?
Si quidem, inquit, tollerem,''')


Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Collatio igitur ista tenihil iuvat. 
Honesta oratio, Socratica, Platonis etiam. 
Primum in nostranepotestate est, quid meminerimus? 
Duo Reges: constructio interrete.
Quid, sietiam iucunda memoria est praeteritorum malorum? 
Si quidem, inquit, tollerem,


### Read the sample1.txt file into an RDD and display the first 4 elements

In [17]:
# Load sample1.txt as an RDD with one element per line of the file
rdd = spark.sparkContext.textFile("sample1.txt")

# take(4) brings only the first four elements back to the driver (not the whole RDD)
first_4_lines = rdd.take(4)
for line in first_4_lines:
    print(line)

Utilitatis causa amicitia est quaesita.
Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Collatio igitur ista tenihil iuvat. 
Honesta oratio, Socratica, Platonis etiam. 


['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. ',
 'Collatio igitur ista tenihil iuvat. ',
 'Honesta oratio, Socratica, Platonis etiam. ']

### Count the total number of rows in RDD

In [18]:
# count() returns how many elements (here: lines of sample1.txt) the RDD holds
total_rows = rdd.count()
print("Total number of rows in the RDD:", total_rows)


Total number of rows in the RDD: 8


### Create a function to convert the data to lower case and split it into words

In [19]:

# Read sample1.txt into a dedicated RDD used as input for the lowercase/split step
text_rdd = spark.sparkContext.textFile("sample1.txt")

# Lowercase a line of text and break it into whitespace-separated tokens
def lowercase_and_split(text):
    """Return the whitespace-delimited words of ``text``, all in lower case.

    Punctuation is left attached to the words; an empty or all-whitespace
    string yields an empty list.
    """
    return text.lower().split()

# flatMap runs the function on every line and flattens the per-line word lists
# into a single RDD of words
result_rdd = text_rdd.flatMap(lowercase_and_split)

# Bring every word back to the driver
result = result_rdd.collect()

# Show one word per output line
for word in result:
    print(word)

utilitatis
causa
amicitia
est
quaesita.
lorem
ipsum
dolor
sit
amet,
consectetur
adipiscing
elit.
collatio
igitur
ista
tenihil
iuvat.
honesta
oratio,
socratica,
platonis
etiam.
primum
in
nostra
nepotestate
est,
quid
meminerimus?
duo
reges:
constructio
interrete.
quid,
si
etiam
iucunda
memoria
est
praeteritorum
malorum?
si
quidem,
inquit,
tollerem,


[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio', 'igitur', 'ista', 'tenihil', 'iuvat.'],
 ['honesta', 'oratio,', 'socratica,', 'platonis', 'etiam.'],
 ['primum', 'in', 'nostranepotestate', 'est,', 'quid', 'meminerimus?']]

### Remove the stopwords from the previous text, i.e. filter them out of the words.

In [None]:
# Words to be filtered out of the text in this exercise
# Hint: you may need to use flatMap
stopwords = [
    'a', 'all', 'the', 'as', 'is', 'am', 'an', 'and',
    'be', 'been', 'from', 'had', 'I', 'I’d', 'why', 'with',
]

In [23]:
# Load text data from "sample1.txt" into an RDD (input for the stopword-removal step below)
text_removestop = spark.sparkContext.textFile("sample1.txt")

# List of stopwords (you can expand this list with more stopwords)
stopwords = ['a','all','the','as','is','am','an','and',
             'be','been','from','had','I','I’d','why','with']

# Function to remove stopwords, convert to lowercase, and split text
def preprocess_text(text):
    """Lowercase ``text`` and drop every word that is a stopword.

    Args:
        text: One line of input text.

    Returns:
        The remaining words, lowercased and joined by single spaces
        (an empty string if nothing remains).
    """
    # Each word is lowercased before the lookup, so the stopwords must be lowercased too;
    # otherwise mixed-case entries such as 'I' could never match.
    # Built on every call so that later additions to `stopwords` are picked up.
    lowered_stopwords = {stopword.lower() for stopword in stopwords}

    # Split the text into words, lowercase them and keep the non-stopwords
    lowered_words = (word.lower() for word in text.split())
    cleaned_text = " ".join(word for word in lowered_words if word not in lowered_stopwords)
    return cleaned_text

# Clean every line: one cleaned string per input line
result_rdd = text_removestop.map(preprocess_text)

# Break each cleaned line into words and flatten them into a single RDD
split_result_rdd = result_rdd.flatMap(lambda cleaned_line: cleaned_line.split())

# Bring the words back to the driver
result = split_result_rdd.collect()

# Show one word per output line
for word in result:
    print(word)


utilitatis
causa
amicitia
est
quaesita.
lorem
ipsum
dolor
sit
amet,
consectetur
adipiscing
elit.
collatio
igitur
ista
tenihil
iuvat.
honesta
oratio,
socratica,
platonis
etiam.
primum
in
nostra
nepotestate
est,
quid
meminerimus?
duo
reges:
constructio
interrete.
quid,
si
etiam
iucunda
memoria
est
praeteritorum
malorum?
si
quidem,
inquit,
tollerem,


['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'collatio',
 'igitur',
 'ista',
 'tenihil',
 'iuvat.',
 'honesta',
 'oratio,',
 'socratica,',
 'platonis',
 'etiam.',
 'primum',
 'in',
 'nostranepotestate',
 'est,',
 'quid',
 'meminerimus?',
 'duo',
 'reges:',
 'constructio',
 'interrete.',
 'quid,',
 'sietiam',
 'iucunda',
 'memoria',
 'est',
 'praeteritorum',
 'malorum?',
 'si',
 'quidem,',
 'inquit,',
 'tollerem,']

### Find the words starting with ‘c’

In [24]:
def find_words_starting_with_c(line):
    """Return the lowercased words of ``line`` that begin with the letter 'c'.

    The line is lowercased first, so words starting with 'C' are included too.
    Words are separated on whitespace and keep any attached punctuation.
    """
    matches = []
    for token in line.lower().split():
        if token.startswith('c'):
            matches.append(token)
    return matches

# Apply the filter to every line of the text and flatten the per-line matches.
# The dedicated `text_rdd` is used rather than the generic `rdd`, because `rdd` is rebound
# to a key/value RDD in other cells and would break this cell when run out of order.
c_words_rdd = text_rdd.flatMap(find_words_starting_with_c)

# Collect the words starting with 'c' on the driver
c_words = c_words_rdd.collect()

# Print the words starting with 'c', one per line
for word in c_words:
    print(word)

causa
consectetur
collatio
constructio


['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

> Indented block



In [26]:
# (name, number) pairs in which every name occurs twice, to be summed per name.
# NOTE(review): `list` shadows the built-in type; kept because the next cell uses this name.
# Hint: use reduceByKey
list = [
    ('JK', 22), ('V', 24), ('Jimin', 24), ('RM', 25),
    ('J-Hope', 25), ('Suga', 26), ('Jin', 27),
    ('J-Hope', 12), ('Suga', 25), ('Jin', 34),
    ('JK', 32), ('V', 44), ('Jimin', 14), ('RM', 35),
]

In [27]:


# Turn the (name, number) pairs into an RDD
rdd = spark.sparkContext.parallelize(list)

# reduceByKey merges all values that share a key using the given function (here: addition)
summed_rdd = rdd.reduceByKey(lambda left, right: left + right)

# Bring the per-key totals back to the driver
result = summed_rdd.collect()

# Print one "name: total" line per key
for key, value in result:
    print(f'{key}: {value}')


Suga: 51
Jin: 61
JK: 54
V: 68
Jimin: 38
RM: 60
J-Hope: 37


### Create some key-value pair RDDs

In [29]:
# Two small key/value RDDs; the key 'c' exists only in rdd2
rdd1 = spark.sparkContext.parallelize([
    ('a', 2),
    ('b', 3),
])
rdd2 = spark.sparkContext.parallelize([
    ('a', 9),
    ('b', 7),
    ('c', 10),
])

In [31]:
# Bring rdd1's elements back to the driver so the cell displays them
rdd1.collect()

[('a', 2), ('b', 3)]

In [32]:
# Bring rdd2's elements back to the driver so the cell displays them
rdd2.collect()

[('a', 9), ('b', 7), ('c', 10)]

[('a', 2), ('b', 3)]

[('a', 9), ('b', 7), ('c', 10)]

### Perform Join operation on the RDDs (rdd1,rdd2)

[('b', (3, 7)), ('a', (2, 9))]

In [33]:
# Join the two RDDs on their keys: each result element is (key, (value_from_rdd1, value_from_rdd2)).
# Only keys present in both RDDs appear, so 'c' (only in rdd2) is absent from the output.
joined_rdd = rdd1.join(rdd2)

# Collect the joined pairs on the driver and print them
result = joined_rdd.collect()
print(result)

[('b', (3, 7)), ('a', (2, 9))]
