In [1]:
# Create Spark Context with SparkConf
from pyspark import SparkConf, SparkContext
conf = SparkConf()
sc = SparkContext.getOrCreate(conf)


23/12/10 19:37:45 WARN Utils: Your hostname, user-HP-EliteBook-840-G7-Notebook-PC resolves to a loopback address: 127.0.1.1; using 192.168.1.141 instead (on interface wlp0s20f3)


In [2]:
# Excercise 1
# Add the phone prefix to the numbers using as reference the International Calling Codes
# Use a Broadcast Variable

input_data = [("Simón","Bolivar","VEN","489 895 965"),
    ("Fidel","Castro","CU","956 268 348"),
    ("Jose","Doroteo","MEX","985 621 444"),
    ("Ernesto","Guevara","AR","895 325 481"),
    ("Hugo","Chávez","VE","489 895 965"),
    ("Camilo","Cienfuegos","CUB","956 268 348"),
    ("Andrés Manuel","López","ME","985 621 444"),
    ("Juan Domingo","Perón","ARG","985 621 444"),
  ]

rdd = sc.parallelize(input_data)

In [3]:
# Solution

states = {"VEN VE":"+58", "CU CUB":"+53", "ME MEX":"+52", "AR ARG":"+54"}
transformed_states = {}

for key, value in states.items():
    codes = key.split()
    for code in codes:
        transformed_states[code] = value

print(transformed_states)
broadcastStates = sc.broadcast(transformed_states)

def country_convert(code):
    return broadcastStates.value[code]

result = rdd.map(lambda x: (x[0],x[1],x[2],country_convert(x[2])+" "+x[3])).collect()
print('\n')
for item in result:
    print(item)

{'VEN': '+58', 'VE': '+58', 'CU': '+53', 'CUB': '+53', 'ME': '+52', 'MEX': '+52', 'AR': '+54', 'ARG': '+54'}


('Simón', 'Bolivar', 'VEN', '+58 489 895 965')
('Fidel', 'Castro', 'CU', '+53 956 268 348')
('Jose', 'Doroteo', 'MEX', '+52 985 621 444')
('Ernesto', 'Guevara', 'AR', '+54 895 325 481')
('Hugo', 'Chávez', 'VE', '+58 489 895 965')
('Camilo', 'Cienfuegos', 'CUB', '+53 956 268 348')
('Andrés Manuel', 'López', 'ME', '+52 985 621 444')
('Juan Domingo', 'Perón', 'ARG', '+54 985 621 444')


In [4]:
# Excercise 2
# Count the number of times the word 'to' appears in a line and the number of lines in the bible.txt file
# Use Accumulators

input_file_path="bible.txt"
rdd = sc.textFile(input_file_path)

In [6]:
# Solution

keyword = "to"
accumulator_word = sc.accumulator(0)
accumulator_lines = sc.accumulator(0)

# Define a function to check if a line contains the keyword and update the accumulator
def process_line(line):
    global accumulator_word
    global accumulator_lines
    accumulator_lines += 1
    if keyword in line:
        accumulator_word += 1

# Load file
rdd = sc.textFile(input_file_path)

# Use the accumulators
rdd.foreach(process_line)

print("Number of lines containing the keyword '{}': {} in a total of {} lines" \
      .format(keyword, accumulator_word.value, accumulator_lines.value))

Number of lines containing the keyword 'to': 16548 in a total of 30383 lines


In [7]:
# Excercise 3
# Write the RDD containing the pagecounts dataset 
# Write the RDD but with only 2 partitions+
# Use Repartition

input_file_path="pagecounts"
rdd = sc.textFile(input_file_path)

In [8]:
# Solution

reparted_rdd = rdd.repartition(2)

output_directory = 'output'
output_reparted_directory = 'output_reparted'

rdd.saveAsTextFile(output_directory)
reparted_rdd.saveAsTextFile(output_reparted_directory)

In [9]:
# Excercise 4
# Check the differences in computation time when using cache method on an rdd
# read pagecount files and count lines with and without using cache method
# show the time differences
# Use Cache

input_file_path="pagecounts"
rdd = sc.textFile(input_file_path)


In [10]:
# Solution
import time

cache_rdd = rdd.cache()

start_time = time.time()
cache_rdd.count()
end_time = time.time()
total_time = end_time - start_time
print(f"Execution time without cache: {total_time:.6f} seconds")

start_time = time.time()
cache_rdd.count()
end_time = time.time()
total_time = end_time - start_time
print(f"Execution time with cache: {total_time:.6f} seconds")

# cache is the shorthand for persist(StorageLevel.MEMORY_ONLY)

Execution time without cache: 5.474665 seconds
Execution time with cache: 3.362903 seconds


In [None]:
# Excercise 5
# use spark-submit to launch the app.py file by yourself
# :)

In [6]:
sc.stop()