In [None]:
# Create Spark Context with SparkConf
from pyspark import SparkConf, SparkContext
# Configure Spark to run locally ("local[*]" master) under the application name "app".
conf = SparkConf().setMaster("local[*]").setAppName("app")
# Reuse an already-running context when there is one, so re-running this cell is safe.
sc = SparkContext.getOrCreate(conf)
# Only report errors; keeps Spark's log output out of the notebook cells.
sc.setLogLevel("ERROR")

In [None]:
# Path of the input data set (a relative path; where it resolves depends on
# how Spark is configured — TODO confirm it is reachable from the notebook).
inputFilePath="pagecounts"
# Expose the input as an RDD of text lines; every later cell derives from `rdd`.
rdd = sc.textFile(inputFilePath)

In [None]:
# Exercise 1

# Tokenise every raw line on single spaces, then preview the first ten results.
rdd1 = rdd.map(lambda raw: raw.split(' '))
for fields in rdd1.take(10):
    print(fields)
    

In [None]:
# Exercise 2

# count() returns the number of elements of the RDD, i.e. the number of input
# lines (presumably one line per page record — verify against the data set).
numLines = rdd.count()
print('Number of pages:',numLines)


In [None]:
# Exercise 3

# Keep only the records whose first space-separated field is exactly 'en'.
# Two equivalent formulations are computed so their counts can be compared.

# Option 1: filter the already-split RDD on its first field.
enPages1 = rdd1.filter(lambda fields: fields[0] == 'en')
# Option 2: filter the raw lines. The trailing space in the prefix is required:
# a bare 'en' prefix would also accept every line whose first field merely
# begins with "en" (for example a code such as 'en.b'), and the two counts
# below would then disagree.
enPages2 = rdd.filter(lambda line: line.startswith('en '))
# Later cells work on the raw-line variant.
enPages = enPages2

numEnLines1 = enPages1.count()
numEnLines2 = enPages2.count()
print('\nNumber of EN pages:',numEnLines1)
print('\nNumber of EN pages:',numEnLines2)


In [None]:
# Exercise 4

# Split each line on single spaces, keep only the lines that yield exactly
# four fields, and turn them into tuples whose last two fields are integers.
enPagesTuples = (
    enPages
    .map(lambda raw: raw.split(" "))
    .filter(lambda fields: len(fields) == 4)
    .map(lambda fields: (fields[0], fields[1], int(fields[2]), int(fields[3])))
)
for record in enPagesTuples.take(10):
    print(record)

In [None]:
# Exercise 5

# The five records with the largest value at index 2 (the visit count), in
# descending order. top() selects them without sorting the whole RDD, and the
# single-line form removes the dangling line continuation that only parsed
# because a blank line happened to follow it.
topSortedEnPages = enPagesTuples.top(5, key=lambda x: x[2])

for page in topSortedEnPages:
    print(page)

In [None]:
# Exercise 6
# Baseline: sort every record by visit count and take the first one.

import time
# perf_counter() is the clock intended for measuring elapsed time; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()

top = enPagesTuples.sortBy(lambda x: x[2], ascending=False).first()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print('Name: ' + top[1] + "\tNumber of Visits: " + str(top[2]))
print(f"Execution time: {elapsed_time} seconds")

In [None]:
# Exercise 6
# Another option. Lower complexity: no sort, two linear passes over the data.

import time
# perf_counter() is the clock intended for measuring elapsed time; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()

# Pass 1: find the largest visit count.
maxValue = enPagesTuples.map(lambda x: x[2]).reduce(lambda x, y: max(x, y))
# Pass 2: fetch a record that has that visit count.
top = enPagesTuples.filter(lambda x: x[2] == maxValue).first()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print('Name: ' + top[1] + "\tNumber of Visits: " + str(top[2]))
print(f"Execution time: {elapsed_time} seconds")

In [None]:
# Exercise 6
# Yet another, even better option: a single reduce that carries the whole record.

import time
# perf_counter() is the clock intended for measuring elapsed time; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()

# Pairwise, keep whichever record has the larger visit count (index 2).
top = enPagesTuples.reduce(lambda t1, t2: t1 if t1[2] > t2[2] else t2)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print('Name: ' + top[1] + "\tNumber of Visits: " + str(top[2]))
print(f"Execution time: {elapsed_time} seconds")

In [None]:
# Exercise 7

from pyspark.rdd import RDD

def histogram(page_rdd: RDD, n_bins: int) -> RDD:
    """Count how many records fall into each of ``n_bins`` equal-width bins.

    The binned quantity is the item at index 2 of every record. The bins
    span the closed interval from the smallest to the largest such value;
    the largest value is counted in the last bin.

    Also prints the bounds and the edges of all ``n_bins`` bins.

    Args:
        page_rdd: RDD of indexable records with a numeric item at index 2.
        n_bins: number of bins; must be at least 1.

    Returns:
        RDD of ``(bin_start, count)`` pairs sorted by ``bin_start``. Bins
        that contain no record are not listed.

    Raises:
        ValueError: if ``n_bins`` is smaller than 1. An empty ``page_rdd`` is
            not handled here; the bounds computation fails in that case.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")

    # First, calculate the bounds (min and max) in a single pass.
    bounds = page_rdd.map(lambda x: (x[2], x[2])) \
        .reduce(lambda t1, t2: (min(t1[0], t2[0]), max(t1[1], t2[1])))
    lower, upper = bounds

    hist_range = upper - lower
    bin_width = hist_range / n_bins
    print('bounds:' ,bounds)
    # range(n_bins) so that every bin, including the last one, is printed.
    for i in range(n_bins):
        print(lower + bin_width * i, lower + bin_width * (i + 1))

    last_bin = n_bins - 1

    def to_bin_start(value):
        """Return the lower edge of the bin that ``value`` belongs to."""
        if bin_width == 0:
            # All values are identical: everything goes into the first bin
            # (and floor division by zero is avoided).
            index = 0
        else:
            # Clamp so the maximum value lands in the last bin instead of
            # opening an extra bin beyond the requested n_bins.
            index = min(int((value - lower) // bin_width), last_bin)
        return lower + index * bin_width

    # Count with reduceByKey: only running totals are kept, instead of
    # materializing every group just to take its length.
    histogram_result = page_rdd.map(lambda t: (to_bin_start(t[2]), 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda x: x[0])

    return histogram_result
    
# Show the 20-bin histogram of the English pages, one (bin_start, count) pair per line.
print('\n Histogram bins:')
for bin_entry in histogram(enPagesTuples, 20).collect():
    print(bin_entry)

In [None]:
# Exercise 7 Pro

from pyspark import RDD

def create_histogram(page_rdd: RDD, n_bins: int) -> None:
    """Print a text histogram of the item at index 2 of every record.

    The values are grouped into ``n_bins`` equal-width bins spanning the
    interval from the smallest to the largest value; the largest value is
    counted in the last bin. One line is printed per non-empty bin, with a
    bar of ``*`` characters scaled so that the fullest bin gets 40 of them.

    Args:
        page_rdd: RDD of indexable records with a numeric item at index 2.
        n_bins: number of bins; must be at least 1.

    Raises:
        ValueError: if ``n_bins`` is smaller than 1. An empty ``page_rdd`` is
            not handled here; the bounds computation fails in that case.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")

    # First, calculate the bounds (min and max) in a single pass.
    lower, upper = page_rdd.map(lambda x: (x[2], x[2])) \
                           .reduce(lambda t1, t2: (min(t1[0], t2[0]), max(t1[1], t2[1])))

    bin_width = (upper - lower) / n_bins
    last_bin = n_bins - 1

    def to_bin_start(value):
        """Return the lower edge of the bin that ``value`` belongs to."""
        if bin_width == 0:
            # All values are identical: everything goes into the first bin
            # (and floor division by zero is avoided).
            index = 0
        else:
            # Clamp so the maximum value lands in the last bin instead of
            # opening an extra bin beyond the requested n_bins.
            index = min(int((value - lower) // bin_width), last_bin)
        return lower + index * bin_width

    # Count with reduceByKey (running totals only), then bring the small,
    # sorted result to the driver for printing.
    histogram_result = page_rdd.map(lambda t: (to_bin_start(t[2]), 1)) \
                               .reduceByKey(lambda a, b: a + b) \
                               .sortBy(lambda x: x[0]) \
                               .collect()

    max_count = max(count for _, count in histogram_result)
    print('Histogram:')
    for bin_start, count in histogram_result:
        bin_end = bin_start + bin_width
        bar_length = int(40 * count / max_count)  # fullest bin -> 40 characters
        print(f"{bin_start:.2f} - {bin_end:.2f}: {'*' * bar_length} ({count})")

# Render the text histogram of the English pages (enPagesTuples comes from Exercise 4) with 20 bins.
create_histogram(page_rdd=enPagesTuples, n_bins=20)

In [None]:
# Stop the SparkContext once all exercises have run. Cells that use `sc` or
# the RDDs derived from it need the first cell to be re-run afterwards.
sc.stop()