In [1]:
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every later cell.
# The configuration turns on dynamic allocation with shuffle tracking, leaves
# the external shuffle service off, sets the executor idle timeout to 30s,
# requests 2 cores per executor and pins the driver / block-manager ports.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.119:7077")
    .appName("victor_hwasser_applicationA")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/02 18:56:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/02 18:56:58 WARN Utils: Service 'sparkDriver' could not bind on port 9998. Attempting port 9999.
22/03/02 18:56:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/02 18:57:00 WARN StandaloneAppClient$ClientEndpoint: Failed to connect to master 192.168.2.119:7077
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:301)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:101)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.s

KeyboardInterrupt: 

In [3]:
# A1 - load both halves of the parallel corpus and check that they line up.

from pyspark import SparkConf, SparkContext

sc = spark_session.sparkContext

# One RDD of raw text lines per language.
text_sv = sc.textFile('hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.sv')
text_en = sc.textFile('hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.en')

# To eyeball the raw data, run e.g. text_sv.take(10) here.

# Line counts first, then the partition layout of each RDD.
count_sv, count_en = text_sv.count(), text_en.count()
count_sv_par, count_en_par = text_sv.getNumPartitions(), text_en.getNumPartitions()

# A parallel corpus must have exactly one line per sentence in each language.
if count_sv != count_en:
    print("The documents doesn't have the same amount of lines, something's wrong!", count_sv, "vs", count_en)
else:
    print("Both text documents have the same amount of lines:", count_sv)
    print("Number of partitions:", count_sv_par, "and", count_en_par)
    

[Stage 1:>                                                          (0 + 2) / 2]

Both text documents have the same amount of lines: 1862234
Number of partitions: 3 and 2


                                                                                

In [4]:
# A2

# preprocess text
def pre_text(text):
    """Lower-case every line and split it into whitespace-separated tokens.

    Args:
        text: RDD of strings, one sentence per element.

    Returns:
        An RDD holding one list of lower-cased tokens per input line.
    """
    # map (not flatMap) keeps exactly one element per input line, which the
    # line-count comparison in A.2.3 relies on.
    return text.map(lambda sentence: sentence.lower().split())

# Tokenised sentences: one list of lower-cased words per corpus line.
words_sv, words_en = pre_text(text_sv), pre_text(text_en)

# A.2.2 Look at the first ten entries of each RDD to sanity-check the pre-processing.
first_sv = words_sv.take(10)
first_en = words_en.take(10)

print(first_sv)
print(first_en)

# A.2.3 Pre-processing must neither add nor drop lines, so the counts should still agree.
count_sv, count_en = words_sv.count(), words_en.count()

if count_sv != count_en:
    print("The documents doesn't have the same amount of lines, something's wrong!", count_sv, "vs", count_en)
else:
    print("Both text documents have the same amount of lines:", count_sv)



[['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.'], ['jag', 'ber', 'er', 'resa', 'er', 'för', 'en', 'tyst', 'minut.'], 

[Stage 5:>                                                          (0 + 2) / 2]

Both text documents have the same amount of lines: 1862234


                                                                                

In [5]:
# A3

# word count, basically a map-reducer
def count_words(text):
    """Count the occurrences of each lower-cased, whitespace-separated token.

    Args:
        text: RDD of strings, one sentence per element.

    Returns:
        An RDD of (word, occurrences) pairs.
    """
    # Flatten all sentences into a single stream of lower-cased tokens.
    tokens = text.flatMap(lambda sentence: sentence.lower().split())
    # Classic map/reduce word count: emit (word, 1), then sum per word.
    ones = tokens.map(lambda token: (token, 1))
    return ones.reduceByKey(lambda left, right: left + right)

# Approach: run the word-count map/reduce, order the pairs by count
# (largest first) and keep the first ten.
counted_words_en = count_words(text_en)
sorted_words_en = counted_words_en.sortBy(lambda pair: pair[1], ascending=False)
top_en = sorted_words_en.take(10)
print(top_en)

# Identical pipeline for the Swedish text.
counted_words_sv = count_words(text_sv)
sorted_words_sv = counted_words_sv.sortBy(lambda pair: pair[1], ascending=False)
top_sv = sorted_words_sv.take(10)
print(top_sv)

# A.3.2 Verify that your results are reasonable.
# Most frequent words found:
#   English: the, of, to, and, in, that, a
#   Swedish: att, och, i, det, som, för, av
# These are all very common function words, so the result is plausible.


                                                                                

[('the', 3498574), ('of', 1659884), ('to', 1539823), ('and', 1288620), ('in', 1086089), ('that', 797576), ('a', 773812), ('is', 758087), ('for', 534270), ('we', 522879)]


[Stage 18:>                                                         (0 + 3) / 3]

[('att', 1706309), ('och', 1344895), ('i', 1050989), ('det', 924878), ('som', 913302), ('för', 908703), ('av', 738102), ('är', 694389), ('en', 620347), ('vi', 539808)]


                                                                                

In [7]:
# A4

def _index_as_key(pair):
    """Swap (tokens, line_index) into (line_index, tokens) so the index becomes the join key."""
    tokens, line_index = pair
    return (line_index, tokens)


def _both_non_empty(row):
    """True when neither token list of a joined (index, (sv, en)) row is empty."""
    _, (tokens_sv, tokens_en) = row
    return tokens_sv != [] and tokens_en != []


# Number every sentence so the two languages can be matched line by line.
sv_1 = words_sv.zipWithIndex()
en_1 = words_en.zipWithIndex()
sv_2 = sv_1.map(_index_as_key)
en_2 = en_1.map(_index_as_key)

# Rows now look like (line_index, (swedish_tokens, english_tokens)).
step3 = sv_2.join(en_2)
step4 = step3.filter(_both_non_empty)
# Keep only short sentences (fewer than 4 Swedish tokens) ...
step5 = step4.filter(lambda row: len(row[1][0]) < 4)
# ... whose Swedish and English sides have the same number of tokens.
step6 = step5.filter(lambda row: len(row[1][0]) == len(row[1][1]))
# Pair the tokens position by position and count each (swedish, english) pair.
step7 = step6.flatMap(lambda row: list(zip(row[1][0], row[1][1])))
step8 = step7.map(lambda word_pair: (word_pair, 1)).reduceByKey(lambda left, right: left + right)
step9 = step8.sortBy(lambda entry: entry[1], ascending=False)
print(step9.take(10))

# The result looks plausible: the top pairs are direct translations,
# with "(applåder)" / "(applause)" being by far the most frequent.

                                                                                

[(('(applåder)', '(applause)'), 2546), (('.', '.'), 2081), (('är', 'is'), 792), (('applåder', 'applause'), 451), (('2.', '2.'), 438), (('1.', '1.'), 438), (('3.', '3.'), 405), (('varför?', 'why?'), 369), (('det', 'that'), 291), (('tack,', 'thank'), 264)]
