# A. 1

In [2]:
from operator import add
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. Dynamic allocation is
# switched on together with the shuffle service, the executor idle timeout is
# set to 30 seconds, and each executor is configured with 4 cores.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .appName("jialun_song_a3_part1")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Handle to the RDD API that the rest of the notebook works with.
spark_context = spark_session.sparkContext

In [85]:
# Read the English and Swedish files of the Europarl sv-en corpus from HDFS.
# textFile yields one RDD element per line of the file.
parallel_corpus_en_rdd = spark_context.textFile(
    "hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en"
)
parallel_corpus_sv_rdd = spark_context.textFile(
    "hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv"
)

In [86]:
# Every count() is a full Spark job over the file, so count each corpus exactly
# once and reuse the numbers instead of counting the English side twice.
en_line_count = parallel_corpus_en_rdd.count()
sv_line_count = parallel_corpus_sv_rdd.count()

print(en_line_count)
# Both files must have the same number of lines for line-by-line pairing to make sense.
en_line_count == sv_line_count

1862234


True

In [87]:
# Report how many partitions each input RDD has (as given by getNumPartitions()).
print(f"Number of partitions of the English transcript: {parallel_corpus_en_rdd.getNumPartitions()}.")
print(f"Number of partitions of the Swedish transcript: {parallel_corpus_sv_rdd.getNumPartitions()}.")

Number of partitions of the English transcript: 2.
Number of partitions of the Swedish transcript: 3.


# A. 2

In [88]:
def pre_process(line):
    """Lowercase a line of text and split it into whitespace-separated tokens.

    Punctuation is not stripped, so it stays attached to the neighbouring
    word. An empty or whitespace-only line produces an empty list.

    Args:
        line: the text of one line (a ``str``).

    Returns:
        A list of lowercase tokens in their original order.
    """
    lowered = line.lower()
    tokens = lowered.split()
    return tokens

# Tokenize both corpora and mark the results for caching.
# cache() returns the RDD itself, so it can be chained into the assignment.
# (Use rdd.is_cached to check; rdd.unpersist() to uncache.)
en_tokenized_rdd = parallel_corpus_en_rdd.map(pre_process).cache()
sv_tokenized_rdd = parallel_corpus_sv_rdd.map(pre_process).cache()
sv_tokenized_rdd  # last expression: shown as the cell's result

PythonRDD[157] at RDD at PythonRDD.scala:53

In [89]:
# Peek at the first ten tokenized lines of each language.
en_sample_lines = en_tokenized_rdd.take(10)
sv_sample_lines = sv_tokenized_rdd.take(10)
print(en_sample_lines, '\n', sv_sample_lines)

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [90]:
# Sanity check: tokenizing is a one-to-one map, so the line counts must still agree.
en_tokenized_count = en_tokenized_rdd.count()
sv_tokenized_count = sv_tokenized_rdd.count()
en_tokenized_count == sv_tokenized_count

True

# A. 3

In [91]:
def most_frequent_words(rdd, number=10):
    """Return the most frequent words in an RDD of tokenized lines.

    Args:
        rdd: an RDD whose elements are lists of words (one list per line).
        number: how many (word, count) pairs to return; defaults to 10.

    Returns:
        A list of at most ``number`` ``(word, count)`` tuples, ordered by
        descending count. Words with equal counts are ordered by the word
        itself, so the result does not depend on how the data is partitioned.
    """
    word_counts = (
        rdd
        .flatMap(lambda tokens: tokens)   # one element per word
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    # Negate the count for a descending order; the word is a secondary key so
    # that ties are broken deterministically instead of by partition order.
    return word_counts.takeOrdered(number, key=lambda pair: (-pair[1], pair[0]))
        
# Ten most frequent words in each language.
en_top_words = most_frequent_words(en_tokenized_rdd)
sv_top_words = most_frequent_words(sv_tokenized_rdd)
print(en_top_words, '\n', sv_top_words)

[('the', 3498574), ('of', 1659884), ('to', 1539823), ('and', 1288620), ('in', 1086089), ('that', 797576), ('a', 773812), ('is', 758087), ('for', 534270), ('we', 522879)] 
 [('att', 1706309), ('och', 1344895), ('i', 1050989), ('det', 924878), ('som', 913302), ('för', 908703), ('av', 738102), ('är', 694389), ('en', 620347), ('vi', 539808)]


In [92]:
# Verification 1 on English result
def word_count_in_line(line):
    """Count how often each word occurs within a single tokenized line.

    Args:
        line: an iterable of words (one tokenized line).

    Returns:
        The ``(word, count)`` items view of a dict, with words in order of
        first occurrence.
    """
    counts = {}
    for token in line:
        counts[token] = counts.get(token, 0) + 1
    return counts.items()

# Same top-10 computed a different way: words are counted within each line
# first, and the per-line counts are then summed per word.
(
    en_tokenized_rdd
    .flatMap(word_count_in_line)
    .reduceByKey(add)
    .takeOrdered(10, key=lambda word_count: -word_count[1])
)

[('the', 3498574),
 ('of', 1659884),
 ('to', 1539823),
 ('and', 1288620),
 ('in', 1086089),
 ('that', 797576),
 ('a', 773812),
 ('is', 758087),
 ('for', 534270),
 ('we', 522879)]

In [93]:
# Verification 2 on Swedish result
# Recompute the Swedish top-10 from the raw lines: flatMap over pre_process
# tokenizes and flattens in a single step.
(
    parallel_corpus_sv_rdd
    .flatMap(pre_process)
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .takeOrdered(10, key=lambda word_count: -word_count[1])
)

[('att', 1706309),
 ('och', 1344895),
 ('i', 1050989),
 ('det', 924878),
 ('som', 913302),
 ('för', 908703),
 ('av', 738102),
 ('är', 694389),
 ('en', 620347),
 ('vi', 539808)]

# A. 4

In [94]:
# zipWithIndex yields (tokens, index); swap each pair to (index, tokens) so
# the line index becomes the key that the join matches on.
en_line_index_rdd = en_tokenized_rdd.zipWithIndex().map(
    lambda tokens_and_index: (tokens_and_index[1], tokens_and_index[0])
)
sv_line_index_rdd = sv_tokenized_rdd.zipWithIndex().map(
    lambda tokens_and_index: (tokens_and_index[1], tokens_and_index[0])
)

# Each joined element has the form (line_index, (en_tokens, sv_tokens)).
joined_line_index_rdd = en_line_index_rdd.join(sv_line_index_rdd)

In [95]:
# Keep only non-empty line pairs whose English and Swedish sides have the same
# number of tokens, pair the tokens up position by position, and list the 80
# most frequent (english_word, swedish_word) pairs.
(
    joined_line_index_rdd
    .filter(lambda row: 0 < len(row[1][0]) == len(row[1][1]))
    .flatMap(lambda row: zip(row[1][0], row[1][1]))
    .map(lambda word_pair: (word_pair, 1))
    .reduceByKey(add)
    .takeOrdered(80, key=lambda pair_count: -pair_count[1])
)

[(('and', 'och'), 35291),
 (('is', 'är'), 27871),
 (('i', 'jag'), 27136),
 (('we', 'vi'), 26031),
 (('in', 'i'), 18615),
 (('to', 'att'), 18601),
 (('that', 'att'), 16463),
 (('a', 'en'), 15469),
 (('it', 'det'), 14698),
 (('of', 'av'), 12984),
 (('the', 'den'), 12592),
 (('this', 'detta'), 11429),
 (('not', 'inte'), 11141),
 (('the', 'det'), 10733),
 (('the', 'de'), 10232),
 (('for', 'för'), 10232),
 (('a', 'ett'), 9408),
 (('the', 'att'), 9376),
 (('the', 'i'), 8743),
 (('have', 'har'), 8212),
 (('to', 'till'), 7165),
 (('that', 'det'), 6736),
 (('mr', 'herr'), 6306),
 (('the', 'av'), 6154),
 (('with', 'med'), 5887),
 (('are', 'är'), 5880),
 (('will', 'att'), 5766),
 (('to', 'för'), 5751),
 (('there', 'det'), 5599),
 (('this', 'denna'), 5495),
 (('will', 'kommer'), 5429),
 (('the', 'för'), 5287),
 (('has', 'har'), 5278),
 (('the', 'och'), 5107),
 (('as', 'som'), 4938),
 (('must', 'måste'), 4934),
 (('this', 'det'), 4820),
 (('that', 'som'), 4572),
 (('the', 'som'), 4293),
 (('but', '