In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.119:7077") \
        .appName("LingkaiZhuA3")\
        .config("spark.executor.cores",8)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .config("spark.local.dir", "~/")\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/19 21:26:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/19 21:26:38 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/02/19 21:26:40 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


# Part A - Working with the RDD API

## Question A.1

A.1.1 Read the English transcripts with Spark, and count the number of lines.

In [2]:
lines_english = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.de-en.en")
print(lines_english.first())
lines_english1 = lines_english.map(lambda line: line.split('\n'))
line_english_counts = lines_english1.map(lambda w: len(w))
total_english_lines = line_english_counts.reduce(add)
print(f'total number of lines = {total_english_lines}')

                                                                                

Resumption of the session




total number of lines = 1920209


                                                                                

A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for each).

In [3]:
lines_de = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.de-en.de")
print(lines_de.first())
lines_de1 = lines_de.map(lambda line: line.split('\n'))
line_de_counts = lines_de1.map(lambda w: len(w))
total_de_lines = line_de_counts.reduce(add)
print(f'total number of lines = {total_de_lines}')

Wiederaufnahme der Sitzungsperiode




total number of lines = 1920209


                                                                                

A.1.3 Verify that the line counts are the same for the two languages.
In this case, the count of the english transcripts is 1920209, which is equal to its original language's text.

A.1.4 Count the number of partitions.

In [4]:
print("number of partitions of the english:", lines_english.getNumPartitions())
print("number of partitions of the original:", lines_de.getNumPartitions())

number of partitions of the english: 3
number of partitions of the original: 3


## Question A.2

A.2.1 Pre-process the text from both RDDs by doing the following:

● Lowercase the text

● Tokenize the text (split on space)

Hint: define a function to run in your driver application to avoid writing this code twice.

In [5]:
def preprocess(lines):
    lowercase_lines = lines.map(lambda line: line.lower())
    words = lowercase_lines\
    .flatMap(lambda line: line.split(' '))\
    .flatMap(lambda line: line.split('\n'))
    return lowercase_lines, words

A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing.

In [6]:
# english
[english_lowercase_lines, _] = preprocess(lines_english)
print(english_lowercase_lines.take(10))
print("----------------------------------------------")
# original language
[de_lowercase_lines, _] = preprocess(lines_de)
print(de_lowercase_lines.take(10))

['resumption of the session', 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999, and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'you have requested a debate on this subject in the course of the next few days, during this part-session.', "in the meantime, i should like to observe a minute' s silence, as a number of members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the european union.", "please rise, then, for this minute' s silence.", "(the house rose and observed a minute' s silence)", 'madam president, on a point of order.', 'you will be aware from the press and television that there have been a num

A.2.3 Verify that the line counts still match after the pre-processing.

In [7]:
# english
lines_english1 = english_lowercase_lines.map(lambda line: line.split('\n'))
line_english_counts = lines_english1.map(lambda w: len(w))
total_english_lines = line_english_counts.reduce(add)
print(f'total number of lines = {total_english_lines}')



total number of lines = 1920209


                                                                                

In [8]:
# original 
lines_de1 = de_lowercase_lines.map(lambda line: line.split('\n'))
line_de_counts = lines_de1.map(lambda w: len(w))
total_de_lines = line_de_counts.reduce(add)
print(f'total number of lines = {total_de_lines}')



total number of lines = 1920209


                                                                                

A.2.3 Verify that the line counts still match after the pre-processing.

After verification, the line counts are exactly the same as it is before preprocessing.

Total number of lines = 1920209

## Question1 A.3

A.3.1 Use Spark to compute the 10 most frequently according words in the English language corpus. Repeat for the other language.

In [9]:
# english
[_, english_words] = preprocess(lines_english)
english_word_key = english_words.map(lambda w: w.strip()).map(lambda w: (w, 1))
english_word_counts = english_word_key.reduceByKey(add)
print(english_word_counts.takeOrdered(10, key=lambda x: -x[1]))



[('the', 3663193), ('of', 1736975), ('to', 1611788), ('and', 1345073), ('in', 1134026), ('that', 835874), ('a', 810540), ('is', 792564), ('for', 557349), ('we', 551244)]


                                                                                

In [10]:
# original 
[_, de_words] = preprocess(lines_de)
de_word_key = de_words.map(lambda w: w.strip()).map(lambda w: (w, 1))
de_word_counts = de_word_key.reduceByKey(add)
print(de_word_counts.takeOrdered(10, key=lambda x: -x[1]))



[('die', 1980477), ('der', 1710353), ('und', 1337721), ('in', 781362), ('zu', 618872), ('den', 577654), ('wir', 489036), ('für', 478326), ('ich', 469025), ('das', 466127)]


                                                                                

In [11]:
english_words.take(3)
english_word_key.take(3)

[('resumption', 1), ('of', 1), ('the', 1)]

A.3.2 Verify that your results are reasonable.

The pipeline to get the 10 most frequently according words:

1. get the splited words using the 'preprocess' function, e.g ['resumption', 'of', 'the']
2. map step: remove the extra blank space and make a key-value-pair, e.g [('resumption', 1), ('of', 1), ('the', 1)]
3. reduce step: combine the pairs with the same key, add up the corresponding value, e.g ('of', 1), ('of', 1) --> ('of', 2).
4. output the ordered result

## Question A.4

A.4.1 Use this parallel corpus to mine some translations in the form of word pairs, for the two languages. Do this by pairing words found on short lines with the same number of words respectively. We (incorrectly) assume the words stay in the same order when translated.

Follow this approach. Work with the pair of RDDs you created in question A.2.
Hint: make a new pair of RDDs for each step, sv_1, en_1, sv_2, en_2, ...

1. Key the lines by their line number (hint: ZipWithIndex()).
2. Swap the key and value - so that the line number is the key.

In [12]:
# english
[english_lowercase_lines, _] = preprocess(lines_english)
english_lines_index = english_lowercase_lines.zipWithIndex()
english_index_lines = english_lines_index.map(lambda x: (x[1], x[0]))
#print(english_index_lines.take(10))

# original
[de_lowercase_lines, _] = preprocess(lines_de)
de_lines_index = de_lowercase_lines.zipWithIndex()
de_index_lines = de_lines_index.map(lambda x: (x[1], x[0]))
#print(de_index_lines.take(10))

                                                                                

[(0, 'resumption of the session'), (1, 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999, and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.'), (2, "although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."), (3, 'you have requested a debate on this subject in the course of the next few days, during this part-session.'), (4, "in the meantime, i should like to observe a minute' s silence, as a number of members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the european union."), (5, "please rise, then, for this minute' s silence."), (6, "(the house rose and observed a minute' s silence)"), (7, 'madam president, on a point of order.'), (8, 'you will be aware from the pre



[(0, 'wiederaufnahme der sitzungsperiode'), (1, 'ich erkläre die am freitag, dem 17. dezember unterbrochene sitzungsperiode des europäischen parlaments für wiederaufgenommen, wünsche ihnen nochmals alles gute zum jahreswechsel und hoffe, daß sie schöne ferien hatten.'), (2, 'wie sie feststellen konnten, ist der gefürchtete "millenium-bug " nicht eingetreten. doch sind bürger einiger unserer mitgliedstaaten opfer von schrecklichen naturkatastrophen geworden.'), (3, 'im parlament besteht der wunsch nach einer aussprache im verlauf dieser sitzungsperiode in den nächsten tagen.'), (4, 'heute möchte ich sie bitten - das ist auch der wunsch einiger kolleginnen und kollegen -, allen opfern der stürme, insbesondere in den verschiedenen ländern der europäischen union, in einer schweigeminute zu gedenken.'), (5, 'ich bitte sie, sich zu einer schweigeminute zu erheben.'), (6, '(das parlament erhebt sich zu einer schweigeminute.)'), (7, 'frau präsidentin, zur geschäftsordnung.'), (8, 'wie sie sich

                                                                                

3. Join the two RDDs together according to the line number key, so you have pairs of matching lines.

In [18]:
# join two rdds
de_en_join = de_index_lines.join(english_index_lines)
de_en_match = de_en_join.map(lambda x: (x[1][0], x[1][1]))

In [124]:
de_en_match.take(2)

                                                                                

[('wiederaufnahme der sitzungsperiode', 'resumption of the session'),
 ('(das parlament erhebt sich zu einer schweigeminute.)',
  "(the house rose and observed a minute' s silence)")]

4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence.

In [181]:
def exclude(x):
    if len(x[0]) > 0 and len(x[1]) > 0:
        return x
de_en_exclude_empty = de_en_match.filter(lambda x: exclude(x))

In [182]:
de_en_exclude_empty.take(2)

                                                                                

[('wiederaufnahme der sitzungsperiode', 'resumption of the session'),
 ('(das parlament erhebt sich zu einer schweigeminute.)',
  "(the house rose and observed a minute' s silence)")]

5. Filter to leave only pairs of sentences with a small number of words per sentence, this should give a more reliable translation (you can experiment).

In [183]:
import string 

def leave(x):
    threshold = 5
    de = x[0].split()
    eng = x[1].split()
    if len(de) <= threshold and len(eng) <= threshold:
        if x[0] not in string.punctuation: # remove the tuple only contain punctuations
            return x
de_en_small = de_en_exclude_empty.filter(lambda x: leave(x))

In [184]:
de_en_small.take(10)

                                                                                

[('wiederaufnahme der sitzungsperiode', 'resumption of the session'),
 ('(beifall von der ppe-de-fraktion)', '(applause from the ppe-de group)'),
 ('herr wynn, das ist logisch.', 'mr wynn, that makes sense.'),
 ('ich glaube nicht daran.', 'i do not believe so.'),
 ('vielen dank, frau kommissarin schreyer.',
  'thank you, commissioner schreyer.'),
 ('beide artikel betreffen die abstimmung.', 'these are both about voting.'),
 ('bericht koch (a5-0105/1999)', 'koch report (a5-0105/1999)'),
 ('das ist nicht mehr hinnehmbar.', 'this is no longer tolerable.'),
 ('das ist nicht mehr hinnehmbar.', 'this is no longer tolerable.'),
 ('nun haben wir das ergebnis.', 'and this is the result.')]

6. Filter to leave only pairs of sentences with the same number of words in each sentence.

In [185]:
def same(x):
    de = x[0].split()
    eng = x[1].split()
    if len(de) == len(eng):
        return x
de_en_same = de_en_small.filter(lambda x: same(x))

In [186]:
de_en_same.take(10)

                                                                                

[('herr wynn, das ist logisch.', 'mr wynn, that makes sense.'),
 ('beide artikel betreffen die abstimmung.', 'these are both about voting.'),
 ('bericht koch (a5-0105/1999)', 'koch report (a5-0105/1999)'),
 ('das ist nicht mehr hinnehmbar.', 'this is no longer tolerable.'),
 ('das ist nicht mehr hinnehmbar.', 'this is no longer tolerable.'),
 ('nun haben wir das ergebnis.', 'and this is the result.'),
 ('stürme in europa', 'storms in europe'),
 ('sind die zeitungsmeldungen zutreffend?', 'are these reports accurate?'),
 ('anfrage nr. 37 von (h-0791/99):', 'question no 37 by (h-0791/99):'),
 ('entlastung 1997', '1997 discharge')]

7. For each sentence pair, map so that you pair each (in order) word in the two sentences. We no longer need the line numbers. (hint: use python’s built in zip() function)

In [197]:
import string
import re
p = re.compile(r"[a-zA-Z]+")

def word_pair(x):
    de = x[0].split()
    eng = x[1].split()
    de_eng_zip = zip(de, eng)
    for element in de_eng_zip:
        return element

def remove_punctuation(x):
    word_de = p.findall(x[0])
    word_en = p.findall(x[1])
    if word_de != [] and word_en != []:
        return (word_de[0], word_en[0])

de_en_pair = de_en_same.map(lambda x: word_pair(x)).map(lambda x: remove_punctuation(x)).filter(lambda x: x!=None)

In [198]:
de_en_pair.take(100)

                                                                                

[('herr', 'mr'),
 ('beide', 'these'),
 ('bericht', 'koch'),
 ('das', 'this'),
 ('das', 'this'),
 ('nun', 'and'),
 ('st', 'storms'),
 ('sind', 'are'),
 ('anfrage', 'question'),
 ('heute', 'urgent'),
 ('anhaltender', 'sustained'),
 ('anhaltender', 'sustained'),
 ('die', 'the'),
 ('tagesordnung', 'agenda'),
 ('beifall', 'applause'),
 ('beifall', 'applause'),
 ('beifall', 'applause'),
 ('die', 'the'),
 ('das', 'these'),
 ('die', 'the'),
 ('vielen', 'thank'),
 ('das', 'parliament'),
 ('tagesordnung', 'agenda'),
 ('beifall', 'applause'),
 ('danke', 'thank'),
 ('wir', 'we'),
 ('das', 'i'),
 ('anfrage', 'question'),
 ('anfrage', 'question'),
 ('die', 'the'),
 ('der', 'the'),
 ('bericht', 'cederschi'),
 ('bericht', 'lieneman'),
 ('bericht', 'andersson'),
 ('das', 'first'),
 ('die', 'the'),
 ('anfrage', 'question'),
 ('anfrage', 'question'),
 ('anfrage', 'question'),
 ('anfrage', 'question'),
 ('das', 'mr'),
 ('das', 'that'),
 ('gefordert', 'consistency'),
 ('ist', 'is'),
 ('bericht', 'cederschi

8. Use reduce to count the number of occurrences of the word-translation-pairs.
9. Print some of the most frequently occurring pairs of words.

In [200]:
de_en_key = de_en_pair.map(lambda w: (w, 1))
de_en_counts = de_en_key.reduceByKey(add)
print(de_en_counts.takeOrdered(10, key=lambda x: -x[1]))

[Stage 182:>                                                        (0 + 6) / 6]

[(('die', 'the'), 4914), (('beifall', 'applause'), 3215), (('das', 'that'), 1601), (('vielen', 'thank'), 1269), (('wir', 'we'), 1016), (('schriftliche', 'written'), 996), (('ich', 'i'), 935), (('das', 'this'), 864), (('anfrage', 'question'), 564), (('warum', 'why'), 467)]


                                                                                