In [2]:
from pyspark.sql import SparkSession
from operator import add 

In [3]:
# New API
# Build (or reuse) the SparkSession against the standalone master.
# Dynamic allocation is switched on with shuffle tracking while the external
# shuffle service is switched off; the driver and block-manager ports are pinned.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.119:7077")
    .appName("liang_cheng")
    .config("spark.executor.cores", 2)
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/05 23:32:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/05 23:32:40 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [4]:
# Old API (RDD)
# The RDD entry point is the SparkContext owned by the session created above.
spark_context = spark_session.sparkContext
# Only ERROR-level log messages should reach the notebook output from here on.
spark_context.setLogLevel("ERROR")

In [5]:
# Bare expression: the notebook displays the SparkContext's repr as the cell output.
spark_context

In [13]:
# A.1.1
# Load the English transcript as an RDD of lines, then count the lines.
en_transcript = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.en")
en_count = en_transcript.count()
print("The number of lines in English transcripts: ", en_count)

[Stage 7:>                                                          (0 + 2) / 2]

The number of lines in English transcripts:  1862234


                                                                                

In [14]:
# A.1.2
# Load the Swedish transcript as an RDD of lines, then count the lines.
sv_transcript = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.sv")
sv_count = sv_transcript.count()
print("The number of lines in Swedish transcripts: ", sv_count)

[Stage 8:>                                                          (0 + 2) / 3]

The number of lines in Swedish transcripts:  1862234


                                                                                

In [17]:
# A.1.3
# Compare the line counts obtained in A.1.1 (en_count) and A.1.2 (sv_count).
# The mismatch message previously read "are the not same"; the wording is fixed.
if en_count == sv_count:
    print("The line counts are the same for the two languages")
else:
    print("The line counts are not the same for the two languages")

The line counts are the same for the two languages


In [15]:
# A.1.4
# Ask each transcript RDD how many partitions it was split into.
en_partitions = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.en").getNumPartitions()
sv_partitions = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.sv").getNumPartitions()
print("The number of partitions in English transcripts: ", en_partitions)
print("The number of partitions in Swedish transcripts: ", sv_partitions)

The number of partitions in English transcripts:  2
The number of partitions in Swedish transcripts:  3


In [6]:
# A.2.1
def preProcess(i):
    """Lower-case a line of text and split it into tokens on single spaces.

    The separator is the literal ' ' character, so consecutive spaces produce
    empty-string tokens and an empty line produces [''].

    Args:
        i: the line of text (a str).

    Returns:
        A list of lower-cased tokens.
    """
    return i.lower().split(' ')

In [7]:
# A.2.2
# Tokenise every line of each transcript with preProcess (one token list per line)
# and preview the first ten lines of each language.
en_raw = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.en")
new_en = en_raw.map(preProcess)
print("new English: \n", new_en.take(10))

sv_raw = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.sv")
new_sv = sv_raw.map(preProcess)
print("new Swedish: \n", new_sv.take(10))

                                                                                

new English: 
 [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'b

In [31]:
# A.2.3
# Verify that tokenising did not change the number of lines.
# Each count() launches a Spark job, so every RDD is counted exactly once.
# The two outcomes are printed separately: the original single print() emitted
# "...still the same..." followed by "...not the same..." when the counts differed.
new_en_count = new_en.count()
new_sv_count = new_sv.count()
if new_en_count == new_sv_count:
    print("The line counts are still the same for the two languages, the number of lines both are ", new_en_count)
else:
    print("The line counts are not the same for the two languages: ", new_en_count, new_sv_count)

[Stage 21:>                                                         (0 + 2) / 2]

The line counts are still the same for the two languages, the number of lines both are  1862234


                                                                                

In [38]:
# A.3.1
# Word frequencies per language: flatten each line into tokens, trim every token,
# pair it with a count of 1, sum the counts per token, and show the ten largest.

# English
all_en = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.en").flatMap(preProcess)
all_en_and_count = all_en.map(lambda token: (token.strip(), 1))
en_counts = all_en_and_count.reduceByKey(add)
top_en = en_counts.takeOrdered(10, key=lambda pair: -pair[1])
print("Top 10 English: \n", top_en)

# Swedish
all_sv = spark_context.textFile("hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.sv").flatMap(preProcess)
all_sv_and_count = all_sv.map(lambda token: (token.strip(), 1))
sv_counts = all_sv_and_count.reduceByKey(add)
top_sv = sv_counts.takeOrdered(10, key=lambda pair: -pair[1])
print("Top 10 Swedish: \n", top_sv)

                                                                                

Top 10 English: 
 [('the', 3498452), ('of', 1659758), ('to', 1539760), ('and', 1288402), ('in', 1085994), ('that', 797519), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522851)]




Top 10 Swedish: 
 [('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924868), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


                                                                                

In [8]:
# A.4.1
# Attach a line number to every tokenised sentence, giving (words, index) records.
en_1, sv_1 = new_en.zipWithIndex(), new_sv.zipWithIndex()
for label, indexed in (("English with index: \n", en_1), ("Swedish with index: \n", sv_1)):
    print(label, indexed.take(4))

                                                                                

English with index: 
 [(['resumption', 'of', 'the', 'session'], 0), (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], 1), (['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], 2), (['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], 3)]
Swedish with index: 
 [(['återupptagande', 'av', 'sessionen'], 0), (['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'ef

In [10]:
# A.4.2
# Turn each (words, index) record into (index, words) so the line number is the key.
def _index_first(record):
    """Return the 2-tuple `record` with its two elements swapped."""
    words, index = record
    return (index, words)

en_2 = en_1.map(_index_first)
sv_2 = sv_1.map(_index_first)
for label, keyed in (("English with index: \n", en_2), ("Swedish with index: \n", sv_2)):
    print(label, keyed.take(3))

English with index: 
 [(0, ['resumption', 'of', 'the', 'session']), (1, ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.']), (2, ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'])]
Swedish with index: 
 [(0, ['återupptagande', 'av', 'sessionen']), (1, ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en'

In [12]:
# A.4.3
# Join the two keyed RDDs on the line index: each record becomes
# (index, (english_words, swedish_words)).
en_join_sv = en_2.join(sv_2)
joined_preview = en_join_sv.take(4)
print("English join Swedish on index: \n", joined_preview)

[Stage 11:>                                                         (0 + 1) / 1]

English join Swedish on index: 
 [(375, (['this', 'means', 'that', 'there', 'must', 'be', 'a', 'comprehensive', 'partnership', 'between', 'local', 'authorities', 'and', 'national', 'governments', 'with', 'regard', 'to', 'how', 'these', 'funds', 'are', 'to', 'be', 'spent.'], ['detta', 'innebär', 'att', 'det', 'måste', 'råda', 'ett', 'heltäckande', 'partnerskap', 'mellan', 'lokala', 'myndigheter', 'och', 'nationella', 'regeringar', 'med', 'hänsyn', 'till', 'hur', 'dessa', 'medel', 'skall', 'användas.'])), (480, (['the', 'european', "commission'", 's', 'sixth', 'report', 'presents', 'very', 'valuable', 'conclusions.'], ['europeiska', 'kommissionens', 'sjätte', 'rapport', 'innehåller', 'värdefulla', 'slutsatser.'])), (915, (['if', 'this', 'comes', 'to', 'nothing,', 'then', 'the', 'government', 'has', 'to', 'step', 'in.'], ['om', 'denna', 'ansträngning', 'misslyckas', 'måste', 'en', 'statsmakt', 'ingripa.'])), (1735, (['(pt)', 'by', 'proposing', 'to', 'apply', 'the', 'principle', 'of', 'sub

                                                                                

In [13]:
# A.4.4
# Keep only pairs where neither side is an empty line
# (preProcess turns an empty line into the token list ['']).
def _has_both_sentences(record):
    """True when neither token list of an (index, (en, sv)) record equals ['']."""
    en_words, sv_words = record[1]
    return en_words != [''] and sv_words != ['']

en_sv_4 = en_join_sv.filter(_has_both_sentences)
print("words don't begin with empty: \n", en_sv_4.take(4))

[Stage 13:>                                                         (0 + 1) / 1]

words don't begin with empty: 
 [(189755, (['i', 'welcome', 'this,', 'especially', 'since', 'we', 'have', 'ended', 'up', 'with', 'a', 'budget', 'surplus', 'of', 'some', 'eur', '11', 'billion.'], ['det', 'anser', 'jag', 'vara', 'någonting', 'positivt,', 'i', 'synnerhet', 'eftersom', 'vi', 'hade', 'omkring', '11', 'miljarder', 'euro', 'kvar', 'av', 'budgeten', 'förra', 'året.'])), (512085, (['\xa0\xa0', 'mr', 'president,', 'doping', 'is', 'the', 'cancer', 'of', 'modern', 'sports.'], ['\xa0\xa0', '–', 'herr', 'talman!', 'dopning', 'är', 'den', 'moderna', 'idrottens', 'cancer.'])), (764580, (['but', 'as', 'i', 'said', 'in', 'the', 'beginning,', 'we', 'will', 'strive', 'to', 'make', 'as', 'much', 'information', 'as', 'possible', 'available', 'before', '15', 'november', 'and', 'before', 'the', 'final', 'conciliation', 'with', 'both', 'arms', 'of', 'the', 'budgetary', 'authority,', 'so', 'as', 'to', 'be', 'able', 'to', 'lift', 'as', 'much', 'as', 'we', 'can.'], ['men', 'som', 'jag', 'sa', 'i'

                                                                                

In [14]:
# A.4.5
# Keep only pairs in which both sentences have at most 10 tokens.
def _is_short_pair(record):
    """True when the longer of the two token lists has no more than 10 entries."""
    en_words, sv_words = record[1]
    return max(len(en_words), len(sv_words)) <= 10

en_sv_5 = en_sv_4.filter(_is_short_pair)
print("number of words less than 10: \n", en_sv_5.take(4))

[Stage 15:>                                                         (0 + 1) / 1]

number of words less than 10: 
 [(512085, (['\xa0\xa0', 'mr', 'president,', 'doping', 'is', 'the', 'cancer', 'of', 'modern', 'sports.'], ['\xa0\xa0', '–', 'herr', 'talman!', 'dopning', 'är', 'den', 'moderna', 'idrottens', 'cancer.'])), (130725, (['a', 'democracy', 'has', 'independent', 'supervisory', 'agencies.'], ['det', 'är', 'ett', 'bidrag', 'till', 'säkerheten.'])), (149540, (['i', 'thank', 'parliament', 'and', 'the', 'rapporteur', 'very', 'much', 'indeed.'], ['jag', 'tackar', 'parlamentet', 'och', 'föredraganden', 'så', 'mycket.'])), (167810, (['i', 'just', 'wanted', 'to', 'point', 'that', 'out.'], ['jag', 'ville', 'göra', 'er', 'uppmärksamma', 'på', 'det.']))]


                                                                                

In [15]:
# A.4.6
# Keep only pairs whose English and Swedish sentences have the same number of tokens.
def _same_length(record):
    """True when both token lists of an (index, (en, sv)) record are equally long."""
    en_words, sv_words = record[1]
    return len(en_words) == len(sv_words)

en_sv_6 = en_sv_5.filter(_same_length)
print("English has same words as Swedish: \n", en_sv_6.take(4))

[Stage 17:>                                                         (0 + 1) / 1]

English has same words as Swedish: 
 [(40530, (['mr', 'swoboda,', 'the', 'vote', 'is', 'scheduled', 'for', 'tomorrow', 'morning.'], ['ledamot', 'swoboda!', 'omröstningen', 'är', 'planerad', 'till', 'tidigt', 'i', 'morgon.'])), (63570, (['it', 'is', 'a', 'question', 'of', 'respecting', 'national', 'sovereignty.'], ['det', 'handlar', 'om', 'att', 'respektera', 'den', 'nationella', 'suveräniteten.'])), (78155, (['the', 'debate', 'is', 'closed.'], ['jag', 'förklarar', 'debatten', 'avslutad.'])), (86465, (['this', 'was', 'also', 'untrue.'], ['detta', 'var', 'också', 'osant.']))]


                                                                                

In [16]:
# A.4.7
# Pair the words position by position: every record becomes a list of
# (english_word, swedish_word) tuples.
en_sv_7 = en_sv_6.map(lambda record: list(zip(*record[1])))
zipped_preview = en_sv_7.take(4)
print("English zip with Swedish: \n", zipped_preview)

[Stage 19:>                                                         (0 + 1) / 1]

English zip with Swedish: 
 [[('we', 'vi'), ('know', 'vet'), ('that', 'att'), ('you', 'ni'), ('have', 'har'), ('an', 'en'), ('important', 'viktig'), ('role', 'roll'), ('in', 'i'), ('this.', 'detta.')], [('germany', 'till'), ('has', 'och'), ('acknowledged', 'från'), ('its', 'har'), ('obligation', 'tyskland'), ('from', 'erkänt'), ('time', 'denna'), ('to', 'sin'), ('time.', 'skyldighet.')], [('the', 'de'), ('challenges', 'utmaningar'), ('faced', 'som'), ('by', 'eu'), ('the', 'står'), ('eu', 'inför'), ('are', 'är'), ('exciting', 'spännande'), ('and', 'och'), ('historic.', 'historiska.')], [('-', 'jag'), ('thank', 'tackar'), ('you,', 'er,'), ('mr', 'herr'), ('oostlander.', 'oostlander.')]]


                                                                                

In [17]:
# A.4.8
# Total number of word pairs: sum the length of every per-sentence pair list.
pairs_per_sentence = en_sv_7.map(len)
word_pairs_count = pairs_per_sentence.reduce(add)
print("the number of occurrences of the word-translation-pairs: ", word_pairs_count)



the number of occurrences of the word-translation-pairs:  494715


                                                                                

In [19]:
# A.4.9
# Count how often each (english_word, swedish_word) pair occurs and report the
# ten most frequent pairs.
# Flatten the per-sentence zips into one record per word pair, each with count 1.
en_sv_9 = en_sv_6.flatMap(lambda x: list(zip(x[1][0], x[1][1]))).map(lambda x: (x, 1))
# takeOrdered picks the ten largest counts without sorting the whole RDD first
# (the previous sortBy(...).take(10) sorted every pair just to keep ten), and it
# matches how the top-10 words are selected in A.3.1.
freq = en_sv_9.reduceByKey(add).takeOrdered(10, key=lambda x: -x[1])
print("Top 10 zip: \n", freq)

                                                                                

Top 10 zip: 
 [(('is', 'är'), 11221), (('we', 'vi'), 6594), (('i', 'jag'), 6123), (('and', 'och'), 3926), (('this', 'detta'), 3782), (('a', 'en'), 3496), (('it', 'det'), 3416), (('not', 'inte'), 3097), (('that', 'det'), 3042), (('closed.', 'avslutad.'), 2964)]


In [None]:
# Shut down the SparkContext now that the analysis is finished.
spark_context.stop()