In [1]:
from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession. getOrCreate() hands back an already-running session
# if there is one, otherwise it builds a new one with these settings.
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

In [3]:
def add(a, b):
    """Return ``a + b``.

    The operation is commutative and associative, which is what makes it
    usable as the combining function of an RDD ``reduce``.
    """
    total = a + b
    return total

# Distribute the integers 1..10 over 3 partitions.
rdd = spark_context.parallelize(list(range(1, 11)), 3)

# Double every element, then sum the doubled values with add().
result = (
    rdd
    .map(lambda value: value * 2)
    .reduce(add)
)

print(result)

# See: http://spark.apache.org/docs/2.3.0/api/python/pyspark.html

110


In [5]:
# Stop the SparkContext so that the cores it holds are released for another
# application. Later cells build a new session via getOrCreate().
spark_context.stop()

In [35]:
#---------------A.1.1----------------

from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Number of lines in the English text (displayed as the cell result).
english_txt.count()


1862234

In [37]:
#---------------A.1.2----------------

from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Report the line count of each language: label on one line, number on the next.
print("English:", english_txt.count(), sep="\n")
print("Swedish:", swedish_txt.count(), sep="\n")

English:
1862234
Swedish:
1862234


In [39]:
#---------------A.1.3----------------

from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Every count() launches a Spark job, so run each one exactly once and reuse
# the number for both the report and the comparison.
english_line_count = english_txt.count()
swedish_line_count = swedish_txt.count()

print("English:")
print(english_line_count)
print("Swedish:")
print(swedish_line_count)

if english_line_count == swedish_line_count:
    print("They have the same amount!!!")
else:
    print("They DO NOT have the same amount!!!")
        


English:
1862234
Swedish:
1862234
They have the same amount!!!


In [42]:
#---------------A.1.4----------------

from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Report how many partitions each file was split into: label, then number.
print("English:", english_txt.getNumPartitions(), sep="\n")
print("Swedish:", swedish_txt.getNumPartitions(), sep="\n")


English:
9
Swedish:
9


In [3]:
#---------------A.2.1 and A.2.2----------------

from pyspark.sql import SparkSession
from operator import add

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Lower-case and trim every English line, then preview the first 100.
# The transformed RDD is anonymous and can never be reused, so it is not
# cached: caching it would only occupy memory.
(
    english_txt
    .map(lambda line: line.lower())
    .map(lambda line: line.strip())
    .take(100)
)


['resumption of the session',
 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999, and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
 "although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.",
 'you have requested a debate on this subject in the course of the next few days, during this part-session.',
 "in the meantime, i should like to observe a minute' s silence, as a number of members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the european union.",
 "please rise, then, for this minute' s silence.",
 "(the house rose and observed a minute' s silence)",
 'madam president, on a point of order.',
 'you will be aware from the press and television that there have be

In [6]:
#---------------A.2.3----------------

from pyspark.sql import SparkSession
from operator import add

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Line count of the English text after lower-casing and trimming.
en_count = (
    english_txt
    .map(lambda line: line.lower())
    .map(lambda line: line.strip())
    .count()
)

# Line count of the Swedish text after the same pre-processing.
sv_count = (
    swedish_txt
    .map(lambda line: line.lower())
    .map(lambda line: line.strip())
    .count()
)

# Verdict first, then both numbers (printed in either case).
if en_count != sv_count:
    print("They DO NOT have the same amount!!!")
else:
    print("They have the same amount!!!")
print(en_count)
print(sv_count)


They have the same amount!!!
1862234
1862234


In [7]:
#---------------A.3.1-first part----------------

from pyspark.sql import SparkSession
from operator import add

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Ten most frequent English words.
# - textFile() already yields one element per line, so no extra split on '\n'
#   is needed.
# - Blank lines and repeated spaces make split(' ') emit '' tokens; they are
#   dropped so that the empty string is not counted as a word.
# - The intermediate RDD is used only once, so it is not cached.
(
    english_txt
    .map(lambda line: line.lower())
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: word.strip())
    .filter(lambda word: word != '')
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .takeOrdered(10, key=lambda pair: -pair[1])
)


[('the', 3498452),
 ('of', 1659758),
 ('to', 1539760),
 ('and', 1288402),
 ('in', 1085994),
 ('that', 797519),
 ('a', 773522),
 ('is', 758050),
 ('for', 534242),
 ('we', 522851)]

In [9]:
#---------------A.3.1-second part----------------

from pyspark.sql import SparkSession
from operator import add

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores.
# The master configured below is "local", so this session runs on the
# notebook machine rather than on that cluster.

# New API: SparkSession (an existing session is reused by getOrCreate()).
spark_session = (
    SparkSession.builder
    .master("local")
    .appName("simple_example")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Load both halves of the parallel corpus, one RDD element per line, and mark
# them as cached so repeated actions do not re-read the files.
swedish_txt = spark_context.textFile('txts/europarl-v7.sv-en.sv').cache()
english_txt = spark_context.textFile('txts/europarl-v7.sv-en.en').cache()

# Ten most frequent Swedish words.
# - textFile() already yields one element per line, so no extra split on '\n'
#   is needed.
# - Blank lines and repeated spaces make split(' ') emit '' tokens; they are
#   dropped so that the empty string is not counted as a word.
# - The intermediate RDD is used only once, so it is not cached.
(
    swedish_txt
    .map(lambda line: line.lower())
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: word.strip())
    .filter(lambda word: word != '')
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .takeOrdered(10, key=lambda pair: -pair[1])
)



[('att', 1706293),
 ('och', 1344830),
 ('i', 1050774),
 ('det', 924868),
 ('som', 913276),
 ('för', 908680),
 ('av', 738068),
 ('är', 694381),
 ('en', 620310),
 ('vi', 539797)]

#---------------A.3.2----------------

The 10 most common words in the English text are:
    
[('the', 3498452),
 ('of', 1659758),
 ('to', 1539760),
 ('and', 1288402),
 ('in', 1085994),
 ('that', 797519),
 ('a', 773522),
 ('is', 758050),
 ('for', 534242),
 ('we', 522851)]
    
and the 10 most common words in the Swedish text are:
    
[('att', 1706293),
 ('och', 1344830),
 ('i', 1050774),
 ('det', 924868),
 ('som', 913276),
 ('för', 908680),
 ('av', 738068),
 ('är', 694381),
 ('en', 620310),
 ('vi', 539797)]

In [10]:
#---------------A.4.1----------------



In [16]:
# 1: Record the English line number associated with each line (zipWithIndex()).
# 2: Swap key and value so that the line number becomes the key.
import re

# Only one-to-one map() steps are applied before zipWithIndex(), so the index
# stays equal to the line's position in the file and can be joined against the
# other language. Digits are removed BEFORE runs of non-word characters are
# collapsed: doing it the other way round turns "friday 17 december" into
# "friday  december" (two spaces), and a later split(' ') would then produce a
# phantom empty word.
en_rdd = (
    english_txt
    .map(lambda line: line.lower())
    .map(lambda line: re.sub(r'[0-9]+', '', line))
    .map(lambda line: re.sub(r'\W+', ' ', line))
    .map(lambda line: line.strip())
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
)

In [17]:
# 1: Record the Swedish line number associated with each line (zipWithIndex()).
# 2: Swap key and value so that the line number becomes the key.

# Only one-to-one map() steps are applied before zipWithIndex(), so the index
# stays equal to the line's position in the file and can be joined against the
# other language. Digits are removed BEFORE runs of non-word characters are
# collapsed: doing it the other way round leaves two adjacent spaces wherever
# a number stood, and a later split(' ') would then produce a phantom empty
# word.
sv_rdd = (
    swedish_txt
    .map(lambda line: line.lower())
    .map(lambda line: re.sub(r'[0-9]+', '', line))
    .map(lambda line: re.sub(r'\W+', ' ', line))
    .map(lambda line: line.strip())
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
)

In [21]:
# 3: Match the lines of the two corpora on their line number (join()), giving
#    (line_number, (english_line, swedish_line)) records.
union_rdd = en_rdd.join(sv_rdd)
union_rdd.cache()

# Pre-process the joined lines: split into words, but do not flatten yet.
# Then keep only pairs that
#   * have a non-empty sentence on BOTH sides (an empty line splits to [''],
#     so checking the English side alone would let (['applause'], ['']) pass
#     the equal-length test below),
#   * are short (fewer than maximum_len words), which should give a more
#     reliable translation (you can experiment with the limit),
#   * have the same number of words in both sentences.
maximum_len = 3
result = (
    union_rdd
    .map(lambda rec: (rec[1][0].split(' '), rec[1][1].split(' ')))
    .filter(lambda pair: pair[0][0] != "" and pair[1][0] != "")
    .filter(lambda pair: len(pair[0]) < maximum_len)
    .filter(lambda pair: len(pair[0]) == len(pair[1]))
)

result.take(100)


[(['applause'], ['applåder']),
 (['president'], ['talmannen']),
 (['applause'], ['applåder']),
 (['loud', 'applause'], ['livliga', 'applåder']),
 (['sustained', 'applause'], ['ihållande', 'applåder']),
 (['when'], ['när']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['laughter'], ['skratt']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['president'], ['talmannen']),
 (['reg'], ['reg']),
 (['vote'], ['omröstning']),
 (['c'], ['c']),
 (['laughter'], ['skratt']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),
 (['bravo'], ['bravo']),
 (['laughter'], ['skratt']),
 (['szabolcs', 'fazakas'], ['szabolcs', 'fazakas']),
 (['rwanda', 'burundi'], ['rwanda', 'burundi']),
 (['applause'], ['applåder']),
 (['vote'], ['omröstning']),
 (['applause'], ['applåder']),
 (['applause'], ['applåder']),

#---------------A.4.1----------------

Yes, they seem reasonable. There are many "applause" pairs, but that is most probably because we only keep lines with fewer than 3 words and only return the first 100 pairs. That limit may be a bit too short, but it makes the matches more reliable.