In [2]:
from pyspark.sql import SparkSession

#New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.113:7077") \
        .appName("marcelloVendruscolo_A1")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

#Old API (RDD)
spark_context = spark_session.sparkContext
spark_context.setLogLevel("INFO")

In [3]:
#Function to (i) lowercase and (ii) tokenize (split on space) text
def func_lowercase_split(rdd):
    return rdd.lower().split(' ')

In [4]:
#A.1.1 and A.1.4 - Read the English transcripts with Spark, and count the number of lines and partitions.
print("File: europarl-v7.sv-en.en")
en_1 = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.en")
lineCount_en_1 = en_1.count()
print("Line counting: " + str(lineCount_en_1))
print("Partition counting: " + str(en_1.getNumPartitions()))

File: europarl-v7.sv-en.en
Line counting: 1862234
Partition counting: 2


In [5]:
#A.1.2 and A.1.4 - Read the Swedish transcripts with Spark, and count the number of lines and partitions.
print("File: europarl-v7.sv-en.sv")
sv_1 = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.sv")
lineCount_sv_1 = sv_1.count()
print("Line counting: " + str(lineCount_sv_1))
print("Partition counting: " + str(sv_1.getNumPartitions()))

File: europarl-v7.sv-en.sv
Line counting: 1862234
Partition counting: 3


In [6]:
#A.1.3 - Verify that the line counts are the same for the two languages.
print("The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv?\n" + str(lineCount_en_1 == lineCount_sv_1))

The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv?
True


In [7]:
#A.2.1 - Preprocess the text from both RDDs by lowercase-ing and tokenize-ing (split on space) the text:
en_2 = en_1.map(func_lowercase_split)
sv_2 = sv_1.map(func_lowercase_split)

In [8]:
#A.2.3 Verify that the line counts still match after the pre-processing.
print("The line counts are the same for europarl-v7.sv-en.en before and after processing?\n" + str(lineCount_en_1 == en_2.count()))
print("The line counts are the same for europarl-v7.sv-en.sv before and after processing?\n" + str(lineCount_sv_1 == sv_2.count()))

The line counts are the same for europarl-v7.sv-en.en before and after processing?
True
The line counts are the same for europarl-v7.sv-en.sv before and after processing?
True


In [9]:
#A.3.1 - Use Spark to compute the 10 most frequently according words in the English and Swedish language corpus.
print(en_2.flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).takeOrdered(10, key = lambda x: -x[1]))
print(sv_2.flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).takeOrdered(10, key = lambda x: -x[1]))

[('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]
[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [50]:
#A.4.1 - Use this parallel corpus to mine some translations in the form of word pairs, for the two languages.
en_3 = en_2.zipWithIndex().map(lambda x: (x[1],x[0]))
sv_3 = sv_2.zipWithIndex().map(lambda x: (x[1],x[0]))
en_sv = en_3.join(sv_3)

In [51]:
en_sv = en_sv.filter(lambda x: (not x[1][0] is None) and (not x[1][1] is None))
en_sv = en_sv.filter(lambda x: (len(x[1][0]) != 0) and (len(x[1][1]) != 0))
en_sv = en_sv.filter(lambda x: ((len(x[1][0]) <= 8) and (len(x[1][1]) <= 8)))
en_sv = en_sv.filter(lambda x: (len(x[1][0]) == len(x[1][1])))
print(en_sv.take(5))

[(707940, (['\xa0\xa0', '.'], ['\xa0\xa0', '.'])), (24055, (['so', 'why', 'change', 'it?'], ['varför', 'ändra', 'på', 'den?'])), (140210, (['we', 'should', 'examine', 'these', 'aspects', 'further.'], ['vi', 'måste', 'undersöka', 'det', 'hela', 'noggrannare.'])), (462540, (['these', 'are', 'technical', 'rather', 'than', 'substantive', 'amendments.'], ['dessa', 'ändringsförslag', 'avser', 'snarare', 'formen', 'än', 'innehållet.'])), (284555, ([''], ['.']))]


In [None]:
en_sv_test = en_sv.map(lambda x: x[1])
en_sv_test.take(3)

In [15]:
# release the cores for another application!
#spark_context.stop()