In [1]:
from pyspark.sql import SparkSession

# DataFrame-era entry point: build (or reuse) the SparkSession for this notebook.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.113:7077")
    .appName("marcelloVendruscolo_Assignment3_pA")
    # Dynamic allocation settings (enabled together with the shuffle service).
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Cores per executor.
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# RDD-era entry point, taken from the session so both APIs share one application.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("INFO")

In [2]:
# Pre-processing helper applied to every line of a corpus.
def func_lowercase_split(rdd):
    """Lowercase one line of text and tokenize it on single spaces.

    Args:
        rdd: despite the name, this is a single line (a ``str``), i.e. one
            element of the text RDD the function is mapped over.

    Returns:
        list of str: the lowercased tokens. Because the separator is exactly
        one space, consecutive spaces produce empty-string tokens and an
        empty line produces ``['']``.
    """
    lowered_line = rdd.lower()
    tokens = lowered_line.split(' ')
    return tokens

In [3]:
#A.1.1 and A.1.4 - Read the English transcripts with Spark, and count the number of lines and partitions.
print("File: europarl-v7.sv-en.en\n")
# One RDD element per line of the English side of the parallel corpus.
en_1 = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.en")
# Kept in a variable because later cells compare against this count.
lineCount_en_1 = en_1.count()
# Label spelled "Line counting" so it matches the Swedish cell's output.
print("Line counting: " + str(lineCount_en_1))
print("Partition counting: " + str(en_1.getNumPartitions()))

File: europarl-v7.sv-en.en

Line countig: 1862234
Partition counting: 2


In [4]:
# A.1.2 / A.1.4 - Load the Swedish transcripts, then report line and partition counts.
print("File: europarl-v7.sv-en.sv\n")
sv_1 = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.sv")

# Kept in a variable because later cells compare against this count.
lineCount_sv_1 = sv_1.count()
sv_line_report = "Line counting: " + str(lineCount_sv_1)
print(sv_line_report)

sv_partition_report = "Partition counting: " + str(sv_1.getNumPartitions())
print(sv_partition_report)

File: europarl-v7.sv-en.sv

Line counting: 1862234
Partition counting: 3


In [5]:
# A.1.3 - The corpus is line-aligned, so both files must have the same number of lines.
raw_counts_match = lineCount_en_1 == lineCount_sv_1
print("The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv: " + str(raw_counts_match))

The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv: True


In [6]:
# A.2.1 - Pre-process both corpora the same way: lowercase each line and split it on spaces.
# Each resulting RDD element is the token list of one line.
en_2, sv_2 = [raw_corpus.map(func_lowercase_split) for raw_corpus in (en_1, sv_1)]

In [7]:
# A.2.2 - Show the first 10 tokenized lines of each corpus to eyeball the pre-processing.
for sample_heading, sample_rdd in (
    ("10 entries from the English corpus after pre-processing:\n", en_2),
    ("\n10 entries from the Swedish corpus after pre-processing:\n", sv_2),
):
    print(sample_heading)
    print(sample_rdd.take(10))

10 entries from the English corpus after pre-processing:

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of'

In [8]:
# A.2.3 - map() is one-to-one, so pre-processing must not change the number of lines.
lineCount_en_2 = en_2.count()
lineCount_sv_2 = sv_2.count()

en_count_unchanged = lineCount_en_1 == lineCount_en_2
sv_count_unchanged = lineCount_sv_1 == lineCount_sv_2
processed_counts_match = lineCount_en_2 == lineCount_sv_2

print("The line counts are the same for europarl-v7.sv-en.en before and after processing: " + str(en_count_unchanged))
print("The line counts are the same for europarl-v7.sv-en.sv before and after processing: " + str(sv_count_unchanged))
print("The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv after processing: " + str(processed_counts_match))

The line counts are the same for europarl-v7.sv-en.en before and after processing: True
The line counts are the same for europarl-v7.sv-en.sv before and after processing: True
The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv after processing: True


In [9]:
# A.3.1 - Use Spark to compute the 10 most frequently occurring words in the English and Swedish corpora.
def _most_frequent_words(tokenized_rdd, how_many=10):
    """Return the `how_many` most frequent tokens as (word, count) pairs, most frequent first.

    Args:
        tokenized_rdd: RDD whose elements are lists of tokens (one list per line).
        how_many: number of (word, count) pairs to return.
    """
    word_ones = tokenized_rdd.flatMap(lambda tokens: tokens).map(lambda word: (word, 1))
    word_totals = word_ones.reduceByKey(lambda left, right: left + right)
    # Negated count as the key turns takeOrdered's ascending order into "largest first".
    return word_totals.takeOrdered(how_many, key=lambda pair: -pair[1])

print("The 10 most frequent words in the English corpus:\n")
print(_most_frequent_words(en_2))
print("\nThe 10 most frequent words in the Swedish corpus:\n")
print(_most_frequent_words(sv_2))

The 10 most frequent words in the English corpus:

[('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]

The 10 most frequent words in the Swedish corpus:

[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [10]:
#A.4.1 - Use this parallel corpus to mine some translations in the form of word pairs, for the two languages.
# Sentence-length bounds (in tokens) for a line pair to be used for word alignment.
MIN_SENTENCE_TOKENS = 2
MAX_SENTENCE_TOKENS = 15

# Key every tokenized line by its line number so the two corpora can be matched line by line.
en_3 = en_2.zipWithIndex().map(lambda x: (x[1], x[0]))
sv_3 = sv_2.zipWithIndex().map(lambda x: (x[1], x[0]))
# Inner join on the line number -> (line_no, (english_tokens, swedish_tokens)).
en_sv_0 = en_3.join(sv_3)
# Drop line pairs with an empty sentence on either side. An inner join never produces None
# and split(' ') turns an empty line into [''] (not None), so "empty" is tested as
# "contains no non-empty token" rather than with an `is None` check that can never fire.
en_sv_1 = en_sv_0.filter(lambda x: any(x[1][0]) and any(x[1][1]))
# Keep only short sentences: filter out sentences that are too long ...
en_sv_2 = en_sv_1.filter(lambda x: len(x[1][0]) <= MAX_SENTENCE_TOKENS and len(x[1][1]) <= MAX_SENTENCE_TOKENS)
# ... and sentences that are too short.
en_sv_3 = en_sv_2.filter(lambda x: len(x[1][0]) >= MIN_SENTENCE_TOKENS and len(x[1][1]) >= MIN_SENTENCE_TOKENS)
# Only pairs with the same number of tokens are aligned word by word.
en_sv_4 = en_sv_3.filter(lambda x: len(x[1][0]) == len(x[1][1]))
# Pair the i-th English token with the i-th Swedish token; one RDD element per word pair.
en_sv = en_sv_4.flatMap(lambda x: zip(x[1][0], x[1][1]))
print("Some of the most frequently occurring pairs of words:\n")
# Count each (english, swedish) pair and show the 20 most frequent (negated count = descending).
print(en_sv.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).takeOrdered(20, key = lambda x: -x[1]))

Some of the most frequently occurring pairs of words:

[(('is', 'är'), 16331), (('we', 'vi'), 11983), (('i', 'jag'), 11451), (('and', 'och'), 9819), (('a', 'en'), 6536), (('it', 'det'), 6360), (('this', 'detta'), 6125), (('in', 'i'), 5974), (('not', 'inte'), 5285), (('to', 'att'), 4926), (('that', 'att'), 4266), (('the', 'den'), 4137), (('a', 'ett'), 4084), (('that', 'det'), 3959), (('have', 'har'), 3943), (('of', 'av'), 3861), (('for', 'för'), 3780), (('the', 'det'), 3336), (('are', 'är'), 3166), (('there', 'det'), 3032)]


In [11]:
#Release the cores for another application!
# Stops the SparkContext obtained from spark_session in the first cell; this is the
# notebook's last step, so no Spark work is expected after it.
spark_context.stop()