In [10]:
from pyspark.sql import SparkSession


# New API: build (or reuse) the SparkSession bound to the standalone cluster master.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.113:7077")
    .appName("marcelloVendruscolo_A1")
    # Dynamic allocation is switched on together with the shuffle service,
    # with a 30s idle timeout configured for executors.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    # Driver and block-manager ports are pinned to fixed values.
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Old API (RDD): the SparkContext owned by the session above.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("INFO")

In [40]:
# Function to (i) lowercase and (ii) tokenize (split on a single space) a line of text.
def func_lowercase_split(rdd):
    """Lowercase one line of text and split it into tokens on single spaces.

    Despite its name, ``rdd`` is a single string element (one line), as passed
    by ``RDD.map``; it is not an RDD itself.

    Because the split is on the literal ``' '`` character, consecutive spaces
    produce empty-string tokens and an empty line yields ``['']``.

    Returns:
        list of str: the lowercased tokens, in their original order.
    """
    lowered = rdd.lower()
    tokens = lowered.split(' ')
    return tokens

In [19]:
# A.1.1 and A.1.4 - Load the English transcripts as an RDD of lines, then report
# how many lines it holds and how many partitions Spark split it into.
print("File: europarl-v7.sv-en.en")
file_eng = spark_context.textFile(
    "hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.en"
)

# count() is an action: this is where the file is actually read.
line_count_eng = file_eng.count()
print("Line counting: " + str(line_count_eng))

partition_count_eng = file_eng.getNumPartitions()
print("Partition counting: " + str(partition_count_eng))

File: europarl-v7.sv-en.en
Line counting: 1862234
Partition counting: 2


In [52]:
# A.1.2 and A.1.4 - Load the Swedish transcripts as an RDD of lines, then report
# how many lines it holds and how many partitions Spark split it into.
print("File: europarl-v7.sv-en.sv")
file_sv = spark_context.textFile(
    "hdfs://192.168.2.113:9000/europarl/europarl-v7.sv-en.sv"
)

# count() is an action: this is where the file is actually read.
line_count_sv = file_sv.count()
print("Line counting: " + str(line_count_sv))

partition_count_sv = file_sv.getNumPartitions()
print("Partition counting: " + str(partition_count_sv))

File: europarl-v7.sv-en.sv
Line counting: 1862234
Partition counting: 3


In [37]:
# A.1.3 - Check that the English and Swedish files have the same number of lines,
# using the counts computed when each file was loaded.
line_counts_match = line_count_eng == line_count_sv
print(
    "The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv?\n"
    + str(line_counts_match)
)

The line counts are the same for both europarl-v7.sv-en.en and europarl-v7.sv-en.sv?
True


In [55]:
# A.2.1 - Pre-process both RDDs: every line becomes a list of lowercased tokens
# (split on space). map() is lazy, so nothing is computed in this cell.
file_eng_lc, file_sv_lc = (
    lines.map(func_lowercase_split) for lines in (file_eng, file_sv)
)

In [57]:
# A.2.3 - Check that pre-processing kept one output element per input line by
# comparing each original line count with the count of the tokenized RDD.
eng_count_preserved = line_count_eng == file_eng_lc.count()
print(
    "The line counts are the same for europarl-v7.sv-en.en before and after processing?\n"
    + str(eng_count_preserved)
)

sv_count_preserved = line_count_sv == file_sv_lc.count()
print(
    "The line counts are the same for europarl-v7.sv-en.sv before and after processing?\n"
    + str(sv_count_preserved)
)

The line counts are the same for europarl-v7.sv-en.en before and after processing?
True
The line counts are the same for europarl-v7.sv-en.sv before and after processing?
True


In [9]:
# Stop the SparkContext so the cores held by this application are released
# for other applications. Run this as the last cell.
spark_context.stop()