In [1]:
from pyspark.sql import SparkSession

# Cluster size: (8 cores, 16gb per machine) x 5 = 40 cores

# New API: build (or reuse) the session against the standalone master.
# Executors are allocated dynamically, released after 30s of idleness,
# and each executor is given 4 cores.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .appName("Daniel_Agstrand_A3_Part1")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the context is what the RDD-based cells below use.
spark_context = spark_session.sparkContext



In [2]:
# Q A.1.1

# Load the English half of the corpus as (key, text) records.
# The RDD is cached because later cells reuse it.
rdd_en = spark_context.newAPIHadoopFile(
    path='hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en',
    inputFormatClass='org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    keyClass='org.apache.hadoop.io.LongWritable',
    valueClass='org.apache.hadoop.io.Text',
).cache()

# Number of lines in the English file.
line_count_en = rdd_en.count()

In [3]:
# Q A.1.2

# Load the Swedish half of the corpus as (key, text) records.
# The RDD is cached because later cells reuse it.
rdd_sv = spark_context.newAPIHadoopFile(
    path='hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv',
    inputFormatClass='org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    keyClass='org.apache.hadoop.io.LongWritable',
    valueClass='org.apache.hadoop.io.Text',
).cache()

# Number of lines in the Swedish file.
line_count_sv = rdd_sv.count()

In [4]:
# Q A.1.3

# Report whether the two language files contain the same number of lines.
print(
    "Equal amount of lines! Number of lines: {}".format(line_count_en)
    if line_count_sv == line_count_en
    else "Unequal amount of lines! Number of lines in en-sv = {} and Number of lines in sv-en = {}".format(
        line_count_en, line_count_sv
    )
)

Equal amount of lines! Number of lines: 1862234


In [5]:
# Q A.1.4

# Partition count of each RDD, English first, then Swedish.
rdd_en_partitions_count, rdd_sv_partitions_count = (
    rdd.getNumPartitions() for rdd in (rdd_en, rdd_sv)
)

# Combined number of partitions across both RDDs.
total_partitions_count = sum((rdd_en_partitions_count, rdd_sv_partitions_count))

print(
    "Number of partions in rdd_en: {}\nNumber of partions in rdd_sv: {}\nNumber of total partions in all rdd: {}".format(
        rdd_en_partitions_count, rdd_sv_partitions_count, total_partitions_count
    )
)

Number of partions in rdd_en: 2
Number of partions in rdd_sv: 3
Number of total partions in all rdd: 5


In [41]:
# Q A.2.1
from operator import add

def lowercase_split(line):
    """Lowercase the text of one record and split it into word tokens.

    Args:
        line: A ``(key, text)`` pair as produced by ``newAPIHadoopFile``;
            only the element at index 1 (the text) is used.

    Returns:
        list[str]: The lowercased, whitespace-separated tokens of the text.
        An empty or all-whitespace line yields an empty list.
    """
    # split() with no argument splits on runs of whitespace and drops
    # leading/trailing whitespace. split(" ") would instead emit
    # empty-string tokens for repeated spaces and [''] for an empty line,
    # which would later be counted as "words".
    return str(line[1]).lower().split()

# Tokenise every record of both corpora (one token list per input line).
rdd_en_mapped, rdd_sv_mapped = (
    rdd.map(lowercase_split) for rdd in (rdd_en, rdd_sv)
)

In [45]:
# Q A.2.1

# Show the first ten tokenised lines of each corpus; sep="" keeps the
# pieces glued together exactly as plain string concatenation would.
print(
    "10 entries in rdd_en_mapped: ",
    rdd_en_mapped.take(10),
    "\n10 entries in rdd_vs_mapped: ",
    rdd_sv_mapped.take(10),
    sep="",
)

10 entries in rdd_en_mapped: [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'request

In [44]:
# Record counts after tokenisation (English first, then Swedish).
line_count_en_mapped, line_count_sv_mapped = (
    rdd_en_mapped.count(),
    rdd_sv_mapped.count(),
)

In [46]:
# Q A.2.2

# Tokenising with map() should leave the number of records unchanged, so
# both mapped counts are compared with the original line counts.
# The logical operator `and` is used instead of the bitwise `&`: it
# short-circuits and does not need extra parentheses around the comparisons.
if line_count_en_mapped == line_count_en and line_count_sv_mapped == line_count_sv:
    print("Line count still match!")
else:
    print("Line count doesnt match any more!")

Line count still match!


In [10]:
# spark_session.stop()  # Uncomment and run once finished to release the cluster's executors.

In [42]:
# Q A.3.1
from operator import add

# Flatten the per-line token lists into one stream of words, pair each
# word with a count of 1, and sum the counts per word.
wordCounts_en = (
    rdd_en_mapped
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
    .reduceByKey(add)
)

[('the', 3498375),
 ('of', 1659758),
 ('to', 1539760),
 ('and', 1288401),
 ('in', 1085993),
 ('that', 797516),
 ('a', 773522),
 ('is', 758050),
 ('for', 534242),
 ('we', 522849)]

In [43]:
# Q A.3.2

# Ten most frequent English words: order by negated count so that the
# largest counts come first.
wordCounts_en.takeOrdered(
    10,
    key=lambda word_count: -word_count[1],
)

[('the', 3498375),
 ('of', 1659758),
 ('to', 1539760),
 ('and', 1288401),
 ('in', 1085993),
 ('that', 797516),
 ('a', 773522),
 ('is', 758050),
 ('for', 534242),
 ('we', 522849)]

In [None]:
# Q A.4.1

