In [1]:
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.87:7077") \
        .appName("Daniel_Agstrand_A3_Part1")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()
        
# Old API (RDD)
spark_context = spark_session.sparkContext



In [2]:
# Q A.1.1

rdd_en = spark_context.newAPIHadoopFile(
    'hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text'
)\
.cache() # Keep this RDD in memory!

line_count_en = rdd_en.count()

In [47]:
test = sc.parallelize(rdd_en)

NameError: name 'sc' is not defined

In [3]:
# Q A.1.2

rdd_sv = spark_context.newAPIHadoopFile(
    'hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text'
)\
.cache() # Keep this RDD in memory!

line_count_sv = rdd_sv.count()

In [4]:
# Q A.1.3

if line_count_sv == line_count_en:
    print ("Equal amount of lines! Number of lines: {}".format(line_count_en))
else:
    print ("Unequal amount of lines! Number of lines in en-sv = {} and Number of lines in sv-en = {}"\
           .format(line_count_en, line_count_sv))

Equal amount of lines! Number of lines: 1862234


In [5]:
# Q A.1.4

rdd_en_partitions_count = rdd_en.getNumPartitions()
rdd_sv_partitions_count = rdd_sv.getNumPartitions()

total_partitions_count = rdd_en_partitions_count + rdd_sv_partitions_count

print("Number of partions in rdd_en: {}\nNumber of partions in rdd_sv: {}\nNumber of total partions in all rdd: {}"\
      .format(rdd_en_partitions_count, rdd_sv_partitions_count, total_partitions_count))

Number of partions in rdd_en: 2
Number of partions in rdd_sv: 3
Number of total partions in all rdd: 5


In [41]:
# Q A.2.1
from operator import add

def lowercase_split(line):
    line = str(line[1]).lower().split(" ")
    return line

rdd_en_mapped = rdd_en.map(lowercase_split) 
rdd_sv_mapped = rdd_sv.map(lowercase_split)

In [45]:
# Q A.2.1

print("10 entries in rdd_en_mapped: " + str(rdd_en_mapped.take(10)) + "\n10 entries in rdd_vs_mapped: " + str(rdd_sv_mapped.take(10)))

10 entries in rdd_en_mapped: [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'request

In [44]:
line_count_en_mapped = rdd_en_mapped.count()
line_count_sv_mapped = rdd_sv_mapped.count()

In [46]:
# Q A.2.2

#print(line_count_en_mapped)
#print(line_count_en)
#print(line_count_sv_mapped)
#print(line_count_sv)

if ((line_count_en_mapped == line_count_en) & (line_count_sv_mapped == line_count_sv)):
    print("Line count still match!")
else:
    print("Line count doesnt match any more!")

Line count still match!


In [10]:
#spark_session.stop()

In [49]:
# Q A.3.1
from operator import add

wordCounts_en = rdd_en_mapped.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(add)
wordCounts_sv = rdd_sv_mapped.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(add)

In [51]:
# Q A.3.2

print("10 most common words in the english text: {}".format(wordCounts_en.takeOrdered(10, key=lambda x: -x[1])))
print("10 most common words in the swedish text: {}".format(wordCounts_sv.takeOrdered(10, key=lambda x: -x[1])))

10 most common words in the english text: [('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]
10 most common words in the swedish text: [('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [83]:
# Q A.4.1

rdd_en_1 = rdd_en_mapped.zipWithIndex()
rdd_sv_1 = rdd_sv_mapped.zipWithIndex()

[(['resumption', 'of', 'the', 'session'], 0),
 (['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.'],
  1),
 (['although,',
   'as',
   'you',
   'will',
   'have',
   'seen,',
   'the',
   'dreaded',
   "'millennium",
   "bug'",
   'failed',
   'to',
   'materialise,',
   'still',
   'the',
   'people',
   'in',
   'a',
   'number',
   'of',
   'countries',
   'suffered',
   'a',
   'series',
   'of',
   'natural',
   'disasters',
   'that',
   'truly',
   'were',
   'dreadful.'],
  2),
 (['you',
   'have',
   'requested',
   'a',
   'debate',
   'on',
   'this',
   'subject',
   'in',
   'the',
   'course',
  

In [86]:
# Q A.4.2

rdd_en_2 = rdd_en_1.map(lambda x: (x[1], x[0]))
rdd_sv_2 = rdd_sv_1.map(lambda x: (x[1], x[0]))

[(0, ['resumption', 'of', 'the', 'session']),
 (1,
  ['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.']),
 (2,
  ['although,',
   'as',
   'you',
   'will',
   'have',
   'seen,',
   'the',
   'dreaded',
   "'millennium",
   "bug'",
   'failed',
   'to',
   'materialise,',
   'still',
   'the',
   'people',
   'in',
   'a',
   'number',
   'of',
   'countries',
   'suffered',
   'a',
   'series',
   'of',
   'natural',
   'disasters',
   'that',
   'truly',
   'were',
   'dreadful.']),
 (3,
  ['you',
   'have',
   'requested',
   'a',
   'debate',
   'on',
   'this',
   'subject',
   'in',
   'the',
   'course

In [87]:
# Q A.4.3

rdd_en_sv_1 = rdd_en_2.join(rdd_sv_2).map(lambda x: (x[1]))

[(['i',
   'would',
   'like',
   'to',
   'say',
   'that',
   'as',
   'regards',
   'the',
   'status',
   'of',
   'those',
   'countries',
   'which',
   'are',
   'not',
   'members',
   'of',
   'the',
   'european',
   'union',
   'but',
   'which',
   'are',
   'members',
   'of',
   'the',
   'alliance',
   'and',
   'eventually',
   'will',
   'be',
   'members',
   'of',
   'the',
   'european',
   'union,',
   'we',
   'would',
   'like',
   'to',
   'get',
   'them',
   'involved.'],
  ['efter',
   'det',
   'samtal',
   'som',
   'vi',
   'har',
   'haft',
   'med',
   'kommissionen',
   'är',
   'det',
   'en',
   'bra',
   'tanke',
   'att',
   'gå',
   'vidare',
   'i',
   'den',
   'riktningen,',
   'men',
   'jag',
   'tror',
   'att',
   'vi',
   'har',
   'en',
   'lång',
   'väg',
   'kvar',
   'till',
   'en',
   'diplomatisk',
   'kår',
   'som',
   'sådan.']),
 (['this',
   'is',
   'why',
   'the',
   'group',
   'of',
   'the',
   'european',
   'liberal,',


In [93]:
# Q A.4.4

rdd_en_sv_2 = rdd_en_sv_1.filter(lambda x: len(x[0])>0)

In [95]:
# Q A.4.5

rdd_en_sv_3 = rdd_en_sv_2.filter(lambda x: len(x[0])<10)

[(['the',
   'member',
   'states',
   'are',
   'still',
   'insufficiently',
   'aware',
   'of',
   'the',
   'fraud',
   'issue.'],
  ['medlemsstaterna',
   'är',
   'fortfarande',
   'inte',
   'tillräckligt',
   'medvetna',
   'om',
   'bedrägeriproblemet.']),
 (['mr',
   'hager',
   'has',
   'shown',
   'that',
   'very',
   'clearly.',
   'these',
   'standards',
   'must',
   'be',
   'laid',
   'down',
   'in',
   'the',
   'description',
   'of',
   'the',
   'product.'],
  ['kollegan', 'hager', 'har', 'förklarat', 'det', 'mycket', 'tydligt.']),
 (['this', 'power', 'must', 'not', 'be', 'relinquished.'],
  ['den', 'befogenheten', 'får', 'vi', 'inte', 'låta', 'gå', 'förlorad.']),
 (['the', 'vote', 'will', 'take', 'place', 'tomorrow', 'at', '12', 'noon.'],
  ['omröstningen',
   'kommer',
   'att',
   'äga',
   'rum',
   'i',
   'morgon',
   'kl.',
   '12.00.']),
 (['the', 'package', 'consists', 'of', 'amendment', 'nos', '19-26.'],
  ['paketet', 'består', 'av', 'ändringsförslag

In [96]:
# Q A.4.6

rdd_en_sv_4 = rdd_en_sv_3.filter(lambda x: len(x[0])==len(x[1]))

[(['the', 'question', 'is:', 'who', 'prosecutes?'],
  ['frågan', 'är:', 'vem', 'skall', 'åtala?']),
 (['most', 'of', 'the', 'report', 'is', 'about', 'financial', 'control.'],
  ['största',
   'delen',
   'av',
   'betänkandet',
   'handlar',
   'om',
   'ekonomisk',
   'kontroll.']),
 (['securing', 'this', 'is', 'a', 'major', 'priority.'],
  ['att', 'garantera', 'denna', 'är', 'en', 'huvudfråga.']),
 ([''], ['.']),
 (['everyone', 'was', 'quite', 'dumbfounded', 'by', 'this', 'attitude.'],
  ['alla', 'blev', 'verkligen', 'förvånade', 'över', 'denna', 'attityd.']),
 ([''], ['frågor.']),
 (['the', 'debate', 'is', 'closed.'],
  ['jag', 'förklarar', 'debatten', 'avslutad.']),
 (['that', 'is', 'feasible,', 'in', 'any', 'case.'],
  ['genomförbart', 'är', 'det', 'i', 'varje', 'fall.']),
 ([''], ['.']),
 (['i', 'see', 'that', 'you', 'agree', 'to', 'this', 'suggestion.'],
  ['(applåder)', 'jag', 'ser', 'att', 'ni', 'instämmer', 'i', 'förslaget.'])]

In [110]:
# Q A.4.7

rdd_en_sv_5  = rdd_en_sv_4.map(lambda x: list(zip(x[0], x[1])))                         

[[('it', 'den'),
  ('also', 'hjälper'),
  ('helps', 'också'),
  ('the', 'till'),
  ('national', 'att'),
  ('identity', 'bevara'),
  ('to', 'den'),
  ('be', 'nationella'),
  ('preserved.', 'identiteten.')],
 [('other', 'det'),
  ('projects', 'finns'),
  ('are', 'också'),
  ('still', 'andra'),
  ('in', 'projekt'),
  ('the', 'under'),
  ('pipeline.', 'planering.')],
 [('applause', 'applåder')],
 [('the', 'de'),
  ('amendments', 'framlagda'),
  ('tabled', 'ändringsförslagen'),
  ('are', 'utgör'),
  ('therefore', 'därför'),
  ('a', 'en'),
  ('significant', 'betydande'),
  ('improvement.', 'förbättring.')],
 [('the', 'det'), ('first', 'första'), ('is', 'är'), ('safety.', 'säkerhet.')]]

In [111]:
# Q A.4.7

rdd_en_sv_6 = rdd_en_sv_5.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(add)
rdd_en_sv_6.takeOrdered(10, key=lambda x: -x[1])

[(('is', 'är'), 10040),
 (('we', 'vi'), 5530),
 (('i', 'jag'), 5020),
 (('this', 'detta'), 3252),
 (('closed.', 'avslutad.'), 2964),
 (('and', 'och'), 2917),
 (('a', 'en'), 2888),
 (('it', 'det'), 2866),
 (('that', 'det'), 2806),
 (('not', 'inte'), 2650)]