In [1]:
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.87:7077") \
        .appName("Daniel_Agstrand_A3_SectionA")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()
        
# Old API (RDD)
spark_context = spark_session.sparkContext

In [2]:
# Q A.1.1

rdd_en = spark_context.newAPIHadoopFile(
    'hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text'
)\
.cache() # Keep this RDD in memory!

line_count_en = rdd_en.count()

In [3]:
# Q A.1.2

rdd_sv = spark_context.newAPIHadoopFile(
    'hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text'
)\
.cache() # Keep this RDD in memory!

line_count_sv = rdd_sv.count()

In [4]:
# Q A.1.3

if line_count_sv == line_count_en:
    print ("Equal amount of lines! Number of lines: {}".format(line_count_en))
else:
    print ("Unequal amount of lines! Number of lines in en-sv = {} and Number of lines in sv-en = {}"\
           .format(line_count_en, line_count_sv))

Equal amount of lines! Number of lines: 1862234


In [5]:
# Q A.1.4

rdd_en_partitions_count = rdd_en.getNumPartitions()
rdd_sv_partitions_count = rdd_sv.getNumPartitions()

total_partitions_count = rdd_en_partitions_count + rdd_sv_partitions_count

print("Number of partions in rdd_en: {}\nNumber of partions in rdd_sv: {}\nNumber of total partions in all rdd: {}"\
      .format(rdd_en_partitions_count, rdd_sv_partitions_count, total_partitions_count))

Number of partions in rdd_en: 2
Number of partions in rdd_sv: 3
Number of total partions in all rdd: 5


In [6]:
# Q A.2.1
from operator import add

def lowercase_split(line):
    line = str(line[1]).lower().split(" ")
    return line

rdd_en_mapped = rdd_en.map(lowercase_split) 
rdd_sv_mapped = rdd_sv.map(lowercase_split)

In [7]:
# Q A.2.1

print("10 entries in rdd_en_mapped: " + str(rdd_en_mapped.take(10)) + "\n10 entries in rdd_vs_mapped: " + str(rdd_sv_mapped.take(10)))

10 entries in rdd_en_mapped: [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'request

In [8]:
# Q A.2.2

line_count_en_mapped = rdd_en_mapped.count()
line_count_sv_mapped = rdd_sv_mapped.count()

if ((line_count_en_mapped == line_count_en) & (line_count_sv_mapped == line_count_sv)):
    print("Line count still match!")
else:
    print("Line count doesnt match any more!")

Line count still match!


In [9]:
# Q A.3.1
from operator import add

wordCounts_en = rdd_en_mapped.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(add)
wordCounts_sv = rdd_sv_mapped.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(add)

In [10]:
# Q A.3.2

print("10 most common words in the english text: {}".format(wordCounts_en.takeOrdered(10, key=lambda x: -x[1])))
print("10 most common words in the swedish text: {}".format(wordCounts_sv.takeOrdered(10, key=lambda x: -x[1])))

10 most common words in the english text: [('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]
10 most common words in the swedish text: [('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [11]:
# Q A.4.1

rdd_en_1 = rdd_en_mapped.zipWithIndex()
rdd_sv_1 = rdd_sv_mapped.zipWithIndex()

rdd_en_1.take(5)

[(['resumption', 'of', 'the', 'session'], 0),
 (['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.'],
  1),
 (['although,',
   'as',
   'you',
   'will',
   'have',
   'seen,',
   'the',
   'dreaded',
   "'millennium",
   "bug'",
   'failed',
   'to',
   'materialise,',
   'still',
   'the',
   'people',
   'in',
   'a',
   'number',
   'of',
   'countries',
   'suffered',
   'a',
   'series',
   'of',
   'natural',
   'disasters',
   'that',
   'truly',
   'were',
   'dreadful.'],
  2),
 (['you',
   'have',
   'requested',
   'a',
   'debate',
   'on',
   'this',
   'subject',
   'in',
   'the',
   'course',
  

In [12]:
# Q A.4.2

rdd_en_2 = rdd_en_1.map(lambda x: (x[1], x[0]))
rdd_sv_2 = rdd_sv_1.map(lambda x: (x[1], x[0]))

rdd_en_2.take(5)

[(0, ['resumption', 'of', 'the', 'session']),
 (1,
  ['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.']),
 (2,
  ['although,',
   'as',
   'you',
   'will',
   'have',
   'seen,',
   'the',
   'dreaded',
   "'millennium",
   "bug'",
   'failed',
   'to',
   'materialise,',
   'still',
   'the',
   'people',
   'in',
   'a',
   'number',
   'of',
   'countries',
   'suffered',
   'a',
   'series',
   'of',
   'natural',
   'disasters',
   'that',
   'truly',
   'were',
   'dreadful.']),
 (3,
  ['you',
   'have',
   'requested',
   'a',
   'debate',
   'on',
   'this',
   'subject',
   'in',
   'the',
   'course

In [13]:
# Q A.4.3

rdd_en_sv_1 = rdd_en_2.join(rdd_sv_2).map(lambda x: (x[1]))

rdd_en_sv_1.take(5)

[(['in',
   'setting',
   'up',
   'numerous',
   'hurdles',
   'their',
   'sole',
   'aim',
   'is',
   'to',
   'hamper',
   'an',
   'efficient',
   'ecological',
   'material',
   'flow',
   'policy',
   'and',
   'by',
   'going',
   'down',
   'the',
   'road',
   'of',
   'type',
   'approvals',
   'they',
   'seek',
   'to',
   'delay',
   'the',
   'implementation',
   'of',
   'the',
   'directives',
   'by',
   '12',
   'years',
   'or',
   'more.'],
  ['genom',
   'uppbyggnaden',
   'av',
   'talrika',
   'hinder',
   'vill',
   'de',
   'ingenting',
   'annat',
   'än',
   'att',
   'förhindra',
   'en',
   'effektiv',
   'ekologisk',
   'politik',
   'för',
   'avfallsströmmar,',
   'och',
   'via',
   'typgodkännanden',
   'vill',
   'de',
   'fördröja',
   'tillämpningen',
   'av',
   'direktiven',
   'med',
   'tolv',
   'år',
   'eller',
   'mer.']),
 (['the',
   'entry',
   'into',
   'force',
   'of',
   'the',
   'ottawa',
   'convention',
   'a',
   'year',
   'a

In [14]:
# Q A.4.4

rdd_en_sv_2 = rdd_en_sv_1.filter(lambda x: len(x[0])>0)

rdd_en_sv_2.take(5)

[(['what',
   'action',
   'has',
   'the',
   'council',
   'taken',
   'to',
   'encourage',
   'the',
   'development',
   'of',
   'political',
   'relations',
   'between',
   'armenia',
   'and',
   'turkey?'],
  ['vilka',
   'åtgärder',
   'har',
   'rådet',
   'vidtagit',
   'för',
   'att',
   'främja',
   'utvecklingen',
   'av',
   'politiska',
   'förbindelser',
   'mellan',
   'armenien',
   'och',
   'turkiet?']),
 (['is', 'that', 'not', 'so,', 'commissioner?'],
  ['eller', 'hur', 'fru', 'kommissionär?']),
 (['next,',
   'what',
   'form',
   'can',
   'a',
   'universal',
   'service',
   'take',
   'nowadays?'],
  ['när',
   'det',
   'sedan',
   'gäller',
   'samhällsomfattande',
   'tjänster,',
   'vilken',
   'form',
   'kan',
   'de',
   'ha',
   'i',
   'dag?']),
 (['this',
   'does',
   'not',
   'mean',
   'that',
   'there',
   'is',
   'not',
   'a',
   'strong',
   'will',
   'to',
   'move',
   'forward.'],
  ['det',
   'betyder',
   'inte',
   'att',
   'det

In [15]:
# Q A.4.5

rdd_en_sv_3 = rdd_en_sv_2.filter(lambda x: len(x[0])<10)

rdd_en_sv_3.take(5)

[(['i', 'have', 'a', 'number', 'of', 'concerns.'],
  ['jag', 'bekymrar', 'mig', 'för', 'en', 'rad', 'saker.']),
 (['the', 'third', 'point', 'concerns', 'the', 'issue', 'of', 'unanimity.'],
  ['den', 'tredje', 'punkten', 'gäller', 'frågan', 'om', 'enhällighet.']),
 (['commissioner,', 'thank', 'you', 'for', 'your', 'contribution.'],
  ['herr', 'kommissionär!', 'tack', 'för', 'ert', 'inlägg.']),
 (['perhaps', 'we', 'should', 'no', 'longer', 'do', 'that', 'so', 'often.'],
  ['kanske', 'borde', 'man', 'inte', 'längre', 'göra', 'det', 'så', 'ofta.']),
 (['the', 'regulation', 'is', 'an', 'initial', 'response', 'to', 'this.'],
  [''])]

In [16]:
# Q A.4.6

rdd_en_sv_4 = rdd_en_sv_3.filter(lambda x: len(x[0])==len(x[1]))

rdd_en_sv_4.take(5)

[(['however,', 'two', 'problems', 'emerge', 'from', 'this', 'situation.'],
  ['denna', 'situation', 'leder', 'dock', 'till', 'två', 'problem.']),
 (['madam', 'president,', 'i', 'am', 'against', 'this', 'oral', 'amendment.'],
  ['fru',
   'talman!',
   'jag',
   'är',
   'emot',
   'detta',
   'muntliga',
   'ändringsförslag.']),
 (['we',
   'include',
   'social',
   'and',
   'environmental',
   'sustainability',
   'in',
   'competitiveness.'],
  ['i',
   'konkurrenskraft',
   'inkluderar',
   'vi',
   'social',
   'och',
   'miljömässig',
   'hållbarhet.']),
 (['(laughter)'], ['(skratt)']),
 (['why?'], ['varför?'])]

In [17]:
# Q A.4.7

rdd_en_sv_5  = rdd_en_sv_4.map(lambda x: list(zip(x[0], x[1])))

rdd_en_sv_5.take(5)

[[('it', 'det'),
  ('is', 'är'),
  ('the', 'ett'),
  ('first', 'första'),
  ('step.', 'steg.')],
 [('now', 'i'),
  ('it', 'dag'),
  ('is', 'är'),
  ('the', 'det'),
  ('euro.', 'euron.')],
 [('there', 'det'),
  ('are', 'finns'),
  ('countless', 'oräkneliga'),
  ('reports', 'betänkanden'),
  ('on', 'som'),
  ('this', 'berör'),
  ('subject.', 'detta.')],
 [('it', 'det'),
  ('is', 'är'),
  ('important', 'viktigt'),
  ('to', 'att'),
  ('distinguish', 'skilja'),
  ('between', 'mellan'),
  ('two', 'två'),
  ('separate', 'enskilda'),
  ('things.', 'saker.')],
 [('the', 'de'),
  ('macro-economic', 'makroekonomiska'),
  ('indicators', 'indikatorerna'),
  ('have', 'har'),
  ('been', 'utvecklats'),
  ('very', 'mycket'),
  ('positive', 'positivt'),
  ('in', 'i'),
  ('bulgaria.', 'bulgarien.')]]

In [18]:
# Q A.4.8

rdd_en_sv_6 = rdd_en_sv_5.flatMap(lambda x: x).map(lambda word: (word, 1)).reduceByKey(add)

rdd_en_sv_6.take(5)

[(('in', 'i'), 1932),
 (('more', 'fler'), 71),
 (('a', 'en'), 2888),
 (('clear', 'bra'), 1),
 (('good', 'tydlig'), 1)]

In [19]:
# Q A.4.9

rdd_en_sv_6.takeOrdered(10, key=lambda x: -x[1])

[(('is', 'är'), 10040),
 (('we', 'vi'), 5530),
 (('i', 'jag'), 5020),
 (('this', 'detta'), 3252),
 (('closed.', 'avslutad.'), 2964),
 (('and', 'och'), 2917),
 (('a', 'en'), 2888),
 (('it', 'det'), 2866),
 (('that', 'det'), 2806),
 (('not', 'inte'), 2650)]

In [20]:
spark_session.stop()