In [21]:
from pyspark.sql import SparkSession

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.113:7077") \
        .appName("JunjieChuA3PartA")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext
spark_context.setLogLevel("INFO")

In [22]:
#QA1 read file
lines1 = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.pl-en.en")
lines2 = spark_context.textFile("hdfs://192.168.2.113:9000/europarl/europarl-v7.pl-en.pl")
print(lines1.count())
print(lines2.count())
print(lines1.getNumPartitions())
print(lines2.getNumPartitions())

632565
632565
2
2


In [23]:
#QA2 preprocess
def preprocessmap(lines):
    lowlines=lines.map(lambda line: line.lower())
    words=lowlines.map(lambda word: word.split(' '))
    return(words)

def preprocessflatmap(lines):
    lowlines=lines.map(lambda line: line.lower())
    words=lowlines.flatMap(lambda word: word.split(' '))
    return(words)
    

In [24]:
wordsE = preprocessmap(lines1)
wordsP = preprocessmap(lines2)
print('En 10:')
print(wordsE.take(10))
print('Pl 10:')
print(wordsP.take(10))
print(f'lines number of en: {wordsE.count()}\n')
print(f'lines number of pl: {wordsP.count()}\n')

En 10:
[['action', 'taken', 'on', "parliament's", 'resolutions:', 'see', 'minutes'], ['documents', 'received:', 'see', 'minutes'], ['written', 'statements', '(rule', '116):', 'see', 'minutes'], ['texts', 'of', 'agreements', 'forwarded', 'by', 'the', 'council:', 'see', 'minutes'], ['membership', 'of', 'parliament:', 'see', 'minutes'], ['membership', 'of', 'committees', 'and', 'delegations:', 'see', 'minutes'], ['future', 'action', 'in', 'the', 'field', 'of', 'patents', '(motions', 'for', 'resolutions', 'tabled):', 'see', 'minutes'], ['agenda', 'for', 'next', 'sitting:', 'see', 'minutes'], ['closure', 'of', 'sitting'], ['(the', 'sitting', 'was', 'closed', 'at', '11.55', 'p.m.)']]
Pl 10:
[['działania', 'podjęte', 'w', 'wyniku', 'rezolucji', 'parlamentu:', 'patrz', 'protokól'], ['składanie', 'dokumentów:', 'patrz', 'protokół'], ['oświadczenia', 'pisemne', '(art.', '116', 'regulaminu):', 'patrz', 'protokół'], ['teksty', 'porozumień', 'przekazane', 'przez', 'radę:', 'patrz', 'protokół'], ['s

In [25]:
#QA3.compute the 10 most frequently according words
words1=preprocessflatmap(lines1)
words2=preprocessflatmap(lines2)

def countwords(words):
    countwords = words\
        .map(lambda key: (key,1))\
        .reduceByKey(lambda x,y:x+y)\
        .sortBy(lambda x:x[1],False)
    return (countwords)

countwords1=countwords(words1)
countwords2=countwords(words2)

print(countwords1.take(10))
print(countwords2.take(10))

[('the', 1157014), ('of', 558149), ('to', 505756), ('and', 451165), ('in', 370228), ('a', 255921), ('that', 253400), ('is', 247419), ('for', 186707), ('we', 172260)]
[('w', 488954), ('i', 342893), ('na', 220274), ('z', 189566), ('do', 163373), ('że', 156657), ('się', 155444), ('nie', 138417), ('jest', 118263), ('to', 97921)]


In [26]:
#QA4 translations
def preprocessmapindex(lines):
    lowlines=lines.map(lambda line: line.lower())
    words=lowlines.map(lambda word: word.split(' ')).zipWithIndex()
    wordsnew = words.map(lambda word:(word[1],word[0]))    
    return(wordsnew)

wordsa = preprocessmapindex(lines1)
wordsb = preprocessmapindex(lines2)

#check if wordsa and wordsb are what I want
print(wordsa.take(5))
print("\n")
print(wordsb.filter(lambda x:len(x[1])==1).take(15))#In Pl, most sentences having only 1 word only have a number.


[(0, ['action', 'taken', 'on', "parliament's", 'resolutions:', 'see', 'minutes']), (1, ['documents', 'received:', 'see', 'minutes']), (2, ['written', 'statements', '(rule', '116):', 'see', 'minutes']), (3, ['texts', 'of', 'agreements', 'forwarded', 'by', 'the', 'council:', 'see', 'minutes']), (4, ['membership', 'of', 'parliament:', 'see', 'minutes'])]


[(15, ['1.']), (17, ['2.']), (19, ['3.']), (21, ['4.']), (23, ['5.']), (25, ['6.']), (27, ['7.']), (29, ['8.']), (31, ['9.']), (33, ['10.']), (35, ['11.']), (37, ['12.']), (39, ['13.']), (41, ['14.']), (43, ['15.'])]


In [28]:
pairs = wordsa\
    .join(wordsb)\
    .sortBy(lambda x:x[0],True)

#remove Null lines
#use short sentences
pairs = pairs\
    .filter(lambda x: len(x[1][0])>=1)\
    .filter(lambda x: len(x[1][1])>=1)\
    .filter(lambda x: len(x[1][0])<=7)\
    .filter(lambda x: len(x[1][1])<=7)\
    .filter(lambda x: len(x[1][1])==len(x[1][0]))

print(f'lines number: {pairs.count()}\n')
print(pairs.take(5))

pairs = pairs\
    .flatMap(lambda pair:(list(zip(pair[1][0],pair[1][1]))))\
    .map(lambda p:(p,1))\
    .reduceByKey(lambda x,y:x+y)\
    .sortBy(lambda x:x[1],False)

print('\n')
print(pairs.take(15))

lines number: 16457

[(1, (['documents', 'received:', 'see', 'minutes'], ['składanie', 'dokumentów:', 'patrz', 'protokół'])), (7, (['agenda', 'for', 'next', 'sitting:', 'see', 'minutes'], ['porządek', 'dzienny', 'następnego', 'posiedzenia:', 'patrz', 'protokół'])), (12, (['documents', 'received:', 'see', 'minutes'], ['składanie', 'dokumentów:', 'patrz', 'protokół'])), (15, (['1.'], ['1.'])), (17, (['2.'], ['2.']))]


[(('(applause)', '(oklaski)'), 1653), (('is', 'jest'), 595), (('see', 'patrz'), 549), (('minutes', 'protokół'), 525), (('-', '-'), 451), (('1.', '1.'), 354), (('2.', '2.'), 353), (('3.', '3.'), 334), (('in', 'w'), 311), (('is', 'to'), 304), (('and', 'i'), 282), (('this', 'to'), 245), (('(', '('), 244), (('that', 'to'), 233), (('documents', 'składanie'), 215)]


In [20]:
# release the cores for another application!
spark_context.stop()