In [1]:
from pyspark.sql import SparkSession
from operator import add

# Build (or reuse, via getOrCreate) the Spark session for this notebook.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.156:7077")
    .appName("Zijie_V3_A")
    # Dynamic allocation with shuffle tracking; the external shuffle
    # service is explicitly disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# RDD API: the SparkContext is the entry point used by the cells below.
spark_context = spark_session.sparkContext
# Only log errors from here on.
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/10 10:24:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Stop the Spark session created in the first cell.
# NOTE(review): this cell sits before the analysis cells and shows no
# execution count (`In [None]`). Running the notebook top to bottom would
# stop the session that the later cells use through `spark_context` --
# presumably this is meant to be run last; confirm and move it to the end.
spark_session.stop()

A.1

In [5]:
# HDFS location of the English side of the Europarl sv-en corpus
english_corpus_path = "hdfs://192.168.2.156:9000/data/europarl/europarl-v7.sv-en.en"

# Read the file as an RDD of text lines
english_rdd = spark_context.textFile(english_corpus_path)

# count() is an action: it runs the job and returns the number of lines
english_line_count = english_rdd.count()
print("the number of lines of English RDD:", english_line_count)


[Stage 2:>                                                          (0 + 2) / 2]

the number of lines of English RDD: 1862234


                                                                                

In [6]:
# HDFS location of the Swedish side of the Europarl sv-en corpus
swedish_corpus_path = "hdfs://192.168.2.156:9000/data/europarl/europarl-v7.sv-en.sv"

# Read the file as an RDD of text lines
swedish_rdd = spark_context.textFile(swedish_corpus_path)

# count() is an action: it runs the job and returns the number of lines
swedish_line_count = swedish_rdd.count()
print("the number of lines of Swedish RDD:", swedish_line_count)




the number of lines of Swedish RDD: 1862234


                                                                                

In [10]:
# Report whether the English and Swedish RDDs have the same number of lines.
line_counts_match = english_line_count == swedish_line_count
print(
    "The line counts match between English and Swedish RDDs!"
    if line_counts_match
    else "Line counts do NOT match, check the data sources!"
)


The line counts match between English and Swedish RDDs!


In [8]:
# Ask each RDD how many partitions it is split into
english_partitions, swedish_partitions = (
    english_rdd.getNumPartitions(),
    swedish_rdd.getNumPartitions(),
)

# One line per corpus
print(
    f"English RDD has {english_partitions} partitions.",
    f"Swedish RDD has {swedish_partitions} partitions.",
    sep="\n",
)


English RDD has 2 partitions.
Swedish RDD has 3 partitions.


A.2

In [11]:
def preprocess_text(line):
    """Lowercase a line of text and split it into tokens.

    Splitting uses ``str.split()`` with no argument, so tokens are separated
    by any run of whitespace (not only single spaces) and punctuation stays
    attached to the neighbouring word.

    Args:
        line: The text line to tokenize.

    Returns:
        A list of lowercase tokens; an empty list for an empty or
        whitespace-only line.
    """
    lowered = line.lower()
    return lowered.split()

# Tokenize every line of both corpora with preprocess_text
preprocessed_english_rdd, preprocessed_swedish_rdd = (
    english_rdd.map(preprocess_text),
    swedish_rdd.map(preprocess_text),
)

# Inspect the first 10 tokenized lines of the English corpus
print("Sample of preprocessed English RDD:")
english_sample = preprocessed_english_rdd.take(10)
print(english_sample)

# Inspect the first 10 tokenized lines of the Swedish corpus
print("Sample of preprocessed Swedish RDD:")
swedish_sample = preprocessed_swedish_rdd.take(10)
print(swedish_sample)

# Confirm the two corpora still have the same number of lines after preprocessing
preprocessed_english_line_count = preprocessed_english_rdd.count()
preprocessed_swedish_line_count = preprocessed_swedish_rdd.count()

print(
    "The line counts still match after preprocessing!"
    if preprocessed_english_line_count == preprocessed_swedish_line_count
    else "Line counts do NOT match after preprocessing!"
)


Sample of preprocessed English RDD:


                                                                                

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

                                                                                

[['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.'], ['jag', 'ber', 'er', 'resa', 'er', 'för', 'en', 'tyst', 'minut.'], 



The line counts still match after preprocessing!


                                                                                

A.3

In [12]:
from operator import add

def count_words(tokenized_rdd):
    """Return an RDD of (word, occurrences) pairs.

    Args:
        tokenized_rdd: RDD whose elements are lists of tokens.

    Returns:
        RDD of ``(word, count)`` tuples, one per distinct token.
    """
    return (
        tokenized_rdd
        .flatMap(lambda tokens: tokens)      # one element per token
        .map(lambda token: (token, 1))
        .reduceByKey(add)                    # sum the 1s per distinct token
    )


# Word frequencies for each corpus
english_word_counts = count_words(preprocessed_english_rdd)
swedish_word_counts = count_words(preprocessed_swedish_rdd)

# takeOrdered sorts ascending, so the count is negated to get the most
# frequent words first.
top_10_english_words = english_word_counts.takeOrdered(10, key=lambda entry: -entry[1])
print("🔹 Top 10 words in English corpus:", top_10_english_words)

top_10_swedish_words = swedish_word_counts.takeOrdered(10, key=lambda entry: -entry[1])
print("🔹 Top 10 words in Swedish corpus:", top_10_swedish_words)


                                                                                

🔹 Top 10 words in English corpus: [('the', 3498574), ('of', 1659884), ('to', 1539823), ('and', 1288620), ('in', 1086089), ('that', 797576), ('a', 773812), ('is', 758087), ('for', 534270), ('we', 522879)]




🔹 Top 10 words in Swedish corpus: [('att', 1706309), ('och', 1344895), ('i', 1050989), ('det', 924878), ('som', 913302), ('för', 908703), ('av', 738102), ('är', 694389), ('en', 620347), ('vi', 539808)]


                                                                                

A.4

In [13]:
from operator import add


def _key_by_line_index(tokenized_rdd):
    """Turn each element into ``(index, element)`` using zipWithIndex."""
    return tokenized_rdd.zipWithIndex().map(lambda item: (item[1], item[0]))


def _is_short_equal_length_pair(record):
    """Decide whether a joined ``(index, (english, swedish))`` record is kept.

    A record is kept when both token lists are non-empty, both have fewer
    than 10 tokens, and both have the same number of tokens.
    """
    english_tokens, swedish_tokens = record[1]
    return bool(
        english_tokens
        and swedish_tokens
        and len(english_tokens) < 10
        and len(swedish_tokens) < 10
        and len(english_tokens) == len(swedish_tokens)
    )


# Key every tokenized line by its position in the corpus
english_indexed_rdd = _key_by_line_index(preprocessed_english_rdd)
swedish_indexed_rdd = _key_by_line_index(preprocessed_swedish_rdd)

# Pair up lines with the same index: (index, (english_tokens, swedish_tokens))
aligned_sentences_rdd = english_indexed_rdd.join(swedish_indexed_rdd)

# Drop empty sentences, keep short ones, and require equal word counts
filtered_sentences_rdd = aligned_sentences_rdd.filter(_is_short_equal_length_pair)

# Pair words position by position: (english_word, swedish_word)
word_pairs_rdd = filtered_sentences_rdd.flatMap(lambda record: zip(*record[1]))

# Count how often each (english_word, swedish_word) pair occurs
word_pair_counts = (
    word_pairs_rdd
    .map(lambda pair: (pair, 1))
    .reduceByKey(add)
)

# takeOrdered sorts ascending, so the count is negated to get the most
# frequent pairs first.
top_10_word_pairs = word_pair_counts.takeOrdered(10, key=lambda entry: -entry[1])
print("Top 10 translation word pairs:", top_10_word_pairs)




Top 10 translation word pairs: [(('is', 'är'), 10070), (('we', 'vi'), 5539), (('i', 'jag'), 5040), (('this', 'detta'), 3257), (('closed.', 'avslutad.'), 2980), (('and', 'och'), 2926), (('a', 'en'), 2892), (('it', 'det'), 2868), (('that', 'det'), 2807), (('not', 'inte'), 2652)]


                                                                                