<a href="https://colab.research.google.com/github/Mostafadars/Wikimedia-BD-Project/blob/main/bd_ass_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Install the PySpark Package***

In [None]:
#pip install pyspark

# **Run This if you work on the Google Colab**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ***Import Spark Modules***

In [None]:

from pyspark import SparkContext
import time
import re


# ***Initialize The Spark Session and Read the Data***

In [None]:
# Initialize Spark session
sc = SparkContext(appName="WikimediaPageViews")

# Uncomment the line below if you work on google colab
# Example path: "/content/drive/MyDrive/Big Data Ass 2/pagecounts-20160101-000000_parsed.out"
data_path = "/content/drive/MyDrive/pagecounts-20160101-000000_parsed.out"

# Uncomment the line below if you work on local
#data_path = "./pagecounts-20160101-000000_parsed.out"

lines = sc.textFile(data_path)

In [None]:
# Load and parse the data
def parse_line(line):
    parts = line.strip().split(' ')
    if len(parts) < 4:
        return None
    project, title, hits, size = parts[0], parts[1], int(parts[2]), int(parts[3])
    return (project, title, hits, size)

In [None]:
parsed = lines.map(parse_line).filter(lambda x: x is not None)

# ***Function 1: Compute min, max, and average page size***

In [None]:
# Function_1 - Spark Map Reduce

def function_1_map_reduce(parsed_rdd):
    start = time.time()
    print(f"Start time: {start}")

    # Extract page sizes
    page_sizes = parsed_rdd.map(lambda x: x[3])  # x[3] is page size

    # MapReduce operations
    min_size = page_sizes.min()
    max_size = page_sizes.max()

    total_size = page_sizes.sum()
    count = page_sizes.count()
    avg_size = total_size / count if count > 0 else 0

    end = time.time()
    print(f"End time: {end}")

    print(f"Min Page Size: {min_size}")
    print(f"Max Page Size: {max_size}")
    print(f"Average Page Size: {avg_size:.2f}")


# running function 1
function_1_map_reduce(parsed)

Start time: 1747382962.4384491
End time: 1747383008.8927279
Min Page Size: 0
Max Page Size: 141180155987
Average Page Size: 132239.57


In [None]:
# Function_1 - Spark Loops
def function_1_spark_loop(rdd):
    print("\nQ1 Spark foreach")
    start = time.time()
    print(f"Start Time: {start}")

    sum_size = sc.accumulator(0)
    count = sc.accumulator(0)

    min_size = float('inf')
    max_size = float('-inf')

    def update_stats(x):
        size = x[3]
        sum_size.add(size)
        count.add(1)
        return size

    rdd.foreach(update_stats)

    sizes = rdd.map(lambda x: x[3]).collect()
    min_size = min(sizes)
    max_size = max(sizes)

    end = time.time()
    print(f"End Time: {end}")
    print(f"Min page size: {min_size}")
    print(f"Max page size: {max_size}")
    print(f"Average page size: {sum_size.value / count.value:.2f}")
    print(f"Time: {end - start:.4f} seconds")

function_1_spark_loop(parsed)


Q1 Spark foreach
Start Time: 1747383008.9113514
End Time: 1747383032.1261582
Min page size: 0
Max page size: 141180155987
Average page size: 132239.57
Time: 23.2148 seconds


In [None]:
# Function_1 - Normal Loops
def function_1_normal_loop(rdd):
    start_time = time.time()
    min_page_title = float('inf')
    max_page_title = float('-inf')
    sum_page_title = 0
    count = 0

    for _, _, _, value in rdd.toLocalIterator():
        if value < min_page_title:
            min_page_title = value

        if value > max_page_title:
            max_page_title = value

        sum_page_title += value
        count += 1

    avg_page_title = sum_page_title / count

    print("Minimum page title size:", min_page_title)
    print("Maximum page title size:", max_page_title)
    print("Average page title size:", avg_page_title)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Elapsed time:", elapsed_time, "seconds")


function_1_normal_loop(parsed)

Minimum page title size: 0
Maximum page title size: 141180155987
Average page title size: 132239.56957446598
Elapsed time: 15.96462106704712 seconds


# ***Function 2: Count page titles starting with "The" not in English project***

In [None]:
# Function_2 - Spark Map Reduce
def function_2_map_reduce(parsed_rdd):
    start = time.time()
    print(f"Start time: {start}")

    titles_with_the = parsed_rdd.filter(lambda x: x[1].startswith("The")) # x[1] is title

    count_titles_with_the = titles_with_the.count()

    non_english_titles_with_the = titles_with_the.filter(lambda x: x[0] != "en") # x[0] is project code

    # Count how many non-English titles start with "The"
    count_non_english = non_english_titles_with_the.count()

    end = time.time()
    print(f"End time: {end}")

    print(f"Total titles starting with 'The': {count_titles_with_the}")
    print(f"Titles starting with 'The' and NOT in English project: {count_non_english}")


# running function 2
function_2_map_reduce(parsed)


Start time: 1747383048.1531687
End time: 1747383067.318152
Total titles starting with 'The': 45020
Titles starting with 'The' and NOT in English project: 10292


In [None]:
# Function_2 - Spark Loops
def function_2_spark_loop(rdd):
    print("\nQ2 Spark foreach")
    start = time.time()
    print(f"Start Time: {start:.4f}")

    total_count = sc.accumulator(0)
    not_en_count = sc.accumulator(0)

    def count_titles(x):
        if x[1].startswith("The"):
            total_count.add(1)
            if x[0] != "en":
                not_en_count.add(1)

    rdd.foreach(count_titles)

    end = time.time()
    print(f"End Time: {end:.4f}")
    print(f"Titles starting with 'The': {total_count.value}")
    print(f"Titles not in English: {not_en_count.value}")
    print(f"Time: {end - start:.4f} seconds")


function_2_spark_loop(parsed)


Q2 Spark foreach
Start Time: 1747383067.3315
End Time: 1747383078.4114
Titles starting with 'The': 45020
Titles not in English: 10292
Time: 11.0799 seconds


In [None]:
# Function_2 - Normal Loops
def function_2_normal_loop(rdd):
    print("\nQ2 Normal Loops")
    total = 0
    not_en = 0
    start = time.time()
    print(f"Start Time: {start:.4f}")

    for x in rdd.toLocalIterator():
        if x[1].startswith("The"):
            total += 1
            if x[0] != "en":
                not_en += 1

    end = time.time()
    print(f"End Time: {end:.4f}")
    print(f"Titles starting with 'The': {total}")
    print(f"Titles not in English: {not_en}")
    print(f"Time: {end - start:.4f} seconds")


function_2_normal_loop(parsed)


Q2 Normal Loops
Start Time: 1747383078.4223
End Time: 1747383093.6348
Titles starting with 'The': 45020
Titles not in English: 10292
Time: 15.2126 seconds


# ***Function 3: Count unique terms in page titles***

In [None]:
# Function_3 - Spark Map Reduce
def function_3_map_reduce(parsed_rdd):
    print("\nQ3 Spark MapReduce")

    start = time.time()
    print(f"Start Time: {start}")

    unique_terms = (
        parsed_rdd
        .flatMap(lambda x: re.split(r'_', x[1].lower()))
        .map(lambda term: re.sub(r'[^a-zA-Z0-9]', '', term))
        .filter(lambda term: term != "")
        .distinct()
    )

    count_unique = unique_terms.count()

    end = time.time()
    print(f"End Time: {end}")
    print(f"Number of unique terms: {count_unique}")
    print(f"Time: {end - start:.4f} seconds")

function_3_map_reduce(parsed)



Q3 Spark MapReduce
Start Time: 1747383093.645378
End Time: 1747383137.2551167
Number of unique terms: 1688528
Time: 43.6097 seconds


In [None]:
# Function_3 - Spark Loops
def function_3_spark_loop(rdd):
    print("\nQ3 Spark foreach")
    start = time.time()
    print(f"Start Time: {start}")

    all_terms = rdd.flatMap(lambda x: re.split(r'_', x[1].lower())).collect()

    unique_terms = set(all_terms)

    end = time.time()
    print(f"End Time: {end}")
    print(f"Number of unique terms: {len(unique_terms)}")
    print(f"Time: {end - start:.4f} seconds")


function_3_spark_loop(parsed)


Q3 Spark foreach
Start Time: 1747383137.265265
End Time: 1747383156.082223
Number of unique terms: 1793113
Time: 18.8170 seconds


In [None]:
# Function_3 - Normal Loops
def function_3_normal_loop(rdd):
    print("\nQ3 Normal Loops")
    start = time.time()
    print(f"Start Time: {start}")

    terms = set()
    for x in rdd.toLocalIterator():
        for term in re.split(r'_', x[1].lower()):
            terms.add(re.sub(r'[^a-zA-Z0-9]', '', term))

    end = time.time()
    print(f"End Time: {end}")
    print(f"Unique terms: {len(terms)}")
    print(f"Time: {end - start:.4f} seconds")


function_3_normal_loop(parsed)


Q3 Normal Loops
Start Time: 1747383156.4555755
End Time: 1747383184.9158494
Unique terms: 1688529
Time: 28.4603 seconds


# ***Function 4: Extract title counts***

In [None]:
# Function_4 - Spark Map Reduce
def function_4_map_reduce(parsed_rdd):
    print("\nQ4 Spark MapReduce")
    import time

    start = time.time()
    print(f"Start Time: {start}")

    title_counts = (
        parsed_rdd
        .map(lambda x: (x[1], 1))
        .reduceByKey(lambda a, b: a + b)
    )

    results = title_counts.take(5)

    end = time.time()
    print(f"End Time: {end}")
    print("First 5 title counts:")
    for title, count in results:
        print(f"{title}: {count}")
    print(f"Time: {end - start:.4f} seconds")

function_4_map_reduce(parsed)


Q4 Spark MapReduce
Start Time: 1747383185.0610638
End Time: 1747383205.7255986
First 5 title counts:
Indonesian_Wikipedia: 2
Special:MyLanguage/Meta:Index: 1
Special:WhatLinksHere/Main_Page: 8
Special:WhatLinksHere/MediaWiki:Edittools: 1
User:IlStudioso: 1
Time: 20.6645 seconds


In [None]:
# Function_4 - Spark Loops
def function_4_spark_loop(rdd):
    print("\nQ4 Spark foreach")
    start = time.time()
    print(f"Start Time: {start}")

    title_counts = rdd.map(lambda x: (x[1], 1)).collect()

    counts = {}
    for title, count in title_counts:
        counts[title] = counts.get(title, 0) + count

    end = time.time()
    print(f"End Time: {end}")
    print("First 5 title counts:")
    for i, (title, count) in enumerate(list(counts.items())[:5]):
        print(f"{title}: {count}")
    print(f"Time: {end - start:.4f} seconds")


function_4_spark_loop(parsed)


Q4 Spark foreach
Start Time: 1747383205.7440257
End Time: 1747383220.9278905
First 5 title counts:
271_a.C: 4
Category:User_th: 2
Chiron_Elias_Krase: 6
Dassault_rafaele: 3
E.Desv: 6
Time: 15.1839 seconds


In [None]:
# Function_4 - Normal Loops
def function_4_normal_loop(rdd):
    print("\nQ4 Normal Loops")
    start = time.time()
    print(f"Start Time: {start}")

    counts = {}
    for x in rdd.toLocalIterator():
        counts[x[1]] = counts.get(x[1], 0) + 1

    end = time.time()
    print(f"End Time: {end}")

    results = list(counts.items())[:5]
    end = time.time()
    print(f"End Time: {end}")
    print("First 5 title counts:")
    for title, count in results:
        print(f"{title}: {count}")
    print(f"Time: {end - start:.4f} seconds")


function_4_normal_loop(parsed)


Q4 Normal Loops
Start Time: 1747383221.7312584
End Time: 1747383238.4559064
End Time: 1747383238.9977074
First 5 title counts:
271_a.C: 4
Category:User_th: 2
Chiron_Elias_Krase: 6
Dassault_rafaele: 3
E.Desv: 6
Time: 17.2664 seconds


# ***Function 5: Combine pages with same title***

In [None]:
# Function_5 - Spark Map Reduce
def function_5_map_reduce(parsed_rdd):
    print("\nQ5 Spark MapReduce")
    import time

    start = time.time()
    print(f"Start Time: {start}")

    grouped_by_title = (
        parsed_rdd
        .map(lambda x: (x[1], [x]))
        .reduceByKey(lambda a, b: a + b)
        .filter(lambda x: len(x[1]) > 1)
    )

    results = grouped_by_title.take(5)

    end = time.time()
    print(f"End Time: {end}")
    print("First 5 titles with multiple pages:")
    for title, records in results:
        print(f"\nTitle: {title}")
        print("Pages:")
        for rec in records:
            print(f"  Project: {rec[0]}, Hits: {rec[2]}, Size: {rec[3]}")
    print(f"Time: {end - start:.4f} seconds")

function_5_map_reduce(parsed)



Q5 Spark MapReduce
Start Time: 1747383239.107877
End Time: 1747383267.8825572
First 5 titles with multiple pages:

Title: Indonesian_Wikipedia
Pages:
  Project: aa, Hits: 1, Size: 4679
  Project: en, Hits: 1, Size: 93905

Title: Special:WhatLinksHere/Main_Page
Pages:
  Project: aa, Hits: 1, Size: 5556
  Project: commons.m, Hits: 2, Size: 15231
  Project: en, Hits: 5, Size: 101406
  Project: en.s, Hits: 1, Size: 8597
  Project: en.voy, Hits: 1, Size: 8550
  Project: meta.m, Hits: 1, Size: 11529
  Project: outreach.m, Hits: 1, Size: 5698
  Project: simple, Hits: 3, Size: 32145

Title: User_talk:Logan
Pages:
  Project: aa, Hits: 1, Size: 4734
  Project: en.voy, Hits: 5, Size: 78175

Title: Special:UserLogin
Pages:
  Project: aa.d, Hits: 1, Size: 4899
  Project: commons.m, Hits: 30, Size: 181938
  Project: en, Hits: 44198, Size: 718770014
  Project: en.q, Hits: 4, Size: 34449
  Project: incubator.m, Hits: 1, Size: 5221
  Project: m.f, Hits: 13, Size: 58547
  Project: m.w, Hits: 3, Size: 1

In [None]:
# Function_5 - Spark Loops
def function_5_spark_loop(rdd):
    print("\nQ5 Spark foreach")
    start = time.time()
    print(f"Start Time: {start}")

    title_groups = rdd.map(lambda x: (x[1], x)).collect()

    grouped = {}
    for title, record in title_groups:
        if title not in grouped:
            grouped[title] = []
        grouped[title].append(record)

    multiple_pages = {k: v for k, v in grouped.items() if len(v) > 1}

    end = time.time()
    print(f"End Time: {end}")
    print("First 5 titles with multiple pages:")
    for i, (title, pages) in enumerate(list(multiple_pages.items())[:5]):
        print(f"\nTitle: {title}")
        print("Pages:")
        for page in pages:
            print(f"  Project: {page[0]}, Hits: {page[2]}, Size: {page[3]}")
    print(f"Time: {end - start:.4f} seconds")


function_5_spark_loop(parsed)


Q5 Spark foreach
Start Time: 1747383267.8959599
End Time: 1747383292.9264278
First 5 titles with multiple pages:

Title: 271_a.C
Pages:
  Project: aa, Hits: 1, Size: 4675
  Project: az, Hits: 1, Size: 6356
  Project: bcl, Hits: 1, Size: 5068
  Project: be, Hits: 1, Size: 6287

Title: Category:User_th
Pages:
  Project: aa, Hits: 1, Size: 4770
  Project: commons.m, Hits: 1, Size: 0

Title: Chiron_Elias_Krase
Pages:
  Project: aa, Hits: 1, Size: 4694
  Project: az, Hits: 1, Size: 6374
  Project: bg, Hits: 1, Size: 7468
  Project: cho, Hits: 1, Size: 4684
  Project: dz, Hits: 1, Size: 5435
  Project: it, Hits: 1, Size: 5929

Title: Dassault_rafaele
Pages:
  Project: aa, Hits: 2, Size: 9372
  Project: en, Hits: 1, Size: 6649
  Project: it, Hits: 1, Size: 5919

Title: E.Desv
Pages:
  Project: aa, Hits: 1, Size: 4662
  Project: arc, Hits: 1, Size: 5210
  Project: ast, Hits: 1, Size: 4825
  Project: fiu-vro, Hits: 1, Size: 5237
  Project: fr, Hits: 1, Size: 7057
  Project: ik, Hits: 1, Size: 

In [None]:
# Function_5 - Normal Loops
def function_5_normal_loop(rdd):
    print("\nQ5 Normal Loops")
    start = time.time()
    print(f"Start Time: {start}")

    data = rdd.toLocalIterator()
    grouped = {}
    for x in data:
        grouped.setdefault(x[1], []).append(x)
    results = [(k, v) for k, v in grouped.items() if len(v) > 1][:5]

    end = time.time()
    print(f"End Time: {end}")
    print("First 5 titles with multiple pages:")
    for title, pages in results:
        print(f"\nTitle: {title}")
        print("Pages:")
        for page in pages:
            print(f"  Project: {page[0]}, Hits: {page[2]}, Size: {page[3]}")
    print(f"Time: {end - start:.4f} seconds")


function_5_normal_loop(parsed)


Q5 Normal Loops
Start Time: 1747383294.4395084
End Time: 1747383313.0385537
First 5 titles with multiple pages:

Title: 271_a.C
Pages:
  Project: aa, Hits: 1, Size: 4675
  Project: az, Hits: 1, Size: 6356
  Project: bcl, Hits: 1, Size: 5068
  Project: be, Hits: 1, Size: 6287

Title: Category:User_th
Pages:
  Project: aa, Hits: 1, Size: 4770
  Project: commons.m, Hits: 1, Size: 0

Title: Chiron_Elias_Krase
Pages:
  Project: aa, Hits: 1, Size: 4694
  Project: az, Hits: 1, Size: 6374
  Project: bg, Hits: 1, Size: 7468
  Project: cho, Hits: 1, Size: 4684
  Project: dz, Hits: 1, Size: 5435
  Project: it, Hits: 1, Size: 5929

Title: Dassault_rafaele
Pages:
  Project: aa, Hits: 2, Size: 9372
  Project: en, Hits: 1, Size: 6649
  Project: it, Hits: 1, Size: 5919

Title: E.Desv
Pages:
  Project: aa, Hits: 1, Size: 4662
  Project: arc, Hits: 1, Size: 5210
  Project: ast, Hits: 1, Size: 4825
  Project: fiu-vro, Hits: 1, Size: 5237
  Project: fr, Hits: 1, Size: 7057
  Project: ik, Hits: 1, Size: 4

# ***End The Spark Session***

In [None]:
# Stop Spark session
sc.stop()