# ***Install the PySpark Package***

In [None]:
#pip install pyspark

# **Run this if you work on Google Colab**

In [None]:
# Google Colab only: mount Google Drive at /content/drive so files stored
# there (e.g. the input data file) can be read by path.
from google.colab import drive
drive.mount('/content/drive')

# ***Import Spark Modules***

In [59]:

from pyspark import SparkContext
import time
import re


# ***Initialize The Spark Session and Read the Data***

In [60]:
# Initialize the SparkContext.
# getOrCreate() reuses an already-running context, so this cell can be
# re-executed in a live kernel without raising
# "Cannot run multiple SparkContexts at once".
from pyspark import SparkConf

sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("WikimediaPageViews"))

# Google Colab: comment out the local path further below and uncomment this line instead.
# Example path: "/content/drive/MyDrive/Big Data Ass 2/pagecounts-20160101-000000_parsed.out"
# data_path = "put-your-content-drive-path-to-the-input-file-here/pagecounts-20160101-000000_parsed.out"

# Local run: the input file is expected in the current working directory.
data_path = "./pagecounts-20160101-000000_parsed.out"

# RDD of raw text lines, one element per line of the input file.
lines = sc.textFile(data_path)

In [61]:
# Load and parse the data
def parse_line(line):
    """Parse one space-separated page-count record.

    The first four fields of the line are read as
    ``<project> <title> <hits> <size>``; any further fields are ignored.

    Args:
        line: A raw text line from the input file.

    Returns:
        A ``(project, title, hits, size)`` tuple with ``hits`` and ``size``
        converted to ``int``, or ``None`` when the line has fewer than four
        fields or when hits/size is not a valid integer.
    """
    parts = line.strip().split(' ')
    if len(parts) < 4:
        return None
    project, title = parts[0], parts[1]
    try:
        hits, size = int(parts[2]), int(parts[3])
    except ValueError:
        # One corrupt record must not abort the whole Spark job; reject it
        # the same way as a line with too few fields.
        return None
    return (project, title, hits, size)

In [62]:
# Parse every raw line, then drop the lines that parse_line rejected
# (signalled by a None result).
parsed = (
    lines
    .map(parse_line)
    .filter(lambda record: record is not None)
)

# ***Function 1: Compute min, max, and average page size***

In [None]:
# Function_1 - Spark Map Reduce

def function_1(parsed_rdd):
    """Print the minimum, maximum and average page size of the records.

    All three statistics are computed in a single pass over the RDD with
    ``aggregate`` instead of one Spark action per statistic.

    Args:
        parsed_rdd: RDD of records whose element at index 3 is the page size
            (as produced by ``parse_line``).

    Returns:
        None. The results are printed. For an empty RDD the minimum and
        maximum are printed as ``None`` and the average as ``0.00``.
    """

    # Extract page sizes
    page_sizes = parsed_rdd.map(lambda x: x[3])  # x[3] is page size

    # Accumulator layout: (min, max, total, count); count == 0 means "empty".
    empty = (None, None, 0, 0)

    def add_size(acc, size):
        """Fold one page size into a partition-local accumulator."""
        lowest, highest, total, count = acc
        if count == 0:
            return (size, size, size, 1)
        return (min(lowest, size), max(highest, size), total + size, count + 1)

    def merge(left, right):
        """Combine the accumulators of two partitions."""
        # An empty partition carries None for min/max, so it is skipped
        # rather than compared.
        if left[3] == 0:
            return right
        if right[3] == 0:
            return left
        return (
            min(left[0], right[0]),
            max(left[1], right[1]),
            left[2] + right[2],
            left[3] + right[3],
        )

    min_size, max_size, total_size, count = page_sizes.aggregate(empty, add_size, merge)
    avg_size = total_size / count if count > 0 else 0

    print(f"Min Page Size: {min_size}")
    print(f"Max Page Size: {max_size}")
    print(f"Average Page Size: {avg_size:.2f}")


# running function 1
function_1(parsed)

Min Page Size: 0
Max Page Size: 141180155987
Average Page Size: 132239.57


In [None]:
# Function_1 - Spark Loops

In [None]:
# Function_1 - Normal Loops

# ***Function 2: Count page titles starting with "The" not in English project***

In [None]:
# Function_2 - Spark Map Reduce

def function_2(parsed_rdd):
    """Print how many titles start with "The" and how many of those are not in project "en".

    Args:
        parsed_rdd: RDD of records whose element at index 0 is the project
            code and whose element at index 1 is the page title.

    Returns:
        None. Both counts are printed.
    """

    # Keep the records whose title (x[1]) starts with "The" and reduce each
    # one to a flag: True when its project code (x[0]) is not exactly "en".
    not_english_flags = (
        parsed_rdd
        .filter(lambda x: x[1].startswith("The"))
        .map(lambda x: x[0] != "en")
    )

    # A single Spark action yields both numbers. Two separate count() calls
    # would each re-evaluate the whole lineage when the RDD is not cached.
    flag_counts = not_english_flags.countByValue()

    # Count how many non-English titles start with "The"
    count_non_english = flag_counts.get(True, 0)
    count_titles_with_the = count_non_english + flag_counts.get(False, 0)

    print(f"Total titles starting with 'The': {count_titles_with_the}")
    print(f"Titles starting with 'The' and NOT in English project: {count_non_english}")


# running function 2
function_2(parsed)


Total titles starting with 'The': 45020
Titles starting with 'The' NOT in English project: 10292


In [None]:
# Function_2 - Spark Loops

In [None]:
# Function_2 - Normal Loops

# ***Function 3: Count unique terms in page titles***

In [None]:
# Function_3 - Spark Map Reduce

In [None]:
# Function_3 - Spark Loops

In [None]:
# Function_3 - Normal Loops

# ***Function 4: Extract title counts***

In [None]:
# Function_4 - Spark Map Reduce

In [None]:
# Function_4 - Spark Loops

In [None]:
# Function_4 - Normal Loops

# ***Function 5: Combine pages with same title***

In [None]:
# Function_5 - Spark Map Reduce

In [None]:
# Function_5 - Spark Loops

In [None]:
# Function_5 - Normal Loops

# ***End The Spark Session***

In [57]:
# Stop the SparkContext (`sc`) now that all computations are done
sc.stop()