In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create a SparkSession
spark = SparkSession.builder.appName("App").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

23/12/10 13:48:32 WARN Utils: Your hostname, user-HP-EliteBook-840-G7-Notebook-PC resolves to a loopback address: 127.0.1.1; using 192.168.1.141 instead (on interface wlp0s20f3)
23/12/10 13:48:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/10 13:48:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/12/10 13:48:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/12/10 13:48:34 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/12/10 13:48:34 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/12/10 13:48:34 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


In [2]:
# Exercise 1

inputDf = spark.read.format("csv").option("delimiter", " ").load("pagecounts-20100806-030000")

inputDf = inputDf \
    .withColumnRenamed("_c0", "project_name") \
    .withColumnRenamed("_c1", "page_title") \
    .withColumnRenamed("_c2", "num_requests") \
    .withColumnRenamed("_c3", "content_size") \
    .withColumn("num_requests", col("num_requests").cast("long")) \
    .withColumn("content_size", col("content_size").cast("long"))

inputDf.printSchema()
inputDf.show(15, truncate=False)

root
 |-- project_name: string (nullable = true)
 |-- page_title: string (nullable = true)
 |-- num_requests: long (nullable = true)
 |-- content_size: long (nullable = true)

+------------+-----------------------------------------------+------------+------------+
|project_name|page_title                                     |num_requests|content_size|
+------------+-----------------------------------------------+------------+------------+
|aa.b        |Main_Page                                      |1           |4881        |
|aa.b        |Special:Contributions/Beau                     |1           |4938        |
|aa.b        |Special:WhatLinksHere/MediaWiki:Makesysopsubmit|1           |4550        |
|aa.b        |User:Ahoerstemeier                             |1           |4388        |
|aa.b        |User:Monobi                                    |1           |5500        |
|aa.d        |Special:Contributions/Les_Meloures             |1           |4949        |
|aa          |%D0%90%D1

In [3]:
# Exercise 2

print("Total number of elements:", inputDf.count())

print("Complete list of project names")
projectList = inputDf.select("project_name").distinct().rdd.map(lambda r: r[0]).collect()
print(projectList)

inputDf.filter(col("project_name") == "en") \
    .groupBy("project_name") \
    .agg(sum("content_size").alias("total_size")) \
    .show(truncate=False)

inputDf.filter(col("project_name") == "en") \
    .orderBy(col("num_requests").desc()) \
    .show(5, truncate=False)


                                                                                

Total number of elements: 4729148
Complete list of project names


                                                                                

['aa', 'aa.b', 'aa.d', 'ab', 'ab.mw', 'ace', 'af', 'af.b', 'af.d', 'af.mw', 'af.q', 'af.v', 'ak', 'ak.b', 'als', 'als.b', 'als.d', 'als.mw', 'als.n', 'als.q', 'am', 'am.d', 'am.q', 'an', 'an.d', 'an.mw', 'ang', 'ang.b', 'ang.d', 'ang.mw', 'ang.q', 'ar', 'ar.b', 'ar.d', 'ar.mw', 'ar.n', 'ar.q', 'ar.s', 'ar.v', 'arc', 'arz', 'arz.d', 'arz.s', 'as', 'as.b', 'as.d', 'ast', 'ast.b', 'ast.d', 'ast.mw', 'ast.q', 'av', 'av.d', 'ay', 'ay.b', 'ay.d', 'az', 'az.b', 'az.d', 'az.mw', 'az.q', 'az.s', 'ba', 'ba.b', 'ba.mw', 'bar', 'bar.d', 'bat-smg', 'bat-smg.d', 'bcl', 'bcl.d', 'be', 'be-x-old', 'be-x-old.d', 'be.b', 'be.d', 'be.mw', 'be.q', 'beta.v', 'bg', 'bg.b', 'bg.d', 'bg.mw', 'bg.n', 'bg.q', 'bg.s', 'bg.v', 'bh', 'bi', 'bi.b', 'bi.d', 'bm', 'bm.b', 'bm.d', 'bm.q', 'bn', 'bn.b', 'bn.d', 'bn.mw', 'bn.q', 'bn.s', 'bo', 'bo.b', 'bo.d', 'bpy', 'br', 'br.d', 'br.mw', 'br.q', 'bs', 'bs.b', 'bs.d', 'bs.mw', 'bs.n', 'bs.q', 'bs.s', 'bug', 'bug.d', 'bxr', 'ca', 'ca.b', 'ca.d', 'ca.mw', 'ca.n', 'ca.q', '

                                                                                

+------------+------------+
|project_name|total_size  |
+------------+------------+
|en          |299984572954|
+------------+------------+



                                                                                

+------------+------------------------+------------+------------+
|project_name|page_title              |num_requests|content_size|
+------------+------------------------+------------+------------+
|en          |Special:Random          |405305      |218224631   |
|en          |Special:Search          |222902      |561104989   |
|en          |Main_Page               |222302      |5025224410  |
|en          |404_error               |42051       |135095134   |
|en          |Special:Export/Wikipedia|32765       |107400209   |
+------------+------------------------+------------+------------+
only showing top 5 rows



In [4]:
# Exercise 2 with pure SQL

inputDf.createOrReplaceTempView("myFakeTable")
n_elmnts = spark.sql("select count(*) from myFakeTable").first()
print("Total number of elements:", n_elmnts[0])

print("Complete list of project names")
list = spark.sql("select distinct project_name from myFakeTable").collect()
# Extract project names
project_names = [row.project_name for row in list]
# Print
print(project_names)


spark.sql("select sum(content_size) from myFakeTable where project_name = 'en'").show()
spark.sql("select * from myFakeTable where project_name = 'en' order by num_requests desc limit 5").show()


                                                                                

Total number of elements: 4729148
Complete list of project names


                                                                                

['cbk-zam', 'co.b', 'cs.n', 'en', 'be-x-old.d', 'cr', 'as.d', 'dv.n', 'cr.d', 'crh', 'ast.q', 'als.n', 'ang.q', 'am.d', 'af.q', 'cy.d', 'an.mw', 'be.mw', 'cs.d', 'cy.s', 'bug.d', 'chr', 'cv', 'ba', 'en.v', 'ang', 'de.b', 'dv.d', 'av.d', 'bn.q', 'dz.d', 'bcl', 'bg.v', 'ceb', 'chr.d', 'arz.d', 'az.d', 'cs.q', 'cs.s', 'en.s', 'am.q', 'ay', 'bg.s', 'arz.s', 'ast.mw', 'bn.b', 'el.s', 'ce.d', 'arc', 'br.mw', 'da.s', 'bg.n', 'en.n', 'als', 'ast.d', 'da.d', 'da.q', 'en.mw', 'bug', 'cs.mw', 'cdo.d', 'bar.d', 'be', 'br.q', 'de.s', 'cs', 'bat-smg', 'bs.n', 'ar.d', 'ar.v', 'ckb', 'bi.b', 'bar', 'cy.b', 'bg.mw', 'co.q', 'ar.q', 'co', 'ast.b', 'az.mw', 'de', 'br', 'da.b', 'ab', 'ch.d', 'diq.d', 'bo.d', 'ar.s', 'ee.d', 'br.d', 'cho', 'cv.b', 'el.mw', 'el', 'bs.b', 'af', 'de.d', 'en.d', 'bo.b', 'an.d', 'ar', 'als.mw', 'ce', 'cv.d', 'af.v', 'cv.mw', 'bn.d', 'bo', 'bs.q', 'bn', 'csb.d', 'de.n', 'dv', 'ace', 'aa.b', 'bm', 'ca', 'ang.b', 'de.mw', 'ast', 'bg.d', 'ca.n', 'be.d', 'bg.q', 'ee', 'ay.d', 'bn.s'

                                                                                

+-----------------+
|sum(content_size)|
+-----------------+
|     299984572954|
+-----------------+





+------------+--------------------+------------+------------+
|project_name|          page_title|num_requests|content_size|
+------------+--------------------+------------+------------+
|          en|      Special:Random|      405305|   218224631|
|          en|      Special:Search|      222902|   561104989|
|          en|           Main_Page|      222302|  5025224410|
|          en|           404_error|       42051|   135095134|
|          en|Special:Export/Wi...|       32765|   107400209|
+------------+--------------------+------------+------------+



                                                                                