In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Start (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("App")
    .getOrCreate()
)

# Only report errors from the Spark context so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

23/12/09 14:38:40 WARN Utils: Your hostname, user-HP-EliteBook-840-G7-Notebook-PC resolves to a loopback address: 127.0.1.1; using 192.168.1.141 instead (on interface wlp0s20f3)


In [4]:
# Exercise 3

# Read the space-delimited page-count dump as CSV.
rawPagecounts = spark.read.option("delimiter", " ").csv("pagecounts-20100806-030000")

# Give the positional columns (_c0.._c3) meaningful names, then cast the two
# numeric fields to long so they can be aggregated and compared.
inputDf = (
    rawPagecounts
    .withColumnRenamed("_c0", "project_name")
    .withColumnRenamed("_c1", "page_title")
    .withColumnRenamed("_c2", "num_requests")
    .withColumnRenamed("_c3", "content_size")
    .withColumn("num_requests", col("num_requests").cast("long"))
    .withColumn("content_size", col("content_size").cast("long"))
)

print("Project summary")

# One row per project: number of page titles, total content size and the
# mean number of requests per page.
# `count`, `sum` and `avg` are the Spark column functions brought in by the
# star import from pyspark.sql.functions (so `sum` is not Python's builtin).
pagesByProject = inputDf.groupBy("project_name")
projectSummary = pagesByProject.agg(
    count("page_title").alias("num_pages"),
    sum("content_size").alias("content_size"),
    avg("num_requests").alias("mean_requests"),
)
projectSummary.show()

print("Most visited")

# Attach each project's mean request count to every page row, then keep only
# the pages requested more often than their own project's mean.
projectMeans = projectSummary.select("project_name", "mean_requests")
aboveProjectMean = inputDf.join(projectMeans, on="project_name").where(
    col("num_requests") > col("mean_requests")
)

# NOTE: the rows are displayed unsorted; a descending sort on num_requests
# (.orderBy(col("num_requests").desc())) was left disabled here.
aboveProjectMean.show()
    


Project summary
+------------+---------+------------+------------------+
|project_name|num_pages|content_size|     mean_requests|
+------------+---------+------------+------------------+
|     cbk-zam|      174|     3976932|1.1781609195402298|
|        co.b|       13|       59223|1.1538461538461537|
|        cs.n|       95|      423117| 1.063157894736842|
|          en|  2245124|299984572954| 4.458697158820627|
|  be-x-old.d|        1|        6584|               1.0|
|          cr|       38|      729029|1.3421052631578947|
|        as.d|        7|       34520|1.4285714285714286|
|        dv.n|        1|        6584|               1.0|
|        cr.d|        1|       13604|               1.0|
|         crh|      195|     3587699|1.1384615384615384|
|       ast.q|        4|       37021|               1.0|
|       als.n|        1|        6584|               1.0|
|       ang.q|        9|       47678|               1.0|
|        am.d|       14|      119050|               1.0|
|        af.q| 

In [5]:
# Exercise 4

# Column names, in the same order as the fields of each tuple in input_data.
columns = [
    "id",
    "name",
    "surname",
    "age",
    "country",
    "local_phone",
]

# Sample records. The country field mixes two- and three-letter codes
# (for example both "VEN" and "VE" occur).
input_data = [
    (1, "Simón", "Bolivar", 47, "VEN", "489 895 965"),
    (2, "Fidel", "Castro", 90, "CU", "956 268 348"),
    (3, "Jose", "Doroteo", 45, "MEX", "985 621 444"),
    (4, "Ernesto", "Guevara", 39, "AR", "895 325 481"),
    (5, "Hugo", "Chávez", 58, "VE", "489 895 965"),
    (6, "Camilo", "Cienfuegos", 27, "CUB", "956 268 348"),
    (7, "Emiliano", "Zapata", 39, "ME", "985 621 444"),
    (8, "Juan Domingo", "Perón", 78, "ARG", "985 621 444"),
]


from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Build the DataFrame from the tuples, naming the columns at creation time.
df = spark.createDataFrame(input_data, schema=columns)
df.show()

# Dialling prefix per country. Each key lists every spelling of the country
# code that should map to the prefix, separated by whitespace.
states = {"VEN VE": "+58", "CU CUB": "+53", "ME MEX": "+52", "AR ARG": "+54"}

# Flattened lookup with one entry per individual code,
# e.g. {"VEN": "+58", "VE": "+58", "CU": "+53", ...}.
transformed_states = {
    code: prefix
    for aliases, prefix in states.items()
    for code in aliases.split()
}


def country_convert_to_udf(code):
    """Return the dialling prefix for a country code.

    Args:
        code: A country code as listed in ``states`` (e.g. "VEN" or "VE").
            May be None.

    Returns:
        The prefix string (e.g. "+58"), or None when ``code`` is None or is
        not present in ``transformed_states``.
    """
    # Use .get() rather than indexing: a missing or unlisted code then yields
    # None instead of raising KeyError. When this function is wrapped as a
    # Spark UDF, an uncaught KeyError would fail the whole job, whereas a
    # None return value becomes a null in the result column.
    return transformed_states.get(code)

# Wrap the plain-Python lookup as a Spark UDF that returns strings.
country_convert = udf(country_convert_to_udf, returnType=StringType())

# Add a phone_code column computed from each row's country value.
phone_code_column = country_convert(col("country"))
mod_df = df.withColumn("phone_code", phone_code_column)
mod_df.show()

+---+------------+----------+---+-------+-----------+
| id|        name|   surname|age|country|local_phone|
+---+------------+----------+---+-------+-----------+
|  1|       Simón|   Bolivar| 47|    VEN|489 895 965|
|  2|       Fidel|    Castro| 90|     CU|956 268 348|
|  3|        Jose|   Doroteo| 45|    MEX|985 621 444|
|  4|     Ernesto|   Guevara| 39|     AR|895 325 481|
|  5|        Hugo|    Chávez| 58|     VE|489 895 965|
|  6|      Camilo|Cienfuegos| 27|    CUB|956 268 348|
|  7|    Emiliano|    Zapata| 39|     ME|985 621 444|
|  8|Juan Domingo|     Perón| 78|    ARG|985 621 444|
+---+------------+----------+---+-------+-----------+

+---+------------+----------+---+-------+-----------+----------+
| id|        name|   surname|age|country|local_phone|phone_code|
+---+------------+----------+---+-------+-----------+----------+
|  1|       Simón|   Bolivar| 47|    VEN|489 895 965|       +58|
|  2|       Fidel|    Castro| 90|     CU|956 268 348|       +53|
|  3|        Jose|   Dorot