In [127]:
import os
import re
import pandas as pd
import numpy as np

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import types

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window



In [128]:
# Spark configuration for this notebook. The key/value pairs are applied in a
# single pass with setAll(); the application name is set separately.
_spark_settings = [
    ("spark.executor.memory", "7g"),
    ("spark.executor.cores", "5"),
    ("spark.default.parallelism", 400),
    ("spark.sql.shuffle.partitions", 400),
    ("spark.dynamicAllocation.maxExecutors", 2),
]
conf = SparkConf().setAppName(u"[Julia] Examen Tecnologias").setAll(_spark_settings)


In [129]:
# Build the session from `conf` with Hive support enabled; getOrCreate()
# returns an already-running session if there is one.
_session_builder = SparkSession.builder.config(conf=conf).enableHiveSupport()
spark = _session_builder.getOrCreate()


In [130]:
# Read the museums CSV: first row is the header, fields are comma-separated,
# and column types are inferred by Spark. The DataFrame is marked for caching.
museos = spark.read.csv(
    '/datos/datasets_examen_1920/museos.csv',
    header=True,
    inferSchema=True,
    sep=",",
).cache()


In [17]:
# Preview: take the first five rows and render them as a pandas DataFrame.
museos.limit(5).toPandas()

Unnamed: 0,id,address,description,featurecount,fee,longitude,latitude,lengthofvisit,museumname,phonenum,rank,rating,reviewcount,totalthingstodo
0,2,"Museumstraat 1, 1071 XX Amsterdam, The Netherl...",The Rijksmuseum is the museum of the Netherlan...,12,Yes,4.885185,52.36001,More than 3 hours,Rijksmuseum,+31 20 674 7000,1,4.5,25042,451
1,6,"Museumplein 6, 1071 DJ Amsterdam, The Netherlands",Discover the world's largest collection of wor...,12,Yes,4.881579,52.358433,1-2 hours,Van Gogh Museum,+31 20 570 5200,3,4.5,33383,451
2,10,"Barer Strasse 27 | Eingang Theresienstrae, 803...",Housing much of the city's most famous artwork...,4,Yes,11.569983,48.14952,More than 3 hours,Alte Pinakothek,+49 0 89 23805-216,16,4.5,1715,293
3,14,"Bygdoeynesveien 39, Oslo 0286, Norway","The Fram Museum is located at Bygdy, a short d...",2,,10.697917,59.90297,1-2 hours,Fram Polar Ship Museum,+47 23 28 29 50,3,4.5,3332,386
4,18,"Cromwell Road | South Kensington, London SW7 2...",The world's greatest museum of art and design....,9,No,-0.182833,51.49508,2-3 hours,V&A - Victoria and Albert Museum,020 7942 2000,3,4.5,18030,1443


In [16]:
# Print the column names and the types Spark inferred for the museums data.
museos.printSchema()

root
 |-- id: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- description: string (nullable = true)
 |-- featurecount: string (nullable = true)
 |-- fee: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- lengthofvisit: string (nullable = true)
 |-- museumname: string (nullable = true)
 |-- phonenum: string (nullable = true)
 |-- rank: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- reviewcount: string (nullable = true)
 |-- totalthingstodo: string (nullable = true)



In [131]:
# Narrow the museums DataFrame to the identifier, coordinates and rating.
columnas_museos = ['id', 'longitude', 'latitude', 'rating']
museos = museos.select(*columnas_museos)

In [39]:
# Print the schema again to check the columns kept by the select.
museos.printSchema()

root
 |-- id: integer (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- rating: double (nullable = true)



In [29]:
# Preview the first five rows of the reduced museums DataFrame.
museos.limit(5).toPandas()

Unnamed: 0,id,longitude,latitude,rating
0,2,4.885185,52.36001,4.5
1,6,4.881579,52.358433,4.5
2,10,11.569983,48.14952,4.5
3,14,10.697917,59.90297,4.5
4,18,-0.182833,51.49508,4.5


In [40]:
# Number of rows currently in `museos`.
museos.count()

1013

In [132]:
# Keep only rows where every required column has a value.
#
# The previous predicate, "<col> is not null or <col> != ''", only ever behaved
# as a NULL check: for a non-NULL value the first operand is already true, and
# for a NULL value the comparison with '' evaluates to NULL, so the row is
# dropped. It also compared integer/double columns with '' through an implicit
# string-to-number cast. isNotNull() states the actual condition directly and
# keeps the same rows.
for _columna in ('id', 'longitude', 'latitude', 'rating'):
    museos = museos.filter(F.col(_columna).isNotNull())

In [86]:
# Number of rows currently in `museos`.
museos.count()

972

In [43]:
# Preview the first five rows as a pandas DataFrame.
museos.limit(5).toPandas()

Unnamed: 0,id,longitude,latitude,rating
0,2,4.885185,52.36001,4.5
1,6,4.881579,52.358433,4.5
2,10,11.569983,48.14952,4.5
3,14,10.697917,59.90297,4.5
4,18,-0.182833,51.49508,4.5


In [52]:
# Summary of the rating column: minimum, maximum and mean in a one-row DataFrame.
# DataFrame.show() returns None, so the aggregated DataFrame is stored first and
# displayed in a separate call; assigning the result of show() left
# `museos_analisis` bound to None.
museos_analisis = museos.agg(
    F.min('rating').alias("Minimo Rating"),
    F.max('rating').alias("Maximo Rating"),
    F.mean('rating').alias("Rating Medio"),
)
museos_analisis.show()

+-------------+-------------+-----------------+
|Minimo Rating|Maximo Rating|     Rating Medio|
+-------------+-------------+-----------------+
|          2.5|          5.0|4.440843621399177|
+-------------+-------------+-----------------+



In [133]:
# Read the locations CSV: first row is the header, fields are comma-separated,
# and column types are inferred by Spark. The DataFrame is marked for caching.
localizaciones = spark.read.csv(
    '/datos/datasets_examen_1920/localizaciones.csv',
    header=True,
    inferSchema=True,
    sep=",",
).cache()

In [54]:
# Preview the first five location rows as a pandas DataFrame.
localizaciones.limit(5).toPandas()

Unnamed: 0,id,housenumber,street,city,state,zip,country
0,26,House Number Not Found,Street Not Found,,"Washington, D.C.",20224,United States of America
1,150,555,Pennsylvania Avenue Northwest,,"Washington, D.C.",20001,United States of America
2,202,1391,D Street Northeast,,"Washington, D.C.",20002,United States of America
3,714,555,Pennsylvania Avenue Northwest,,"Washington, D.C.",20001,United States of America
4,742,House Number Not Found,Street Not Found,,"Washington, D.C.",20546,United States of America


In [55]:
# Print the column names and the types Spark inferred for the locations data.
localizaciones.printSchema()

root
 |-- id: integer (nullable = true)
 |-- housenumber: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- country: string (nullable = true)



In [58]:
# Count the location rows whose country is exactly 'Spain'.
localizaciones_esp = localizaciones.where(F.col("country") == "Spain")
count_esp = localizaciones_esp.count()

In [59]:
# Report the number of locations in Spain computed in `count_esp`.
mensaje_esp = "Localizaciones con el pais Espana :{}".format(count_esp)
print(mensaje_esp)

Localizaciones con el pais Espana :30


In [134]:
# Projection of the locations table: the join key plus city and country.
columnas_localizacion = ["id", "city", "country"]
localizaciones_join = localizaciones.select(columnas_localizacion)

In [135]:
# Attach city and country to each museum. A left join on "id" keeps every
# museum, including those with no matching location row.
museos = museos.join(localizaciones_join, on="id", how="left")

In [136]:
# Preview the first five rows of the joined DataFrame.
museos.limit(5).toPandas()

Unnamed: 0,id,longitude,latitude,rating,city,country
0,2,4.885185,52.36001,4.5,Amsterdam,The Netherlands
1,6,4.881579,52.358433,4.5,Amsterdam,The Netherlands
2,10,11.569983,48.14952,4.5,Munich,Germany
3,14,10.697917,59.90297,4.5,Oslo,Norway
4,18,-0.182833,51.49508,4.5,London,United Kingdom


In [137]:
# Number of rows in `museos` after the join.
museos.count()

972

In [99]:
# City with the largest number of museums, as a Row of (city, count).
#
# Rows without a city are excluded explicitly with isNotNull(). The original
# filter "city!='None'" dropped them only as a side effect of comparing NULL
# with a string; the comparison against the literal 'None' is kept so the same
# rows are excluded as before.
# A secondary sort on the city name makes the winner deterministic when two
# cities have the same count.
# first() returns None when no row passes the filter.
ciudad_informada = F.col("city").isNotNull() & (F.col("city") != "None")
mas_museos = (
    museos
    .filter(ciudad_informada)
    .groupBy("city")
    .count()
    .orderBy(F.desc("count"), F.asc("city"))
    .first()
)

In [138]:
# Report the top city: position 0 of the Row is the city, position 1 its count.
ciudad_top, total_museos = mas_museos[0], mas_museos[1]
print("La ciudad con mas museos es {} con {} museos".format(ciudad_top, total_museos))

La ciudad con mas museos es New York con 29 museos


In [140]:
# Mean rating per country, best first.
#
# Rows without a country are excluded explicitly with isNotNull(); the
# comparison against the literal 'None' is kept so the same rows are excluded
# as with the original "country!='None'" filter.
# The aggregated column holds the mean of `rating`, so it is labelled
# "Rating Medio" (it was mislabelled "Ranking Medio").
# Ties in the mean are broken by country name so the ordering is deterministic.
pais_informado = F.col("country").isNotNull() & (F.col("country") != "None")
mejor_ratings = (
    museos
    .filter(pais_informado)
    .groupBy("country")
    .agg(F.mean("rating").alias("Rating Medio"))
    .orderBy(F.desc("Rating Medio"), F.asc("country"))
)

In [143]:
# Heading for the table displayed by the next cell.
print("Los cinco paises con mejor rating medio son: ")

Los cinco paises con mejor rating medio son: 


In [142]:
# Show the first five rows of the sorted per-country ratings.
mejor_ratings.limit(5).toPandas()

Unnamed: 0,country,Ranking Medio
0,Israel,4.625
1,Mexico,4.6
2,Turkey,4.555556
3,United States of America,4.531532
4,Morocco,4.5


In [126]:
# Stop the Spark session now that the analysis is finished.
spark.stop()