In [2]:
import pyspark

#Creamos una sesion de spark
from pyspark.sql import SparkSession

# Build the SparkSession used by every cell below; getOrCreate() returns the
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("pysparkdf")
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import desc, asc, count, max, when, mean, rank
from pyspark.sql.window import Window

In [7]:
# Load both CSV files, treating the first row of each file as column names.
# No schema is supplied or inferred, so every column is loaded as a string
# (see the printSchema() output of each DataFrame).
df21 = spark.read.csv("../datasets/world-happiness-report-2021.csv", header=True)
dfAll = spark.read.csv("../datasets/world-happiness-report.csv", header=True)

In [6]:
# Print the schema of the 2021 DataFrame (column name, type and nullability)
df21.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- Regional indicator: string (nullable = true)
 |-- Ladder score: string (nullable = true)
 |-- Standard error of ladder score: string (nullable = true)
 |-- upperwhisker: string (nullable = true)
 |-- lowerwhisker: string (nullable = true)
 |-- Logged GDP per capita: string (nullable = true)
 |-- Social support: string (nullable = true)
 |-- Healthy life expectancy: string (nullable = true)
 |-- Freedom to make life choices: string (nullable = true)
 |-- Generosity: string (nullable = true)
 |-- Perceptions of corruption: string (nullable = true)
 |-- Ladder score in Dystopia: string (nullable = true)
 |-- Explained by: Log GDP per capita: string (nullable = true)
 |-- Explained by: Social support: string (nullable = true)
 |-- Explained by: Healthy life expectancy: string (nullable = true)
 |-- Explained by: Freedom to make life choices: string (nullable = true)
 |-- Explained by: Generosity: string (nullable = true)
 |-- Explained 

1. ¿Cuál es el país más “feliz” del 2021 según la data? (Considerar que, en la columna “Ladder score”,
a mayor valor, más feliz es el país.)

In [90]:
# Question 1: happiest country of 2021 (highest "Ladder score").
#
# "Ladder score" is loaded as a string, so it is cast to double before sorting;
# ordering the raw strings would compare them lexicographically, not numerically.
ladder_score_21 = df21["Ladder score"].cast("double")

# Keep the result as a DataFrame: DataFrame.show() only prints and returns None,
# so assigning its return value would leave the variable unusable afterwards.
happiestCountry21 = (
    df21.orderBy(ladder_score_21.desc())
    .select("Country name", "Ladder score")
    .limit(1)
)
happiestCountry21.show()

+------------+------------+
|Country name|Ladder score|
+------------+------------+
|     Finland|       7.842|
+------------+------------+
only showing top 1 row



In [69]:
# Question 2: happiest country of 2021 within each region ("Regional indicator").

# "Ladder score" is loaded as a string; cast it so that max() and the comparison
# below are numeric rather than lexicographic.
ladder_score = df21["Ladder score"].cast("double")

# Highest happiness score per region.
max_scores = df21.groupBy("Regional indicator").agg(max(ladder_score).alias("max_score"))

# Join the per-region maximum back onto the original rows and keep only the
# countries whose score equals that maximum (ties within a region are all kept).
# Sorting happens before the final select because "max_score" is not projected.
top_countries = (
    df21.join(max_scores, ["Regional indicator"], "inner")
    .where(ladder_score == max_scores["max_score"])
    .orderBy(desc("max_score"))
    .select("Regional indicator", "Country name", "Ladder score")
)

# Show the result without truncating long region / country names.
top_countries.show(truncate=False)

+----------------------------------+------------------------+------------+
|Regional indicator                |Country name            |Ladder score|
+----------------------------------+------------------------+------------+
|Western Europe                    |Finland                 |7.842       |
|North America and ANZ             |New Zealand             |7.277       |
|Middle East and North Africa      |Israel                  |7.157       |
|Latin America and Caribbean       |Costa Rica              |7.069       |
|Central and Eastern Europe        |Czech Republic          |6.965       |
|East Asia                         |Taiwan Province of China|6.584       |
|Southeast Asia                    |Singapore               |6.377       |
|Commonwealth of Independent States|Uzbekistan              |6.179       |
|Sub-Saharan Africa                |Mauritius               |6.049       |
|South Asia                        |Nepal                   |5.269       |
+------------------------

In [8]:
# Print the schema of the multi-year DataFrame (column name, type and nullability)
dfAll.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- year: string (nullable = true)
 |-- Life Ladder: string (nullable = true)
 |-- Log GDP per capita: string (nullable = true)
 |-- Social support: string (nullable = true)
 |-- Healthy life expectancy at birth: string (nullable = true)
 |-- Freedom to make life choices: string (nullable = true)
 |-- Generosity: string (nullable = true)
 |-- Perceptions of corruption: string (nullable = true)
 |-- Positive affect: string (nullable = true)
 |-- Negative affect: string (nullable = true)



In [77]:
# Question 3: which country held first place the most times across all years?

# "Life Ladder" is loaded as a string; cast it so that max() and the equality
# check below operate on numbers instead of comparing strings lexicographically.
life_ladder = dfAll["Life Ladder"].cast("double")

# Highest "Life Ladder" value of each year.
max_of_year = dfAll.groupBy("year").agg(max(life_ladder).alias("max_score"))

# Country (or countries, in case of a tie) that reached that maximum each year.
top_of_year = (
    dfAll.join(max_of_year, ["year"], "inner")
    .where(life_ladder == max_of_year["max_score"])
    .select("year", "max_score", "Country name")
    .orderBy(desc("year"))
)

# Number of first places per country, most frequent winner first.
# The DataFrame is assigned before printing: show() returns None, so chaining it
# into the assignment would bind country_counts to None.
country_counts = (
    top_of_year.groupBy("Country name")
    .agg(count("Country name").alias("count"))
    .orderBy(desc("count"))
)
country_counts.show()

+------------+-----+
|Country name|count|
+------------+-----+
|     Denmark|    7|
|     Finland|    6|
|      Norway|    1|
| Switzerland|    1|
|      Canada|    1|
+------------+-----+



In [55]:
# Question 4: what happiness rank does the country with the highest GDP of 2020 hold?

# Rows of the multi-year report that belong to 2020.
df20 = dfAll.where(dfAll["year"] == "2020")

# Both measures are loaded as strings, so they are cast to double first.
# "Rank" is each country's position by descending "Life Ladder"; the rows are
# then sorted by descending GDP so that the first row shown is the top-GDP country.
(
    df20
    .withColumn("Log GDP per capita", df20["Log GDP per capita"].cast("double"))
    .withColumn("Life Ladder", df20["Life Ladder"].cast("double"))
    .withColumn("Rank", rank().over(Window.orderBy(desc("Life Ladder"))))
    .orderBy(desc("Log GDP per capita"))
    .select("Log GDP per capita", "Country name", "Life Ladder", "Rank")
    .show(1)
)



+------------------+------------+-----------+----+
|Log GDP per capita|Country name|Life Ladder|Rank|
+------------------+------------+-----------+----+
|            11.323|     Ireland|      7.035|  13|
+------------------+------------+-----------+----+
only showing top 1 row



In [83]:
# Question 5: by what percentage did the worldwide average GDP change between
# 2020 and 2021, and did it increase or decrease?
#
# NOTE(review): df20 and df21 come from different files and may not cover the
# same set of countries, which would affect the comparison - confirm.
# NOTE(review): both columns are named as *logged* GDP, so this is the variation
# of the mean log value - confirm that this is the intended metric.

# The GDP columns are loaded as strings, so they are cast explicitly instead of
# relying on an implicit conversion inside mean(). Null values are ignored by
# the aggregate, so no separate isNotNull() filter is needed.
gdp_20 = df20["Log GDP per capita"].cast("double")
gdp_21 = df21["Logged GDP per capita"].cast("double")

GDPmean20 = df20.agg(mean(gdp_20).alias("GDPmean20")).first()["GDPmean20"]
GDPmean21 = df21.agg(mean(gdp_21).alias("GDPmean21")).first()["GDPmean21"]

# Fail with a clear message instead of an opaque TypeError / ZeroDivisionError
# when one of the averages could not be computed or the baseline is zero.
if GDPmean20 is None or GDPmean21 is None:
    raise ValueError("No hay datos de GDP disponibles para 2020 o 2021.")
if GDPmean20 == 0:
    raise ValueError("El GDP promedio de 2020 es 0; no se puede calcular la variación porcentual.")

variation_percent = ((GDPmean21 - GDPmean20) / GDPmean20) * 100
rounded_variation_percent = abs(round(variation_percent, 2))

# A variation of exactly zero is neither an increase nor a decrease.
if variation_percent > 0:
    print(f"El GDP promedio aumentó un {rounded_variation_percent} % de 2020 a 2021.")
elif variation_percent < 0:
    print(f"El GDP promedio disminuyó un {rounded_variation_percent} % de 2020 a 2021.")
else:
    print("El GDP promedio no varió de 2020 a 2021.")

El GDP promedio disminuyó un 3.27 % de 2020 a 2021.


In [84]:
# Question 6: which country has the highest healthy life expectancy in the 2021
# data, and what was its value for that indicator in 2019?

# "Healthy life expectancy" is loaded as a string; cast it to double so the
# sort is numeric (a string sort would, for example, place "9.5" above "77.1").
life_expectancy_21 = df21["Healthy life expectancy"].cast("double")

# Name of the country in the first row after sorting by descending expectancy.
helthy_country = (
    df21.orderBy(life_expectancy_21.desc())
    .select("Country name")
    .first()[0]
)
print(helthy_country)

# Look the same country up in the multi-year data for 2019.
(
    dfAll
    .where(dfAll["Country name"] == helthy_country)
    .where(dfAll["year"] == "2019")
    .select("Healthy life expectancy at birth", "Life Ladder")
    .show()
)

Singapore
+--------------------------------+-----------+
|Healthy life expectancy at birth|Life Ladder|
+--------------------------------+-----------+
|                          77.100|      6.378|
+--------------------------------+-----------+

