In [4]:
# montar Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
!apt-get update
!apt-get install -y openjdk-11-jdk-headless

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Connecting to archive.ubuntu.com] [Connected to cloud.r-project.org (13.225                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com] [Connected to r2u.stat.illinois.edu (192.                                                                               Hit:3 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connected to r2u.stat.ill                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to ppa.launchpadconte                                                                               Hit:5 http://archive.ubuntu.com/ubuntu jam

In [2]:
!curl -L -o spark.tgz https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar -xzf spark.tgz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  381M  100  381M    0     0   512k      0  0:12:42  0:12:42 --:--:--  545k


In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PATH"] += ":/content/spark-3.5.0-bin-hadoop3/bin"

In [6]:
!pip install pyspark



In [7]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("COVID_Colab") \
    .getOrCreate()

spark

In [8]:
path_drive = "/content/drive/MyDrive/data/covid19/Casos_positivos_de_COVID-19_en_Colombia-100K.csv"

df = spark.read.option("header", True).option("inferSchema", True).csv(path_drive)
df.printSchema()
df.show(5, False)


root
 |-- fecha reporte web: string (nullable = true)
 |-- ID de caso: integer (nullable = true)
 |-- Fecha de notificación: string (nullable = true)
 |-- Código DIVIPOLA departamento: integer (nullable = true)
 |-- Nombre departamento: string (nullable = true)
 |-- Código DIVIPOLA municipio: integer (nullable = true)
 |-- Nombre municipio: string (nullable = true)
 |-- Edad: integer (nullable = true)
 |-- Unidad de medida de edad: integer (nullable = true)
 |-- Sexo: string (nullable = true)
 |-- Tipo de contagio: string (nullable = true)
 |-- Ubicación del caso: string (nullable = true)
 |-- Estado: string (nullable = true)
 |-- Código ISO del país: integer (nullable = true)
 |-- Nombre del país: string (nullable = true)
 |-- Recuperado: string (nullable = true)
 |-- Fecha de inicio de síntomas: string (nullable = true)
 |-- Fecha de muerte: string (nullable = true)
 |-- Fecha de diagnóstico: string (nullable = true)
 |-- Fecha de recuperación: string (nullable = true)
 |-- Tipo de r

In [9]:
from pyspark.sql.functions import col

# renombrar espacios / mayúsculas a minúsculas:
for c in df.columns:
    df = df.withColumnRenamed(c, c.strip().lower().replace(" ", "_"))

df.printSchema()


root
 |-- fecha_reporte_web: string (nullable = true)
 |-- id_de_caso: integer (nullable = true)
 |-- fecha_de_notificación: string (nullable = true)
 |-- código_divipola_departamento: integer (nullable = true)
 |-- nombre_departamento: string (nullable = true)
 |-- código_divipola_municipio: integer (nullable = true)
 |-- nombre_municipio: string (nullable = true)
 |-- edad: integer (nullable = true)
 |-- unidad_de_medida_de_edad: integer (nullable = true)
 |-- sexo: string (nullable = true)
 |-- tipo_de_contagio: string (nullable = true)
 |-- ubicación_del_caso: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- código_iso_del_país: integer (nullable = true)
 |-- nombre_del_país: string (nullable = true)
 |-- recuperado: string (nullable = true)
 |-- fecha_de_inicio_de_síntomas: string (nullable = true)
 |-- fecha_de_muerte: string (nullable = true)
 |-- fecha_de_diagnóstico: string (nullable = true)
 |-- fecha_de_recuperación: string (nullable = true)
 |-- tipo_de_r

In [10]:
from pyspark.sql.functions import to_date, col, when

# transformar campo fecha si existe 'fecha_diagnostico' o 'fecha'
# intenta varios nombres comunes
date_cols = [c for c in df.columns if 'fecha' in c]
date_cols

df = df.withColumn("fecha_evento", to_date(col(date_cols[0]), "yyyy-MM-dd"))
# crear columna binaria recommended_flag como ejemplo general
df = df.withColumn("is_severe", when(col("estado") == "Fallecido", 1).otherwise(0))


In [11]:
# casos en 2020
df_2020 = df.filter(col("fecha_evento").between("2020-01-01","2020-12-31"))

# casos en hombres adultos (>40)
df_hombres_adultos = df.filter((col("sexo") == "M") & (col("edad") > 40))


In [12]:
df.createOrReplaceTempView("covid")


In [19]:
# Los 10 dciudades con mas casos:
top_ciudades = df.groupBy("nombre_municipio").count().orderBy(col("count").desc()).limit(10)
top_ciudades.show(10,truncate=False)

+----------------+-----+
|nombre_municipio|count|
+----------------+-----+
|BOGOTA          |30016|
|BARRANQUILLA    |13065|
|CARTAGENA       |8333 |
|CALI            |7747 |
|SOLEDAD         |6233 |
|LETICIA         |2194 |
|MEDELLIN        |2137 |
|TUMACO          |1501 |
|BUENAVENTURA    |1453 |
|QUIBDO          |1367 |
+----------------+-----+



In [16]:
top_depart = df.groupBy("nombre_departamento").count().orderBy(col("count").desc()).limit(10)
top_depart.show(10,truncate=False)

+-------------------+-----+
|nombre_departamento|count|
+-------------------+-----+
|BOGOTA             |30016|
|BARRANQUILLA       |13065|
|ATLANTICO          |10994|
|VALLE              |10404|
|CARTAGENA          |8333 |
|ANTIOQUIA          |4554 |
|NARIÑO             |3520 |
|CUNDINAMARCA       |2827 |
|AMAZONAS           |2317 |
|CHOCO              |1636 |
+-------------------+-----+



In [21]:
# Los 10 días con mas casos:
top_dias = df.groupBy("fecha_evento").count().orderBy(col("count").desc()).limit(10)
top_dias.show(10,False)


+------------+------+
|fecha_evento|count |
+------------+------+
|NULL        |100000|
+------------+------+



In [22]:
# Distribución de casos por edades
from pyspark.sql.functions import when

df_age = df.withColumn("age_group",
    when(col("edad") < 10, "<10")
    .when(col("edad").between(10,19), "10-19")
    .when(col("edad").between(20,29), "20-29")
    .when(col("edad").between(30,39), "30-39")
    .when(col("edad").between(40,49), "40-49")
    .when(col("edad").between(50,59), "50-59")
    .when(col("edad") >= 60, "60+")
    .otherwise("unknown")
)

dist_ages = df_age.groupBy("age_group").count().orderBy("age_group")
dist_ages.show(truncate=False)


+---------+-----+
|age_group|count|
+---------+-----+
|10-19    |7341 |
|20-29    |21937|
|30-39    |23029|
|40-49    |15828|
|50-59    |12857|
|60+      |15035|
|<10      |3973 |
+---------+-----+



In [24]:
# Mi pregunta: Casos por sexo y departamento
cases_sex_dept = df.groupBy("nombre_departamento","sexo").count().orderBy("nombre_departamento","sexo")
cases_sex_dept.show(20,False)

+-------------------+----+-----+
|nombre_departamento|sexo|count|
+-------------------+----+-----+
|AMAZONAS           |F   |894  |
|AMAZONAS           |M   |1423 |
|ANTIOQUIA          |F   |1712 |
|ANTIOQUIA          |M   |2842 |
|ARAUCA             |F   |9    |
|ARAUCA             |M   |70   |
|ATLANTICO          |F   |5005 |
|ATLANTICO          |M   |5989 |
|BARRANQUILLA       |F   |5897 |
|BARRANQUILLA       |M   |7168 |
|BOGOTA             |F   |15052|
|BOGOTA             |M   |14964|
|BOLIVAR            |F   |462  |
|BOLIVAR            |M   |536  |
|BOYACA             |F   |183  |
|BOYACA             |M   |195  |
|CALDAS             |F   |154  |
|CALDAS             |M   |108  |
|CAQUETA            |F   |14   |
|CAQUETA            |M   |25   |
+-------------------+----+-----+
only showing top 20 rows



In [26]:
drive_out = "/content/drive/MyDrive/data/covid19/covid-output/"

top_depart.coalesce(1).write.mode("overwrite").option("header", True).csv(drive_out + "top_departamentos_csv")
top_depart.write.mode("overwrite").parquet(drive_out + "top_departamentos_parquet")

top_ciudades.coalesce(1).write.mode("overwrite").option("header", True).csv(drive_out + "top_ciudades_csv")
top_ciudades.write.mode("overwrite").parquet(drive_out + "top_ciudades_parquet")

top_dias.coalesce(1).write.mode("overwrite").option("header", True).csv(drive_out + "top_dias_csv")
top_dias.write.mode("overwrite").parquet(drive_out + "top_dias_parquet")

dist_ages.coalesce(1).write.mode("overwrite").option("header", True).csv(drive_out + "dist_ages_csv")
dist_ages.write.mode("overwrite").parquet(drive_out + "dist_ages_parquet")

cases_sex_dept.coalesce(1).write.mode("overwrite").option("header", True).csv(drive_out + "cases_sex_dept_csv")
cases_sex_dept.write.mode("overwrite").parquet(drive_out + "cases_sex_dept_parquet")