In [5]:
import pandas as pd
import requests
import os

In [7]:
api_url = "https://www.datos.gov.co/resource/nudc-7mev.json?$limit=50000"
print(f"📥 Extrayendo datos desde: {api_url}")

try:
    response = requests.get(api_url)
    response.raise_for_status()  # Lanza un error si la petición falla (ej: 404)
    data = response.json()
    df_raw = pd.DataFrame(data)
    print(f"✅ ¡Extracción exitosa! Se cargaron {len(df_raw)} filas.")
    display(df_raw.head())

except requests.exceptions.RequestException as e:
    print(f"❌ Error al extraer los datos: {e}")
    df_raw = pd.DataFrame() # Creamos un dataframe vacío para evitar errores posteriores

except Exception as e:
    print(f"❌ Ocurrió un error inesperado: {e}")
    df_raw = pd.DataFrame()

📥 Extrayendo datos desde: https://www.datos.gov.co/resource/nudc-7mev.json?$limit=50000
✅ ¡Extracción exitosa! Se cargaron 14585 filas.


Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
0,2023,5004,Abriaquí,5,Antioquia,3758,Antioquia (ETC),503,62.62,62.62,...,1.96,16.51,2.04,9.52,0.0,10.46,13.76,2.04,,
1,2023,95025,El Retorno,95,Guaviare,3830,Guaviare (ETC),4438,53.27,53.27,...,7.11,9.39,1.75,9.34,6.95,11.84,8.48,3.16,,
2,2023,95200,Miraflores,95,Guaviare,3830,Guaviare (ETC),2014,32.52,32.52,...,6.93,14.13,7.81,8.65,6.67,9.04,10.25,1.54,,
3,2023,97001,Mitú,97,Vaupés,3831,Vaupés (ETC),10986,59.57,59.57,...,4.04,8.33,4.6,16.18,7.75,21.04,13.84,7.18,,
4,2023,97161,Caruru,97,Vaupés,3831,Vaupés (ETC),1228,51.3,51.3,...,7.32,15.28,7.27,9.24,2.86,7.62,14.85,3.64,,


## Aprendiendo Pandas

In [8]:
df_raw.info() ## ver información general de la base

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   a_o                          14585 non-null  object
 1   c_digo_municipio             14585 non-null  object
 2   municipio                    14585 non-null  object
 3   c_digo_departamento          14585 non-null  object
 4   departamento                 14585 non-null  object
 5   c_digo_etc                   14585 non-null  object
 6   etc                          14585 non-null  object
 7   poblaci_n_5_16               14579 non-null  object
 8   tasa_matriculaci_n_5_16      14470 non-null  object
 9   cobertura_neta               14474 non-null  object
 10  cobertura_neta_transici_n    14533 non-null  object
 11  cobertura_neta_primaria      14494 non-null  object
 12  cobertura_neta_secundaria    14491 non-null  object
 13  cobertura_neta_media         14

Generemos una "muestra" de conjunto de datos
1. Tomando los primeros registros del conjunto de datos.

In [9]:
df_raw.head(15) ## metodo

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
0,2023,5004,Abriaquí,5,Antioquia,3758,Antioquia (ETC),503,62.62,62.62,...,1.96,16.51,2.04,9.52,0.0,10.46,13.76,2.04,,
1,2023,95025,El Retorno,95,Guaviare,3830,Guaviare (ETC),4438,53.27,53.27,...,7.11,9.39,1.75,9.34,6.95,11.84,8.48,3.16,,
2,2023,95200,Miraflores,95,Guaviare,3830,Guaviare (ETC),2014,32.52,32.52,...,6.93,14.13,7.81,8.65,6.67,9.04,10.25,1.54,,
3,2023,97001,Mitú,97,Vaupés,3831,Vaupés (ETC),10986,59.57,59.57,...,4.04,8.33,4.6,16.18,7.75,21.04,13.84,7.18,,
4,2023,97161,Caruru,97,Vaupés,3831,Vaupés (ETC),1228,51.3,51.3,...,7.32,15.28,7.27,9.24,2.86,7.62,14.85,3.64,,
5,2023,97511,Pacoa,97,Vaupés,3831,Vaupés (ETC),1678,54.29,54.29,...,0.0,0.0,0.0,23.21,25.86,30.3,14.25,6.59,,
6,2023,97666,Taraira,97,Vaupés,3831,Vaupés (ETC),814,22.6,22.6,...,0.0,0.0,0.0,21.58,13.33,22.29,0.0,0.0,,
7,2023,97777,Papunaua,97,Vaupés,3831,Vaupés (ETC),209,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
8,2023,97889,Yavaraté,97,Vaupés,3831,Vaupés (ETC),561,47.95,47.95,...,0.0,0.0,0.0,12.33,3.13,19.5,3.95,4.0,,
9,2023,99001,Puerto Carreño,99,Vichada,3832,Vichada (ETC),4641,144.52,144.52,...,12.96,18.84,8.81,11.18,0.82,13.25,10.45,6.39,,


2. Ahora, usamos los índices para mostrar una fracción del conjunto de datos. 

In [10]:
df_raw.iloc[20:30] # (ver de 20 al 30) Pensemos en estos indices como un intervalo semiabierto [20,30)

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
20,2023,94001,Inírida,94,Guainía,3829,Guainía (ETC),9673,77.42,77.33,...,13.35,15.6,7.33,16.19,8.47,18.36,17.25,6.11,,
21,2023,91798,Tarapacá,91,Amazonas,3828,Amazonas (ETC),1327,50.04,50.04,...,1.26,24.2,63.16,26.09,1.28,28.87,29.3,31.58,,
22,2023,91669,Puerto Santander,91,Amazonas,3828,Amazonas (ETC),618,63.75,63.75,...,3.09,7.87,0.0,16.67,1.85,19.69,16.85,16.67,,
23,2023,91540,Puerto Nariño,91,Amazonas,3828,Amazonas (ETC),3232,55.07,55.07,...,13.51,15.47,5.32,15.37,5.85,18.61,16.75,3.72,,
24,2023,91536,Puerto Arica,91,Amazonas,3828,Amazonas (ETC),274,83.94,83.94,...,0.67,0.0,0.0,11.49,0.0,12.75,10.45,17.39,,
25,2023,91530,Puerto Alegría,91,Amazonas,3828,Amazonas (ETC),211,65.4,65.4,...,9.38,6.25,0.0,18.62,29.41,19.79,9.38,0.0,,
26,2023,91460,Miriti - Paraná,91,Amazonas,3828,Amazonas (ETC),571,49.39,49.39,...,0.0,0.0,0.0,12.5,17.65,11.05,18.0,2.63,,
27,2023,91430,La Victoria,91,Amazonas,3828,Amazonas (ETC),169,14.2,14.2,...,0.0,0.0,0.0,16.67,20.0,15.79,0.0,0.0,,
28,2023,91407,La Pedrera,91,Amazonas,3828,Amazonas (ETC),1203,67.17,67.17,...,0.6,5.73,4.62,12.97,2.63,20.24,4.85,3.08,,
29,2023,91405,La Chorrera,91,Amazonas,3828,Amazonas (ETC),1197,50.04,50.04,...,11.62,8.64,14.0,26.42,5.56,24.77,37.27,12.0,,


In [11]:
df_raw.iloc[20:30] ## Acá si tuvo encuenta el 30, intervalo cerrado [20,30]

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
20,2023,94001,Inírida,94,Guainía,3829,Guainía (ETC),9673,77.42,77.33,...,13.35,15.6,7.33,16.19,8.47,18.36,17.25,6.11,,
21,2023,91798,Tarapacá,91,Amazonas,3828,Amazonas (ETC),1327,50.04,50.04,...,1.26,24.2,63.16,26.09,1.28,28.87,29.3,31.58,,
22,2023,91669,Puerto Santander,91,Amazonas,3828,Amazonas (ETC),618,63.75,63.75,...,3.09,7.87,0.0,16.67,1.85,19.69,16.85,16.67,,
23,2023,91540,Puerto Nariño,91,Amazonas,3828,Amazonas (ETC),3232,55.07,55.07,...,13.51,15.47,5.32,15.37,5.85,18.61,16.75,3.72,,
24,2023,91536,Puerto Arica,91,Amazonas,3828,Amazonas (ETC),274,83.94,83.94,...,0.67,0.0,0.0,11.49,0.0,12.75,10.45,17.39,,
25,2023,91530,Puerto Alegría,91,Amazonas,3828,Amazonas (ETC),211,65.4,65.4,...,9.38,6.25,0.0,18.62,29.41,19.79,9.38,0.0,,
26,2023,91460,Miriti - Paraná,91,Amazonas,3828,Amazonas (ETC),571,49.39,49.39,...,0.0,0.0,0.0,12.5,17.65,11.05,18.0,2.63,,
27,2023,91430,La Victoria,91,Amazonas,3828,Amazonas (ETC),169,14.2,14.2,...,0.0,0.0,0.0,16.67,20.0,15.79,0.0,0.0,,
28,2023,91407,La Pedrera,91,Amazonas,3828,Amazonas (ETC),1203,67.17,67.17,...,0.6,5.73,4.62,12.97,2.63,20.24,4.85,3.08,,
29,2023,91405,La Chorrera,91,Amazonas,3828,Amazonas (ETC),1197,50.04,50.04,...,11.62,8.64,14.0,26.42,5.56,24.77,37.27,12.0,,


3. Muestra aleatoria

In [12]:
df_raw.sample(frac = 0.01, random_state=123456)

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
10516,2014,50330,Mesetas,50,Meta,3796,Meta (ETC),2904,76.41,89.01,...,0,0,0,0,0,0,0,0,25.59,1.75
2152,2022,5628,Sabanalarga,5,Antioquia,3758,Antioquia (ETC),2046,79.33,79.33,...,0.73,0.87,0,8.32,0,12.39,6.09,1.38,,
9454,2015,41660,Saladoblanco,41,Huila,3790,Huila (ETC),3108,86.87,86.87,...,5.56,0.47,0.45,2.68,1.95,3.96,1.29,0.45,33.45,12.5
10130,2014,97889,Yavaraté,97,Vaupés,3831,Vaupés (ETC),386,75.39,87.65,...,0,0,0,3.61,0,4.9,2.53,0,9.76,16.67
11152,2014,5411,Liborina,5,Antioquia,3758,Antioquia (ETC),1985,105.29,89.66,...,0,0,0,2.87,2.56,2.34,3.62,3.52,27.09,73.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4912,2019,50251,El Castillo,50,Meta,3796,Meta (ETC),1766,95.13,95.13,...,4.17,3.89,1.6,1.44,0,1.19,2.54,0,,
11711,2013,41503,Oporapa,41,Huila,3790,Huila (ETC),3997,68.83,68.8,...,4.47,9.14,8.14,1.26,0,2.09,0.58,0,,11.11
9773,2015,15897,Zetaquira,15,Boyacá,3769,Boyacá (ETC),981,97.96,97.96,...,0,4.52,3.64,0.79,0,1.48,0.53,0,27.27,0
3428,2020,81065,Arauquita,81,Arauca,3824,Arauca (ETC),13580,68.03,68.03,...,7.96,15.51,7.36,6.42,1.93,5.15,10.88,2.03,,


In [13]:
df_raw.sample(n = 10, random_state=123456)

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
10516,2014,50330,Mesetas,50,Meta,3796,Meta (ETC),2904,76.41,89.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.59,1.75
2152,2022,5628,Sabanalarga,5,Antioquia,3758,Antioquia (ETC),2046,79.33,79.33,...,0.73,0.87,0.0,8.32,0.0,12.39,6.09,1.38,,
9454,2015,41660,Saladoblanco,41,Huila,3790,Huila (ETC),3108,86.87,86.87,...,5.56,0.47,0.45,2.68,1.95,3.96,1.29,0.45,33.45,12.5
10130,2014,97889,Yavaraté,97,Vaupés,3831,Vaupés (ETC),386,75.39,87.65,...,0.0,0.0,0.0,3.61,0.0,4.9,2.53,0.0,9.76,16.67
11152,2014,5411,Liborina,5,Antioquia,3758,Antioquia (ETC),1985,105.29,89.66,...,0.0,0.0,0.0,2.87,2.56,2.34,3.62,3.52,27.09,73.53
2210,2022,5134,Campamento,5,Antioquia,3758,Antioquia (ETC),2223,72.15,72.15,...,10.41,6.99,1.06,9.12,0.0,13.49,6.99,0.53,,
6968,2017,68368,Jesús María,68,Santander,3808,Santander (ETC),657,87.2,87.21,...,4.76,9.42,3.48,0.0,0.0,0.0,0.0,0.0,25.333,8.33
8852,2016,73675,San Antonio,73,Tolima,3815,Tolima (ETC),3510,90.9,90.85,...,6.59,8.57,5.08,0.18,0.0,0.12,0.18,0.68,24.985,56.67
13385,2012,5475,Murindó,5,Antioquia,3758,Antioquia (ETC),1023,95.6,95.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.33,5.0
2067,2022,13458,Montecristo,13,Bolívar,3767,Bolívar (ETC),4308,75.35,75.35,...,10.87,12.12,1.79,13.41,3.79,16.19,13.2,3.57,,


In [14]:
df_raw.tail(20) ## Ver la cola

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
14565,2011,5107,Briceño,5,Antioquia,3758,Antioquia (ETC),2256,94.1,94.1,...,0.9,0.0,0.0,0.91,0.0,1.28,0.67,0.0,16.28,83.33
14566,2011,5101,Ciudad Bolívar,5,Antioquia,3758,Antioquia (ETC),6651,88.24,88.2,...,0.03,0.0,1.63,4.13,0.0,5.24,3.54,2.77,20.75,97.06
14567,2011,5093,Betulia,5,Antioquia,3758,Antioquia (ETC),4260,95.45,95.5,...,11.11,13.81,9.76,0.08,0.0,0.12,0.0,0.0,26.89,87.8
14568,2011,5091,Betania,5,Antioquia,3758,Antioquia (ETC),2345,89.85,89.9,...,0.0,0.0,0.0,0.48,0.0,0.72,0.17,0.0,30.56,96.3
14569,2011,5088,Bello,5,Antioquia,3760,Bello,91582,82.53,82.4,...,0.12,0.11,1.35,3.12,0.41,2.39,5.03,1.63,28.52,87.5
14570,2011,5086,Belmira,5,Antioquia,3758,Antioquia (ETC),1700,97.76,97.8,...,1.92,0.0,0.58,,0.66,,,5.85,26.95,100.0
14571,2011,5079,Barbosa,5,Antioquia,3758,Antioquia (ETC),10933,82.05,82.1,...,7.55,12.47,13.01,1.02,0.0,0.54,2.07,0.57,29.49,84.0
14572,2011,5059,Armenia,5,Antioquia,3758,Antioquia (ETC),1097,87.97,88.0,...,7.11,2.65,14.74,4.21,0.0,4.3,6.18,0.0,17.79,100.0
14573,2011,5055,Argelia,5,Antioquia,3758,Antioquia (ETC),2460,88.58,88.6,...,0.0,0.0,0.53,,0.0,,2.3,1.27,31.07,82.98
14574,2011,5051,Arboletes,5,Antioquia,3758,Antioquia (ETC),10673,84.85,84.9,...,3.1,3.74,4.91,2.25,0.66,2.41,2.85,0.73,26.36,47.22


Vamos a crear una copia del conjunto de datos, para cambiar los índices. 

In [15]:
df_raw2 = df_raw.copy() ## importante para hacer cambios crear una copia

In [16]:
df_raw.shape ## atributo

(14585, 41)

In [17]:
df_raw2['new_index'] = range(1,df_raw.shape[0]+1) ### Cración de index

In [18]:
df_raw2.head(3)

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet,new_index
0,2023,5004,Abriaquí,5,Antioquia,3758,Antioquia (ETC),503,62.62,62.62,...,16.51,2.04,9.52,0.0,10.46,13.76,2.04,,,1
1,2023,95025,El Retorno,95,Guaviare,3830,Guaviare (ETC),4438,53.27,53.27,...,9.39,1.75,9.34,6.95,11.84,8.48,3.16,,,2
2,2023,95200,Miraflores,95,Guaviare,3830,Guaviare (ETC),2014,32.52,32.52,...,14.13,7.81,8.65,6.67,9.04,10.25,1.54,,,3


In [19]:
df_raw2.set_index('new_index', inplace = True)

In [21]:
df_raw2.head(2)

Unnamed: 0_level_0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023,5004,Abriaquí,5,Antioquia,3758,Antioquia (ETC),503,62.62,62.62,...,1.96,16.51,2.04,9.52,0.0,10.46,13.76,2.04,,
2,2023,95025,El Retorno,95,Guaviare,3830,Guaviare (ETC),4438,53.27,53.27,...,7.11,9.39,1.75,9.34,6.95,11.84,8.48,3.16,,


In [22]:
df_raw2.reset_index() ## quitarlo como index

Unnamed: 0,new_index,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
0,1,2023,05004,Abriaquí,05,Antioquia,3758,Antioquia (ETC),503,62.62,...,1.96,16.51,2.04,9.52,0,10.46,13.76,2.04,,
1,2,2023,95025,El Retorno,95,Guaviare,3830,Guaviare (ETC),4438,53.27,...,7.11,9.39,1.75,9.34,6.95,11.84,8.48,3.16,,
2,3,2023,95200,Miraflores,95,Guaviare,3830,Guaviare (ETC),2014,32.52,...,6.93,14.13,7.81,8.65,6.67,9.04,10.25,1.54,,
3,4,2023,97001,Mitú,97,Vaupés,3831,Vaupés (ETC),10986,59.57,...,4.04,8.33,4.6,16.18,7.75,21.04,13.84,7.18,,
4,5,2023,97161,Caruru,97,Vaupés,3831,Vaupés (ETC),1228,51.3,...,7.32,15.28,7.27,9.24,2.86,7.62,14.85,3.64,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14580,14581,2011,5036,Angelópolis,5,Antioquia,3758,Antioquia (ETC),1707,78.85,...,3.61,9.5,7.32,0.71,0,0.7,1.08,0,19.57,100
14581,14582,2011,5034,Andes,5,Antioquia,3758,Antioquia (ETC),10244,84.45,...,0.58,0.04,2.69,5.41,0.73,5.53,6.9,4.11,24.43,93.44
14582,14583,2011,5031,Amalfi,5,Antioquia,3758,Antioquia (ETC),5552,97.71,...,0,0,0,,0.83,,9.93,4.47,20.01,53.45
14583,14584,2011,5030,Amagá,5,Antioquia,3758,Antioquia (ETC),6631,78.65,...,6.73,14.46,7.45,0.42,0,0.24,0.91,0,25.05,83.33


Para evaluar si existen registros repetidos, usamos el método `drop_duplicate`

In [23]:
df_raw.drop_duplicates().shape

(14585, 41)

## Coversión de Variables Según su Naturaleza

In [24]:
## si no la hay creela y si la hay transformela

df_raw['tasa_matriculaci_n_5_16'] = df_raw['tasa_matriculaci_n_5_16'].astype(float)

In [25]:
df_raw['tasa_matriculaci_n_5_16'].describe()

count    14470.000000
mean        84.971929
std         18.589112
min          0.000000
25%         74.850000
50%         85.330000
75%         95.380000
max        279.030000
Name: tasa_matriculaci_n_5_16, dtype: float64

Ahora, vamos a generar una estadística de resumen a nivel de departamentos

In [26]:
df_raw.groupby(['c_digo_departamento', 'departamento']) ['tasa_matriculaci_n_5_16'].mean().reset_index() ## aclarar dentro de una lista cuales son las variables agrupadoras

Unnamed: 0,c_digo_departamento,departamento,tasa_matriculaci_n_5_16
0,0,NACIONAL,0.9226
1,5,Antioquia,84.21504
2,8,Atlántico,83.640435
3,11,Bogotá D.C.,96.41
4,11,"Bogotá, D.C.",86.860833
5,13,Bolívar,86.463798
6,15,Boyacá,82.983827
7,17,Caldas,82.187436
8,18,Caquetá,75.105529
9,19,Cauca,79.346987


Como `NACIONAL` no es depto, se quita

In [27]:
df_raw[df_raw['departamento'] != 'NACIONAL']

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
0,2023,05004,Abriaquí,05,Antioquia,3758,Antioquia (ETC),503,62.62,62.62,...,1.96,16.51,2.04,9.52,0,10.46,13.76,2.04,,
1,2023,95025,El Retorno,95,Guaviare,3830,Guaviare (ETC),4438,53.27,53.27,...,7.11,9.39,1.75,9.34,6.95,11.84,8.48,3.16,,
2,2023,95200,Miraflores,95,Guaviare,3830,Guaviare (ETC),2014,32.52,32.52,...,6.93,14.13,7.81,8.65,6.67,9.04,10.25,1.54,,
3,2023,97001,Mitú,97,Vaupés,3831,Vaupés (ETC),10986,59.57,59.57,...,4.04,8.33,4.6,16.18,7.75,21.04,13.84,7.18,,
4,2023,97161,Caruru,97,Vaupés,3831,Vaupés (ETC),1228,51.30,51.3,...,7.32,15.28,7.27,9.24,2.86,7.62,14.85,3.64,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14580,2011,5036,Angelópolis,5,Antioquia,3758,Antioquia (ETC),1707,78.85,78.9,...,3.61,9.5,7.32,0.71,0,0.7,1.08,0,19.57,100
14581,2011,5034,Andes,5,Antioquia,3758,Antioquia (ETC),10244,84.45,84.5,...,0.58,0.04,2.69,5.41,0.73,5.53,6.9,4.11,24.43,93.44
14582,2011,5031,Amalfi,5,Antioquia,3758,Antioquia (ETC),5552,97.71,97.7,...,0,0,0,,0.83,,9.93,4.47,20.01,53.45
14583,2011,5030,Amagá,5,Antioquia,3758,Antioquia (ETC),6631,78.65,78.7,...,6.73,14.46,7.45,0.42,0,0.24,0.91,0,25.05,83.33


In [28]:
df_raw.query("departamento != 'NACIONAL'")

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,...,reprobaci_n_primaria,reprobaci_n_secundaria,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet
0,2023,05004,Abriaquí,05,Antioquia,3758,Antioquia (ETC),503,62.62,62.62,...,1.96,16.51,2.04,9.52,0,10.46,13.76,2.04,,
1,2023,95025,El Retorno,95,Guaviare,3830,Guaviare (ETC),4438,53.27,53.27,...,7.11,9.39,1.75,9.34,6.95,11.84,8.48,3.16,,
2,2023,95200,Miraflores,95,Guaviare,3830,Guaviare (ETC),2014,32.52,32.52,...,6.93,14.13,7.81,8.65,6.67,9.04,10.25,1.54,,
3,2023,97001,Mitú,97,Vaupés,3831,Vaupés (ETC),10986,59.57,59.57,...,4.04,8.33,4.6,16.18,7.75,21.04,13.84,7.18,,
4,2023,97161,Caruru,97,Vaupés,3831,Vaupés (ETC),1228,51.30,51.3,...,7.32,15.28,7.27,9.24,2.86,7.62,14.85,3.64,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14580,2011,5036,Angelópolis,5,Antioquia,3758,Antioquia (ETC),1707,78.85,78.9,...,3.61,9.5,7.32,0.71,0,0.7,1.08,0,19.57,100
14581,2011,5034,Andes,5,Antioquia,3758,Antioquia (ETC),10244,84.45,84.5,...,0.58,0.04,2.69,5.41,0.73,5.53,6.9,4.11,24.43,93.44
14582,2011,5031,Amalfi,5,Antioquia,3758,Antioquia (ETC),5552,97.71,97.7,...,0,0,0,,0.83,,9.93,4.47,20.01,53.45
14583,2011,5030,Amagá,5,Antioquia,3758,Antioquia (ETC),6631,78.65,78.7,...,6.73,14.46,7.45,0.42,0,0.24,0.91,0,25.05,83.33


In [29]:
resumen_matriculados_v0=(
    df_raw
    .query("departamento != 'NACIONAL' ")
    .groupby(['c_digo_departamento','departamento'])
    ['tasa_matriculaci_n_5_16']
    .mean()
    .reset_index()
)

In [30]:
resumen_matriculados_v0

Unnamed: 0,c_digo_departamento,departamento,tasa_matriculaci_n_5_16
0,5,Antioquia,84.21504
1,8,Atlántico,83.640435
2,11,Bogotá D.C.,96.41
3,11,"Bogotá, D.C.",86.860833
4,13,Bolívar,86.463798
5,15,Boyacá,82.983827
6,17,Caldas,82.187436
7,18,Caquetá,75.105529
8,19,Cauca,79.346987
9,20,Cesar,96.113944


In [31]:
def ajuste_cod_depto(codigo):
    if len(codigo)==1:
        codigo = '0' + codigo
    return codigo

In [32]:
ajuste_cod_depto('8') ## le pone 08

'08'

Vamos a aplicar esta función a todo el conjunto de datos

In [33]:
df_raw['c_digo_departamento'] = df_raw['c_digo_departamento'].apply(ajuste_cod_depto)

In [34]:
tabla_deptos = (
    df_raw
    .query("departamento != 'NACIONAL'")
    [['c_digo_departamento', 'departamento']]
    .drop_duplicates()
    .groupby('c_digo_departamento')
    .sample(n=1, random_state=1)
    .reset_index()
)

In [35]:
tabla_deptos

Unnamed: 0,index,c_digo_departamento,departamento
0,0,5,Antioquia
1,975,8,Atlántico
2,974,11,"Bogotá, D.C."
3,928,13,Bolívar
4,805,15,Boyacá
5,778,17,Caldas
6,762,18,Caquetá
7,720,19,Cauca
8,695,20,Cesar
9,665,23,Córdoba


In [36]:
df_raw = (
    df_raw
    .query("departamento != 'NACIONAL'")
    .drop(columns='departamento')
    .merge(tabla_deptos, on='c_digo_departamento', how='left')
)
df_raw.head(4)

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,cobertura_neta_transici_n,...,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet,index,departamento
0,2023,5004,Abriaquí,5,3758,Antioquia (ETC),503,62.62,62.62,44.19,...,2.04,9.52,0.0,10.46,13.76,2.04,,,0,Antioquia
1,2023,95025,El Retorno,95,3830,Guaviare (ETC),4438,53.27,53.27,33.91,...,1.75,9.34,6.95,11.84,8.48,3.16,,,1,Guaviare
2,2023,95200,Miraflores,95,3830,Guaviare (ETC),2014,32.52,32.52,17.58,...,7.81,8.65,6.67,9.04,10.25,1.54,,,1,Guaviare
3,2023,97001,Mitú,97,3831,Vaupés (ETC),10986,59.57,59.57,42.76,...,4.6,16.18,7.75,21.04,13.84,7.18,,,3,Vaupés


In [37]:
resumen_matriculados_v0=(
    df_raw
    .query("departamento != 'NACIONAL' ")
    .groupby(['c_digo_departamento','departamento'])
    ['tasa_matriculaci_n_5_16']
    .mean()
    .reset_index()
)
resumen_matriculados_v0

Unnamed: 0,c_digo_departamento,departamento,tasa_matriculaci_n_5_16
0,5,Antioquia,87.887206
1,8,Atlántico,89.455816
2,11,"Bogotá, D.C.",87.595385
3,13,Bolívar,86.463798
4,15,Boyacá,82.983827
5,17,Caldas,82.187436
6,18,Caquetá,75.105529
7,19,Cauca,79.346987
8,20,Cesar,96.113944
9,23,Córdoba,88.003057


In [38]:
def longitud_cod_mpio(codigo):
    return len(codigo)


set(
    df_raw
    .query("departamento != 'NACIONAL' ")
    .assign(len_codigo_mpio = lambda x: x['c_digo_municipio'].apply(longitud_cod_mpio)) 
    ['len_codigo_mpio']
    .astype(str)
)

{'4', '5'}

In [39]:
def ajuste_cod_mpio(codigo):
    if len(codigo)==4:
        codigo = '0' + codigo
    return codigo

In [40]:
df_raw['c_digo_municipio'] = df_raw['c_digo_municipio'].apply(ajuste_cod_mpio)

In [41]:
df_raw

Unnamed: 0,a_o,c_digo_municipio,municipio,c_digo_departamento,c_digo_etc,etc,poblaci_n_5_16,tasa_matriculaci_n_5_16,cobertura_neta,cobertura_neta_transici_n,...,reprobaci_n_media,repitencia,repitencia_transici_n,repitencia_primaria,repitencia_secundaria,repitencia_media,tama_o_promedio_de_grupo,sedes_conectadas_a_internet,index,departamento
0,2023,05004,Abriaquí,05,3758,Antioquia (ETC),503,62.62,62.62,44.19,...,2.04,9.52,0,10.46,13.76,2.04,,,0,Antioquia
1,2023,95025,El Retorno,95,3830,Guaviare (ETC),4438,53.27,53.27,33.91,...,1.75,9.34,6.95,11.84,8.48,3.16,,,1,Guaviare
2,2023,95200,Miraflores,95,3830,Guaviare (ETC),2014,32.52,32.52,17.58,...,7.81,8.65,6.67,9.04,10.25,1.54,,,1,Guaviare
3,2023,97001,Mitú,97,3831,Vaupés (ETC),10986,59.57,59.57,42.76,...,4.6,16.18,7.75,21.04,13.84,7.18,,,3,Vaupés
4,2023,97161,Caruru,97,3831,Vaupés (ETC),1228,51.30,51.3,76.32,...,7.27,9.24,2.86,7.62,14.85,3.64,,,3,Vaupés
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14577,2011,05036,Angelópolis,05,3758,Antioquia (ETC),1707,78.85,78.9,56.2,...,7.32,0.71,0,0.7,1.08,0,19.57,100,0,Antioquia
14578,2011,05034,Andes,05,3758,Antioquia (ETC),10244,84.45,84.5,65.1,...,2.69,5.41,0.73,5.53,6.9,4.11,24.43,93.44,0,Antioquia
14579,2011,05031,Amalfi,05,3758,Antioquia (ETC),5552,97.71,97.7,61.7,...,0,,0.83,,9.93,4.47,20.01,53.45,0,Antioquia
14580,2011,05030,Amagá,05,3758,Antioquia (ETC),6631,78.65,78.7,55.3,...,7.45,0.42,0,0.24,0.91,0,25.05,83.33,0,Antioquia


Respecto a la población del municipio ¿Que porcentaje de escolaridad hay?

## Crear una bodega de datos

En esta bodega de datos la idea ers "subir" ambos conjuntos de datos y procesarlos desde SQL.

In [42]:
import sqlite3

In [43]:
conn = sqlite3.connect('../Datos/educacion_wilmer.db')

In [47]:
cursor = conn.cursor()

In [45]:
datos_2005_2019 = pd.read_excel('../Datos/DCD2005_2019.xlsx')
datos_2020_2035 = pd.read_excel('../Datos/DCD2020_2035.xlsx')

In [46]:
datos_2005_2019.to_sql("informacion_antes_pandemia",conn)
datos_2020_2035.to_sql("informacion_despues_pandemia",conn)

53859

In [48]:
consulta = """
SELECT COUNT(*) FROM informacion_antes_pandemia
"""
pd.read_sql_query(consulta, conn)

Unnamed: 0,COUNT(*)
0,50490


In [49]:
consulta = """

SELECT * FROM informacion_antes_pandemia
LIMIT 10
"""
pd.read_sql_query(consulta, conn)

Unnamed: 0,index,DP,DPNOM,DPMP,MPIO,AÑO,ÁREA GEOGRÁFICA,Población
0,0,5,Antioquia,Medellín,5001,2005,Cabecera Municipal,1997763
1,1,5,Antioquia,Medellín,5001,2005,Centros Poblados y Rural Disperso,48578
2,2,5,Antioquia,Medellín,5001,2005,Total,2046341
3,3,5,Antioquia,Abejorral,5002,2005,Cabecera Municipal,7158
4,4,5,Antioquia,Abejorral,5002,2005,Centros Poblados y Rural Disperso,15784
5,5,5,Antioquia,Abejorral,5002,2005,Total,22942
6,6,5,Antioquia,Abriaquí,5004,2005,Cabecera Municipal,723
7,7,5,Antioquia,Abriaquí,5004,2005,Centros Poblados y Rural Disperso,1996
8,8,5,Antioquia,Abriaquí,5004,2005,Total,2719
9,9,5,Antioquia,Alejandría,5021,2005,Cabecera Municipal,2137


In [51]:
consulta = """

SELECT 
CASE WHEN LENGTH(CAST(DP AS STRING)) = 1 THEN CONCAT('0',CAST(DP AS STRING))
ELSE CAST(DP AS STRING) END AS DP
FROM informacion_antes_pandemia
LIMIT 10
"""
pd.read_sql_query(consulta, conn)

Unnamed: 0,DP
0,5
1,5
2,5
3,5
4,5
5,5
6,5
7,5
8,5
9,5


In [52]:
consulta = """
SELECT
CASE WHEN LENGTH(CAST(DP AS STRING)) = 1 THEN CONCAT('0', CAST(DP AS STRING))
ELSE CAST(DP AS STRING) END AS c_digo_departamento,
CASE WHEN LENGTH(CAST(MPIO AS STRING)) = 4 THEN CONCAT('0', CAST(MPIO AS STRING))
ELSE CAST(MPIO AS STRING) END AS c_digo_municipio
FROM informacion_antes_pandemia
LIMIT 10
"""

pd.read_sql_query(consulta, conn)


Unnamed: 0,c_digo_departamento,c_digo_municipio
0,5,5001
1,5,5001
2,5,5001
3,5,5002
4,5,5002
5,5,5002
6,5,5004
7,5,5004
8,5,5004
9,5,5021


In [53]:
datos_2020_2035.head(3)

Unnamed: 0,DP,DPNOM,MPIO,DPMP,AÑO,ÁREA GEOGRÁFICA,Población
0,5,Antioquia,5001.0,Medellín,2020.0,Cabecera Municipal,2476569.0
1,5,Antioquia,5001.0,Medellín,2020.0,Centros Poblados y Rural Disperso,43023.0
2,5,Antioquia,5001.0,Medellín,2020.0,Total,2519592.0


In [None]:
consulta = """

CREATE OR REPLACE TABLE info_proyeccion_mpios 
AS
(
SELECT
CASE WHEN LENGTH(CAST(DP AS STRING)) = 1 THEN CONCAT('0', CAST(DP AS STRING))
ELSE CAST(DP AS STRING) END AS c_digo_departamento,
CASE WHEN LENGTH(CAST(MPIO AS STRING)) = 4 THEN CONCAT('0', CAST(MPIO AS STRING))
ELSE CAST(MPIO AS STRING) END AS c_digo_municipio,
AÑO AS a_o,
Población AS poblacion_total
FROM informacion_antes_pandemia
WHERE `ÁREA GEOGRÁFICA` = 'Total'

UNION ALL


SELECT
CASE WHEN LENGTH(CAST(DP AS STRING)) = 1 THEN CONCAT('0', CAST(DP AS STRING))
ELSE CAST(DP AS STRING) END AS c_digo_departamento,
CASE WHEN LENGTH(CAST(MPIO AS STRING)) = 4 THEN CONCAT('0', CAST(MPIO AS STRING))
ELSE CAST(MPIO AS STRING) END AS c_digo_municipio,
AÑO AS a_o,
Población AS poblacion_total
FROM informacion_despues_pandemia
WHERE `ÁREA GEOGRÁFICA` = 'Total'
)
"""

pd.read_sql_query(consulta, conn)

Unnamed: 0,c_digo_departamento,c_digo_municipio,a_o,poblacion_total
0,05,05001,2005.0,2046341.0
1,05,05002,2005.0,22942.0
2,05,05004,2005.0,2719.0
3,05,05021,2005.0,4724.0
4,05,05030,2005.0,27121.0
...,...,...,...,...
34777,99,99773.0,2031.0,95632.0
34778,99,99773.0,2032.0,96700.0
34779,99,99773.0,2033.0,97739.0
34780,99,99773.0,2034.0,98753.0
