In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the consolidated SIMAT extract in pieces of 10,000 rows, then stitch
# the pieces back together into a single DataFrame.
chunk = pd.read_csv(
    '../data/1-bronce/Consolidado-SIMAT-2017-2021.csv',
    chunksize=10000,
)
Simat_F = pd.concat(chunk)

In [None]:
# Print the frame summary; verbose=True requests the full per-column listing.
Simat_F.info(verbose=True)

In [None]:
# Columns kept for the analysis. ANO_INF is included because later cells
# filter and cross-tabulate by it.
Simat_cols = [
    "ANO_INF", "RES_DEPTO", "RES_MUN", "ESTRATO", "SISBEN", "FECHA_NACIMIENTO",
    "GENERO", "POB_VICT_CONF", "DPTO_EXP", "MUN_EXP", "REPITENTE",
]
# Rebind the result instead of using inplace=True (which returns None), and
# pass the labels through `columns=`: the positional axis argument is rejected
# by pandas >= 2.0. Re-running this cell is a no-op.
Simat_F = Simat_F.drop(columns=Simat_F.columns.difference(Simat_cols))

In [None]:
# Peek at the first rows of the ANO_INF and SISBEN columns side by side.
Simat_F.loc[:, ["ANO_INF", "SISBEN"]].head()

In [None]:
# Frequency of each distinct REPITENTE value.
Simat_F.loc[:, "REPITENTE"].value_counts()

# RES_DEPTO


In [None]:
# Number of observations per RES_DEPTO value, shown as a two-column table.
Simat_F.loc[:, "RES_DEPTO"].value_counts().reset_index()

In [None]:
# Observations per department code, with explicit column names so the layout
# is the same on every pandas version: "index" holds the code and "RES_DEPTO"
# holds the number of observations (pandas >= 2.0 would otherwise name the
# columns "RES_DEPTO" and "count", and count["index"] would raise KeyError).
count = (
    Simat_F["RES_DEPTO"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="RES_DEPTO")
)
# Total observations whose department code is not 11 (the code this notebook
# treats as Bogotá). Only the count column is summed; adding up the department
# codes themselves has no meaning.
count.loc[count["index"] != 11, "RES_DEPTO"].sum()

**Observation**: 209,402 observations have a residence department other than Bogotá.

# RES_MUN

In [None]:
# Number of observations per RES_MUN value, shown as a two-column table.
Simat_F.loc[:, "RES_MUN"].value_counts().reset_index()

**Observation**: We could concatenate RES_DEPTO with RES_MUN and filter the observations that have "11001", the code of Bogotá.

# ESTRATO

In [None]:
# Frequency of each distinct ESTRATO value.
Simat_F.loc[:, "ESTRATO"].value_counts()

**Observation**: 9897 observations have 9 as their Estrato. This value is erroneous, so we will replace it with NaN.

In [None]:
# Mark the invalid ESTRATO code 9 as missing. np.nan is used rather than None:
# depending on the pandas version, replace(9, None) either forward-fills the
# previous row's value or turns the column into object dtype holding None.
Simat_F["ESTRATO"] = Simat_F["ESTRATO"].replace(9, np.nan)

In [None]:
# Share of each ESTRATO value within every ANO_INF, as percentages
# (normalize="index" makes each row sum to 1 before scaling by 100).
crosstab = pd.crosstab(
    index=Simat_F["ANO_INF"],
    columns=Simat_F["ESTRATO"],
    normalize="index",
).mul(100)

In [None]:
# Stacked bar chart: one bar per row of `crosstab` (ANO_INF), one segment per
# ESTRATO column.
ax = crosstab.plot(kind='bar', stacked=True, figsize=(10, 6))
# The year runs along the x axis; the original labelled the y axis twice, so
# 'Year' overwrote '%' and the x axis had no label of its own.
ax.set_xlabel('Year')
ax.set_ylabel('%')
ax.set_title("Estrato (%) each year")
# Anchor the legend just outside the axes so it does not cover the bars.
plt.legend(title='ESTRATO', bbox_to_anchor=(1.0, 1), loc='upper left')
plt.show()

# SISBEN

In [None]:
# Normalise the SISBEN text before parsing: decimal comma -> decimal point,
# and strip embedded spaces.
# astype(str) comes first so that (a) values that are already numeric are kept
# instead of becoming NaN under the .str accessor, and (b) the cell can be
# re-run after the column is numeric without raising AttributeError.
# regex=False makes both replacements literal on every pandas version.
sisben_text = (
    Simat_F["SISBEN"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.replace(" ", "", regex=False)
)
# Anything that still cannot be parsed becomes NaN; the "nan" text produced by
# astype(str) for missing values parses back to NaN as well.
Simat_F["SISBEN"] = pd.to_numeric(sisben_text, errors='coerce')

In [None]:
# First SISBEN values among rows with ANO_INF == 2017.
Simat_F.loc[Simat_F["ANO_INF"] == 2017, "SISBEN"].head()

In [None]:
# First SISBEN values among rows with ANO_INF == 2018.
Simat_F.loc[Simat_F["ANO_INF"] == 2018, "SISBEN"].head()

In [None]:
# First SISBEN values among rows with ANO_INF == 2021.
Simat_F.loc[Simat_F["ANO_INF"] == 2021, "SISBEN"].head()

**Observation**: It seems that the coding of "0" was done differently in 2021. While in the other years few students have a score of 0, in 2021 almost half have this score.

**Solution**: Since in 2021 it is not possible to differentiate between an original "0" and a "0" placed as a replacement for empty values, we will just ignore the "0" in all the variables


In [None]:
# Treat a SISBEN value of 0 as missing. A single .loc assignment writes into
# Simat_F itself; the chained form Simat_F["SISBEN"][mask] = ... assigns into
# an intermediate object, which raises SettingWithCopyWarning and leaves the
# frame unchanged when pandas Copy-on-Write is active.
Simat_F.loc[Simat_F["SISBEN"] == 0, "SISBEN"] = np.nan

In [None]:
# One SISBEN series per ANO_INF value (2017-2021), each selected with a
# boolean mask on the year column.
sisben_scores = Simat_F["SISBEN"]
report_year = Simat_F["ANO_INF"]

Sis2017 = sisben_scores[report_year == 2017]
Sis2018 = sisben_scores[report_year == 2018]
Sis2019 = sisben_scores[report_year == 2019]
Sis2020 = sisben_scores[report_year == 2020]
Sis2021 = sisben_scores[report_year == 2021]

In [None]:
# Summary statistics of the SISBEN column across all years.
Simat_F.loc[:, "SISBEN"].describe()

In [None]:
sns.set(style="darkgrid")

fig, axs = plt.subplots(2, 3, figsize=(22, 15))

# One entry per panel: (series, bar colour, position in the 2x3 grid, title).
# Position (1, 2) is not listed, so that sixth panel stays empty.
panels = [
    (Sis2017, "skyblue", (0, 0), '2017'),
    (Sis2018, "olive", (0, 1), '2018'),
    (Sis2019, "gold", (0, 2), '2019'),
    (Sis2020, "teal", (1, 0), '2020'),
    (Sis2021, "teal", (1, 1), '2021'),
]

# Histogram with a KDE overlay for each year, titled with that year.
for scores, colour, position, year_label in panels:
    sns.histplot(data=scores, x=scores, kde=True, color=colour, ax=axs[position])
    axs[position].set_title(year_label)


plt.show()

In [None]:
# Show the first rows of the frame after the cleaning steps above.
Simat_F.head()

In [None]:
# Frequency of each distinct "edad" value.
# The original passed a second value_counts() result as the first positional
# argument (`normalize`), which pandas evaluates as a boolean and which raises
# ValueError for a multi-element Series.
# NOTE(review): `SISBEN` is not defined in any cell visible here, and the frame
# loaded above has no "edad" column - confirm which DataFrame this cell is
# meant to read before relying on it.
SISBEN["edad"].value_counts()