# Informe de desempeño de PROESUR años 2015 - 2024

Este informe presenta el desempeño del campus PROESUR desde el año 2015 hasta el año 2024.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Directory that holds the source .xlsx workbooks.
FOLDER = Path("data")
# Directory where the filtered CSV extracts are written.
OUTPUT = Path("Extracted_data")
# Create the output directory up front: the later `to_csv(OUTPUT / ...)` calls
# raise if it does not exist (e.g. on a fresh checkout).
OUTPUT.mkdir(parents=True, exist_ok=True)

In [3]:
def load_small_chunks(path: Path, col: str = "Cod_Estable",code: str = "05-02-0232-46") -> pd.DataFrame:
    """Load one Excel workbook and keep only the rows that match a code.

    Args:
        path: Location of the ``.xlsx`` file to read.
        col: Name of the column compared against ``code``.
        code: Value a row must hold in ``col`` to be kept.

    Returns:
        The rows whose ``col`` equals ``code``, with all original columns
        and their original index labels.

    Raises:
        KeyError: If the workbook has no column named ``col``.
    """
    workbook = pd.read_excel(path, engine="openpyxl")
    matches_code = workbook[col] == code
    return workbook[matches_code]

def concat_dataframes(files: list[Path]) -> pd.DataFrame | None:
    df_list: list[pd.DataFrame] = []
    for f in files:
        df_list.append(load_small_chunks(f))
    try:
        result = pd.concat(df_list, ignore_index=True)
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None
 

In [4]:
def load_small_chunks_multiple(path: Path, col: str = "Cod_Estable", code: str = "05-02-0232-46") -> pd.DataFrame:
    """Read a workbook as text and keep the rows where ``col`` equals ``code``.

    Every cell is loaded as a string (``dtype=str``) so the comparison with
    ``code`` is textual. Only the sheet that ``pandas.read_excel`` reads by
    default is used.

    Args:
        path: Location of the ``.xlsx`` file to read.
        col: Name of the column compared against ``code``.
        code: Value a row must hold in ``col`` to be kept.

    Returns:
        The matching rows with all columns. When the workbook has no column
        named ``col`` an empty frame with the workbook's columns is returned,
        so callers can handle "column missing" and "no match" the same way
        through ``DataFrame.empty``.
    """
    df = pd.read_excel(path, engine="openpyxl", dtype=str)  # keep as strings
    if col not in df.columns:
        # The previous fallback re-ran the failing lookup and raised again;
        # report "nothing found" instead.
        return df.iloc[0:0]
    return df[df[col] == code]

    

In [None]:

# Filter every workbook in FOLDER (reverse name order) with the default
# column/code and save each non-empty result as "<stem>_filtered.csv".
for f in sorted(FOLDER.glob("*.xlsx"), reverse=True):
    df = load_small_chunks_multiple(f)
    if df.empty:
        # No row matched in this workbook; report it and write nothing.
        print(f"{f.stem} was empty")
        continue
    df.to_csv(OUTPUT / f"{f.stem}_filtered.csv", index=False)


GRADUANDOS 2016 was empty
GRADUANDOS 2015 was empty
2019-Grad-Internet was empty
2018_-_Grad_Internet was empty
2017_Grad-Version_Internet was empty


In [5]:
def search_by_value(path: Path, val:str, col:str) -> pd.DataFrame:
    """Return the rows of a workbook whose ``col`` contains ``val``.

    The match is case-insensitive. ``val`` is handed to
    ``Series.str.contains`` with its default settings, i.e. it is treated as
    a regular expression; escape it first if a literal match is required.

    Args:
        path: Location of the ``.xlsx`` file to read.
        val: Text (regular expression) to look for.
        col: Name of the column to search.

    Returns:
        The matching rows with all columns and their original index labels.
        Rows whose ``col`` cell is missing never match.

    Raises:
        KeyError: If the workbook has no column named ``col``.
    """
    df = pd.read_excel(path, engine="openpyxl")
    column = df[col]
    # Cells are stringified so non-text columns can be searched too, but that
    # also turns missing cells into the text "nan". Exclude them explicitly so
    # a search such as "na" does not match empty cells.
    mask = column.notna() & column.astype(str).str.contains(val, case=False, na=False)
    return df.loc[mask]


def search_by_map(path: Path, query: dict[str, str], op: str = "AND") -> pd.DataFrame:
    """Return the rows of a workbook that satisfy a multi-column text query.

    The workbook is read with every cell as text. For each ``column: text``
    pair in ``query`` a case-insensitive "contains" test is applied; the text
    is treated as a regular expression (``Series.str.contains`` default).

    Args:
        path: Location of the ``.xlsx`` file to read.
        query: Mapping of column name to the text searched in that column,
            e.g. ``{"NOM_ESTAB": "INSTITUTO", "Municipio": "Guatemala"}``.
            Columns that do not exist in the workbook are skipped.
        op: ``"OR"`` (any case) keeps rows matching at least one condition;
            any other value, including the default ``"AND"``, keeps rows
            matching all conditions.

    Returns:
        The matching rows with all columns and their original index labels.
        An empty frame with the workbook's columns is returned when none of
        the queried columns exist. Missing cells never match.
    """
    df = pd.read_excel(path, engine="openpyxl", dtype=str)
    masks = []
    for col, val in query.items():
        if col not in df.columns:
            continue
        column = df[col]
        # astype(str) would turn a missing cell into the text "nan", which
        # then matches searches like "na"; rule those cells out explicitly.
        masks.append(column.notna() & column.astype(str).str.contains(val, case=False, na=False))
    if not masks:
        return df.iloc[0:0]
    # One boolean column per condition, combined row-wise.
    conditions = pd.concat(masks, axis=1)
    mask = conditions.any(axis=1) if op.upper() == "OR" else conditions.all(axis=1)
    return df.loc[mask]


In [None]:
FILES = ["GRADUANDOS 2015.xlsx", "GRADUANDOS 2016.xlsx"]

# These workbooks are searched by establishment name instead of by code.
# The output name is built from the full file name, so it keeps the ".xlsx"
# part (e.g. "GRADUANDOS 2015.xlsx_filtered.csv").
for fname in FILES:
    file_path = FOLDER / fname
    if file_path.exists():
        result = search_by_value(file_path, "INSTITUTO TECNOLOGICO PROESUR ", "NOM_ESTAB")
        result.to_csv(OUTPUT / f"{fname}_filtered.csv", index=False)
        print(f"{fname}: {len(result)} rows found")
    else:
        print(f"File not found: {file_path}")

In [None]:
FILES = ["2019-Grad-Internet.xlsx", "2018_-_Grad_Internet.xlsx", "2017_Grad-Version_Internet.xlsx"]
query = {"Cod_Muni": "0502", "Direc_Estable": "Km. 92.5 Finca Camantulul Carretera A Mazatenango"}

# These workbooks are searched by municipality code and address; both
# conditions must hold (op="AND"). The output name keeps the ".xlsx" part of
# the source file name.
for fname in FILES:
    file_path = FOLDER / fname
    if file_path.exists():
        result = search_by_map(file_path, query=query, op="AND")
        result.to_csv(OUTPUT / f"{fname}_filtered.csv", index=False)
        print(f"{fname}: {len(result)} rows found")
    else:
        print(f"File not found: {file_path}")

2019-Grad-Internet.xlsx: 68 rows found
2018_-_Grad_Internet.xlsx: 53 rows found
2017_Grad-Version_Internet.xlsx: 59 rows found


## Análisis de los datos obtenidos

In [5]:

# Maps a logical variable name to the candidate column names that may carry it.
# `get_col` returns the first candidate that exists in a given DataFrame, so the
# order inside each list is the lookup priority. Lists with several entries
# presumably cover naming differences between source files (TODO confirm).
ALIASES = {

    # Academic results
    "nivel_mate": [
        "DESEMPEÑO_MATEMÁTICAS"
    ],
    "nivel_lectura": [
        "DESEMPEÑO_LECTURA"
    ],
    "logro_mate": [
        "LOGRO_MATEMÁTICAS"
    ],
    # NOTE(review): key looks like a typo of "logro_lectura"; left unchanged
    # because callers look it up by this exact string.
    "logro_rectura": [
        "LOGRO_LECTURA"
    ],
    "periodos_semanales_mate" : [
        "MAT_PERIODOS_MATEMATICA_SEMANA_Recodificada"
    ],
    "periodos_semanales_lectura" : [
        "LEC_PERIODOS_LECTURA_SEMANA_Recodificada"
    ],
    # Socioeconomic background
    "trabaja": [
       "ED_TRABAJA_ACTUALMENTE"
    ],
    "acceso_internet": [
        "CC_SERVICIO_INTERNET"
    ],
    "asistencia_primaria" : [
        "ED_AREA_DE_ESCUELA_FINALIZO_PRIMARIA"
    ],
    "sexo": [
        "GENERO", "Sexo_RECO"
    ],
    "padre_asistio_escuela" : [
        "FM_ASISTIO_ESCUELA_PAPA"
    ],
    "grado_alcanzado_padre" : [
        "FM_GRADO_ALCANZO_PAPA_Recodificada"
    ],
    # NOTE(review): key looks like a typo of "madre_asistio_escuela" (compare
    # "padre_asistio_escuela"); left unchanged for the same reason.
    "madre_asisitio_escuela" : [
        "FM_ASISTIO_ESCUELA_MAMA"
    ],
    "grado_alcanzado_madre" : [
        "FM_GRADO_ALCANZO_MAMA_Recodificada"
    ],
    # NOTE(review): key looks like a typo of "grupo_etnico"; left unchanged.
    "grupo_etincico" : [
        "IE_IDENTIFICACION_ETNICA_Recodificada" , "Identificacion_Etnica_RECO"
    ],
    "rep_primaria" : [
        "ED_REPITIO_ALGUN_GRADO_PRIMARIA"
    ]

}


In [3]:
def get_col(df: pd.DataFrame, key: str) -> str | None:
    """Return the actual column name for a logical key, or None if not present."""
    for name in ALIASES.get(key, []):
        if name in df.columns:
            return name
    return None

def dist_levels(df: pd.DataFrame, key: str) -> pd.Series:
    """Percentage share of each category in the column mapped by ``key``.

    Values are compared as stripped, upper-cased text. The placeholders
    "S/ DATO", "NA" and "NAN" (which includes stringified missing cells) are
    treated as missing and left out of the percentages.

    Args:
        df: Frame holding the data.
        key: Logical name resolved through ``get_col``, e.g. "nivel_mate"
            or "nivel_lectura".

    Returns:
        A Series indexed by category with its percentage rounded to two
        decimals, most frequent first. An empty float Series is returned
        when ``df`` has no column for ``key``.
    """
    column_name = get_col(df, key)
    if column_name:
        placeholders = {"S/ DATO": None, "NA": None, "NAN": None}
        normalized = df[column_name].astype(str).str.strip().str.upper()
        cleaned = normalized.replace(placeholders)
        shares = cleaned.value_counts(dropna=True, normalize=True)
        return (shares * 100).round(2)
    return pd.Series(dtype=float)

def pct_true(df: pd.DataFrame, key: str) -> float | None:
    """
    Percentage of 'true' / 'yes' / 1 for the column mapped by `key`.
    Returns None if column not present or no valid data.
    """
    col = get_col(df, key)
    if not col:
        return None
    
    s = df[col].astype(str).str.strip().str.lower()
    true_vals = {"1", "si", "sí", "yes", "true", "t"}
    false_vals = {"0", "no", "false", "f"}
    
    mask_valid = s.isin(true_vals | false_vals)
    if not mask_valid.any():
        return None
    
    pct = (s.isin(true_vals) & mask_valid).mean() * 100
    return round(pct, 2)



In [7]:
# Folder the filtered CSV extracts are read from (same path as OUTPUT).
CLEAN_PATH = Path("Extracted_data")
# Extracts produced by the name-based search; the file names keep the ".xlsx"
# part because they were built from the full source file name.
early_reports: list[str] = ["GRADUANDOS 2015.xlsx_filtered.csv", "GRADUANDOS 2016.xlsx_filtered.csv"]

# Only the first listed extract is loaded here.
exp_df = pd.read_csv(CLEAN_PATH / early_reports[0])

# Percentage distribution of the column mapped by the "nivel_mate" key.
nivel_mat = dist_levels(exp_df, "nivel_mate")
print(nivel_mat)

DESEMPEÑO_MATEMÁTICAS
2.0    40.43
4.0    34.04
1.0    12.77
3.0    12.77
Name: proportion, dtype: float64
