# Informe de desempeño de PROESUR años 2015 - 2024

Este informe presenta el desempeño del campus PROESUR desde el año 2015 hasta el año 2024.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Folder that is scanned for the source .xlsx workbooks.
FOLDER = Path("data")
# Folder that receives the filtered CSV extracts.
OUTPUT = Path("Extracted_data")

In [5]:
def load_small_chunks(path: Path, col: str = "Cod_Estable", code: str = "05-02-0232-46") -> pd.DataFrame:
    """Return the rows of an Excel workbook whose ``col`` value equals ``code``.

    Args:
        path: Location of the ``.xlsx`` file; it is read with the openpyxl
            engine.
        col: Name of the column to compare.
        code: Value a row must hold in ``col`` to be kept.

    Returns:
        A DataFrame with only the matching rows; all columns are kept.
    """
    workbook = pd.read_excel(path, engine="openpyxl")
    matches = workbook[col] == code
    return workbook.loc[matches]

def concat_dataframes(files: list[Path]) -> pd.DataFrame | None:
    df_list: list[pd.DataFrame] = []
    for f in files:
        df_list.append(load_small_chunks(f))
    try:
        result = pd.concat(df_list, ignore_index=True)
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None
 

In [4]:
def load_small_chunks_multiple(path: Path, col: str = "Cod_Estable", code: str = "05-02-0232-46") -> pd.DataFrame:
    """Return the rows of a workbook whose ``col`` equals ``code``, as text.

    The first sheet is read with every cell kept as a string, so codes are
    compared textually and all columns are preserved.

    Args:
        path: ``.xlsx`` file to read with the openpyxl engine.
        col: Name of the column to compare.
        code: Value a row must hold in ``col`` to be kept.

    Returns:
        The matching rows (possibly none) with all columns kept.

    Raises:
        KeyError: If ``col`` is not a column of the sheet.
    """
    df = pd.read_excel(path, engine="openpyxl", dtype=str)  # keep as strings
    if col not in df.columns:
        # Same exception type the bare lookup would raise, but the message
        # names the file and the columns that are actually available.
        raise KeyError(
            f"Column {col!r} not found in {path}; "
            f"available columns: {list(df.columns)}"
        )
    return df[df[col] == code]

    

In [22]:
# Filtered frames of every workbook that had at least one matching row.
df_list: list[pd.DataFrame] = []

# DataFrame.to_csv does not create missing directories, so make sure the
# export folder exists before the first write.
OUTPUT.mkdir(parents=True, exist_ok=True)

# Walk the workbooks in reverse path order.
for f in sorted(FOLDER.glob("*.xlsx"), reverse=True):
    df = load_small_chunks_multiple(f)
    if df.empty:
        print(f"{f.stem} was empty")
        continue
    # Persist the extract next to the others and keep it for later use.
    df.to_csv(OUTPUT / f"{f.stem}_filtered.csv", index=False)
    df_list.append(df)


GRADUANDOS 2016 was empty
GRADUANDOS 2015 was empty
2019-Grad-Internet was empty
2018_-_Grad_Internet was empty
2017_Grad-Version_Internet was empty


In [None]:

# Load the rows that pass the default column/code filter from the 2018
# workbook and preview the first five of them.
df = load_small_chunks(FOLDER / "2018_-_Grad_Internet.xlsx")
df.head(5)

In [14]:
def search_by_value(path: Path, val:str, col:str) -> pd.DataFrame:
    """Return the rows whose ``col`` text contains ``val`` (case-insensitive).

    Args:
        path: ``.xlsx`` file to read with the openpyxl engine.
        val: Pattern to look for. It is handed to ``Series.str.contains``
            with its default ``regex=True``, so regex metacharacters such
            as ``.`` keep their special meaning.
        col: Column to search; values are converted to text first so that
            non-string columns can be searched too.

    Returns:
        The matching rows with all columns kept. Rows whose ``col`` cell is
        missing never match.
    """
    df = pd.read_excel(path, engine="openpyxl")
    column = df[col]
    # astype(str) turns missing cells into the literal "nan", which would
    # match patterns such as "an"; exclude those cells explicitly.
    mask = column.notna() & column.astype(str).str.contains(val, case=False, na=False)
    return df.loc[mask]


def search_by_map(path: Path, query: dict[str, str], op: str = "AND") -> pd.DataFrame:
    """Return the rows that satisfy a column -> pattern query.

    Example query: {"NOM_ESTAB": "INSTITUTO", "Municipio": "Guatemala"}

    Args:
        path: ``.xlsx`` file to read with the openpyxl engine; every cell is
            read as text.
        query: Maps a column name to the pattern its text must contain
            (case-insensitive). Patterns go to ``Series.str.contains`` with
            its default ``regex=True``. Columns that are not in the sheet
            are skipped.
        op: ``"OR"`` (any letter case) keeps rows matching at least one
            pattern; any other value, including the default ``"AND"``,
            keeps rows matching all of them.

    Returns:
        The matching rows with all columns kept. When none of the query
        columns exist, an empty frame with the sheet's columns is returned.
        Rows whose cell is missing never match that column's pattern.
    """
    df = pd.read_excel(path, engine="openpyxl", dtype=str)
    masks = [
        # astype(str) turns missing cells into the literal "nan"; the
        # notna() guard keeps them from matching patterns such as "an".
        df[col].notna() & df[col].astype(str).str.contains(val, case=False, na=False)
        for col, val in query.items()
        if col in df.columns
    ]
    if not masks:
        return df.iloc[0:0]
    # One boolean column per query entry; reduce them row-wise.
    stacked = pd.concat(masks, axis=1)
    if op.upper() == "OR":
        mask = stacked.any(axis=1)
    else:
        mask = stacked.all(axis=1)
    return df.loc[mask]


In [15]:
# Workbooks to search through the Cod_Muni and Direc_Estable columns.
FILES = [
    "2019-Grad-Internet.xlsx",
    "2018_-_Grad_Internet.xlsx",
    "2017_Grad-Version_Internet.xlsx",
]
query = {"Cod_Muni": "0502", "Direc_Estable": "Km. 92.5 Finca Camantulul Carretera A Mazatenango"}

# DataFrame.to_csv does not create missing directories, so make sure the
# export folder exists before the first write.
OUTPUT.mkdir(parents=True, exist_ok=True)

for fname in FILES:
    file_path = FOLDER / fname
    if not file_path.exists():
        print(f"⚠️  File not found: {file_path}")
        continue
    # Both conditions must hold for a row to be kept.
    result = search_by_map(file_path, query=query, op="AND")
    # NOTE(review): fname still carries its ".xlsx" suffix, so the export is
    # named e.g. "2019-Grad-Internet.xlsx_filtered.csv" - confirm intended.
    result.to_csv(OUTPUT / f"{fname}_filtered.csv", index=False)
    print(f"{fname}: {len(result)} rows found")

2019-Grad-Internet.xlsx: 68 rows found
2018_-_Grad_Internet.xlsx: 53 rows found
2017_Grad-Version_Internet.xlsx: 59 rows found
