In [6]:
import pandas as pd
import os

# --- Full path to your file ---
file_path = r"D:\thesis_project\spending_data\SMPCT_2007\MPCT_FUNCION_2007_agraria.xlsx"

# --- Check if file exists ---
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# --- Load Excel file ---
df = pd.read_excel(file_path, skiprows=16)  # Adjust skiprows if needed

# --- Clean column names ---
df.columns = (
    df.columns.astype(str)  # make sure they're strings
      .str.strip()          # remove spaces at ends
      .str.replace("\u00a0", " ")  # replace non-breaking space
)

print("Cleaned columns:", df.columns.tolist())

# --- Find the 'Avance %' column name dynamically ---
avance_col = None
for col in df.columns:
    if "avance" in col.lower() and "%" in col:
        avance_col = col
        break

if avance_col is None:
    raise KeyError("No column found containing 'Avance' and '%'")

# --- Convert to numeric ---
df[avance_col] = pd.to_numeric(df[avance_col], errors="coerce")

# --- Filter rows ---
filtered_df = df[df[avance_col] >= 90]

# --- Show result ---
print(filtered_df)

# --- Optional: save to Excel ---
filtered_df.to_excel("filtered_avance.xlsx", index=False)
print("Filtered data saved to 'filtered_avance.xlsx'")


Cleaned columns: ['Actividad/Proyecto', 'PIA', 'PIM', 'Ejecución', 'Unnamed: 4', 'Unnamed: 5', 'Avance %']
Empty DataFrame
Columns: [Actividad/Proyecto, PIA, PIM, Ejecución, Unnamed: 4, Unnamed: 5, Avance %]
Index: []
Filtered data saved to 'filtered_avance.xlsx'
