# Full Cabinet Data Processing Pipeline (Uppercase CountryID)

In [None]:
import pandas as pd
import os

# Function to reshape cabinet data with Luxembourg-specific handling
def reshape_cabinet_data_flexible(file_path):
    """Reshape the wide "cabinetpos" sheet of one workbook into long format.

    The country code is taken from the file name: the part before the first
    dot is split on underscores and the second piece is used, so
    ``pdy_dk.xlsx`` yields ``dk``.

    The sheet is read without a header row (all indices below are 0-based).
    Columns 0 and 1 are read as party ID and party name. From column 3
    onwards the sheet is consumed in blocks of five columns, one block per
    government start date; the first four columns of a block are read as
    seats, seat share, cabinet positions and cabinet-position share. The
    start date of a block is read from the column immediately to the left of
    the block, in row 2 for Luxembourg and in row 1 for every other country.
    Party rows start at row 10.

    A party row is emitted for a block only when its party ID is present and
    at least one of the block's four values is present.

    Args:
        file_path: Path to a workbook named like ``pdy_<code>.xlsx`` that
            contains a sheet called "cabinetpos".

    Returns:
        A DataFrame with the columns ``country``, ``countryID``,
        ``gov_start_date``, ``year``, ``partyID``, ``party Name``,
        ``cab_pos`` and ``share_cab_pos``. The frame is empty (and has no
        columns) when no row qualifies.
    """
    # The context manager closes the workbook handle as soon as the sheet
    # has been loaded instead of leaving it open until garbage collection.
    with pd.ExcelFile(file_path) as xls:
        df = xls.parse("cabinetpos", header=None)

    # os.path.basename honours the platform's path separators, unlike a
    # hard-coded split on "/".
    file_name = os.path.basename(file_path)
    country_code = file_name.split(".")[0].split("_")[1]
    country = country_code.capitalize()
    country_id = country_code.upper()

    start_col = 3
    block_size = 5
    first_party_row = 10
    num_blocks = (df.shape[1] - start_col) // block_size
    reshaped_data = []

    # Compare case-insensitively: the code is normalised with upper() and
    # capitalize() above, so "pdy_LU.xlsx" must be treated like "pdy_lu.xlsx".
    date_row = 2 if country_code.lower() == "lu" else 1

    for block in range(num_blocks):
        base_col = start_col + block * block_size
        # NOTE(review): for the first block this is column 2, the column to
        # the left of the first block - confirm against the workbook layout.
        gov_start_cell = df.iloc[date_row, base_col - 1]

        # Collect the qualifying party rows first so the start date is
        # parsed once per block rather than once per party row.
        party_rows = []
        for row in range(first_party_row, df.shape[0]):
            party_id = df.iloc[row, 0]
            if pd.isna(party_id):
                continue
            # seats, seat share, positions, position share
            block_values = [df.iloc[row, base_col + offset] for offset in range(4)]
            if all(pd.isna(value) for value in block_values):
                continue
            party_rows.append(
                (party_id, df.iloc[row, 1], block_values[2], block_values[3])
            )

        if not party_rows:
            continue

        # errors="coerce" turns an unparseable cell into NaT instead of raising.
        parsed_date = pd.to_datetime(gov_start_cell, errors="coerce")
        year = parsed_date.year if pd.notna(parsed_date) else None

        reshaped_data.extend(
            {
                "country": country,
                "countryID": country_id,
                "gov_start_date": parsed_date,
                "year": year,
                "partyID": party_id,
                "party Name": party_name,
                "cab_pos": positions,
                "share_cab_pos": positions_pct,
            }
            for party_id, party_name, positions, positions_pct in party_rows
        )

    return pd.DataFrame(reshaped_data)

# Workbooks expected in the current working directory. Each name follows the
# pattern pdy_<code>.xlsx; reshape_cabinet_data_flexible derives the country
# code from that pattern.
files = [
    "pdy_dk.xlsx", "pdy_de.xlsx", "pdy_fi.xlsx", "pdy_ie.xlsx", "pdy_is.xlsx",
    "pdy_it.xlsx", "pdy_lu.xlsx", "pdy_nl.xlsx", "pdy_no.xlsx", "pdy_pt.xlsx", "pdy_se.xlsx",
    "pdy_be.xlsx", "pdy_au.xlsx", "pdy_at.xlsx"
]

# Process and combine. Workbooks that are missing, or that yield no rows, are
# reported and left out of the combined frame.
all_dfs = []
for file in files:
    if not os.path.exists(file):
        print(f"File not found: {file}")
        continue
    print(f"Processing {file}")
    df = reshape_cabinet_data_flexible(file)
    if df.empty:
        print(f"No cabinet rows extracted from {file}")
        continue
    all_dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list, and
# the cleaning step below needs the gov_start_date column, so stop here with a
# message that names the actual problem.
if not all_dfs:
    raise ValueError(
        "No cabinet data loaded: none of the expected workbooks "
        f"({', '.join(files)}) was found or contained usable rows."
    )

combined_df = pd.concat(all_dfs, ignore_index=True)

# Clean and transform: drop rows whose start date is missing, then replace
# the provisional country label with the full name looked up from countryID.
combined_df = combined_df.dropna(subset=["gov_start_date"])
combined_df["country"] = combined_df["countryID"].map({
    "DK": "Denmark", "DE": "Germany", "FI": "Finland", "IE": "Ireland",
    "IS": "Iceland", "IT": "Italy", "LU": "Luxembourg", "NL": "Netherlands",
    "NO": "Norway", "PT": "Portugal", "SE": "Sweden",
    "BE": "Belgium", "AU": "Australia", "AT": "Austria"
})

# Save
combined_df.to_excel("final_cleaned_cabinet_data_uppercase.xlsx", index=False)
combined_df.head()