# In [1]:

import pandas as pd
import os

# Lookup table: two-letter country code (as embedded in the workbook file
# names) -> full country name written to the output's "country" column.
country_map = {
    "AT": "Austria",
    "AU": "Australia",
    "BE": "Belgium",
    "DE": "Germany",
    "DK": "Denmark",
    "FI": "Finland",
    "IE": "Ireland",
    "IS": "Iceland",
    "IT": "Italy",
    "LU": "Luxembourg",
    "NL": "Netherlands",
    "NO": "Norway",
    "PT": "Portugal",
    "SE": "Sweden",
}

def _first_party(group, mask):
    """Return the first non-null 'Party' among the rows of `group` selected by `mask`, or None."""
    parties = group.loc[mask, 'Party'].dropna()
    return None if parties.empty else parties.iloc[0]


def _party_flag(party_ids, party):
    """Return a 0/1 indicator marking the entries of `party_ids` equal to `party` (all 0 if `party` is None)."""
    if party is None:
        return 0  # scalar is broadcast over the whole column on assignment
    return party_ids.eq(party).astype(int)


def process_ministers_excel_safe(file_path, country_code):
    """Build per-party cabinet statistics from one country's workbook.

    The 'ministers' sheet is read without a header row and is expected to be
    laid out as consecutive 12-column blocks, one per government, starting at
    column index 4. Row index 1 carries a value (used as the government start
    date) in the first column of each block, row index 9 holds the variable
    names of the block, data rows start at row index 10, and column index 1
    holds the ministry name. The number of blocks is the number of non-null
    cells in row index 1 from column index 4 onwards.

    Processing steps:
      * blocks without a 'Name' variable are ignored; rows without a name
        are dropped;
      * the columns "Data Point", "Incoming Reason", "Outgoing Reason" and
        "Notes" are removed when present;
      * rows whose ministry name is exactly "Deputy Prime Minister" are
        removed;
      * only rows whose 'Start Date' equals the block's government start
        date are kept;
      * per government start date, the number of distinct minister names per
        party and each party's share of that total are computed, and the
        party of the first matching prime minister/chancellor, finance and
        foreign-affairs row is flagged.

    Args:
        file_path: Path of the Excel workbook to read.
        country_code: Code stored in the 'countryID' column; also looked up
            in `country_map` for the 'country' column (falls back to the
            code itself when unknown).

    Returns:
        A DataFrame with the columns partyID, cab_pos, share_cab_pos,
        gov_start_date, year, PM, FM, FAM, countryID and country, or None
        when the workbook cannot be processed (a "Skipping ..." message is
        printed in that case).
    """
    # Sheet layout, as 0-based positions in the header-less frame.
    start_col = 4
    block_size = 12
    row_gov_dates = 1
    row_variable_names = 9
    row_data_start = 10
    col_ministry_name = 1

    try:
        # Context manager so the workbook handle is released even on failure.
        with pd.ExcelFile(file_path) as xls:
            df_ministers = xls.parse('ministers', header=None)

        gov_dates_row = df_ministers.iloc[row_gov_dates, start_col:]
        gov_blocks = [start_col + i * block_size for i in range(gov_dates_row.count())]

        output_data = []
        for start in gov_blocks:
            block_cols = [col_ministry_name] + list(range(start, start + block_size))
            block = df_ministers.iloc[row_data_start:, block_cols].copy()
            block.columns = ['ministryname_english'] + df_ministers.iloc[row_variable_names, start:start + block_size].tolist()
            block['gov_start_date'] = df_ministers.iloc[row_gov_dates, start]
            if 'Name' not in block.columns:
                continue
            output_data.append(block[block['Name'].notna()])

        if not output_data:
            # Report explicitly instead of letting pd.concat([]) raise an opaque error.
            print(f"Skipping {file_path} due to error: no government block with a 'Name' column found")
            return None

        df = pd.concat(output_data, axis=0).reset_index(drop=True)
        df = df.drop(columns=["Data Point", "Incoming Reason", "Outgoing Reason", "Notes"], errors='ignore')
        df = df[df["ministryname_english"] != "Deputy Prime Minister"]
        df['gov_start_date'] = pd.to_datetime(df['gov_start_date'], errors='coerce')
        df['Start Date'] = pd.to_datetime(df['Start Date'], errors='coerce')
        # Keep only ministers appointed on the government's own start date
        # (rows with unparseable dates drop out because NaT never compares equal).
        df = df[df['gov_start_date'] == df['Start Date']].copy()

        enhanced_rows = []
        for date, group in df.groupby('gov_start_date'):
            valid_ministers = group.dropna(subset=['Party', 'Name'])
            party_counts = valid_ministers.groupby('Party')['Name'].nunique().reset_index()
            party_counts.columns = ['partyID', 'cab_pos']
            party_counts['share_cab_pos'] = party_counts['cab_pos'] / party_counts['cab_pos'].sum()
            party_counts['gov_start_date'] = date
            party_counts['year'] = date.year

            ministry = group['ministryname_english']
            pm_party = _first_party(group, ministry.str.contains("prime minister|chancellor", case=False, na=False))
            fm_party = _first_party(group, ministry.str.contains("finance", case=False, na=False))
            fam_party = _first_party(
                group,
                ministry.str.contains("foreign", case=False, na=False)
                & ministry.str.contains("affairs", case=False, na=False),
            )

            party_counts['PM'] = _party_flag(party_counts['partyID'], pm_party)
            party_counts['FM'] = _party_flag(party_counts['partyID'], fm_party)
            party_counts['FAM'] = _party_flag(party_counts['partyID'], fam_party)

            party_counts['countryID'] = country_code
            party_counts['country'] = country_map.get(country_code, country_code)

            enhanced_rows.append(party_counts)

        if not enhanced_rows:
            print(f"Skipping {file_path} due to error: no minister rows start on a government start date")
            return None

        return pd.concat(enhanced_rows, ignore_index=True)

    except Exception as e:
        # Deliberate best-effort boundary: one unreadable workbook must not
        # abort the whole batch, so report it and let the caller move on.
        print(f"Skipping {file_path} due to error: {e}")
        return None

# Workbooks to process; the country code is derived from each file name.
file_names = [
    "pdy_at.xlsx", "pdy_au.xlsx", "pdy_be.xlsx", "pdy_de.xlsx", "pdy_fi.xlsx",
    "pdy_ie.xlsx", "pdy_is.xlsx", "pdy_it.xlsx", "pdy_lu.xlsx", "pdy_nl.xlsx",
    "pdy_no.xlsx", "pdy_pt.xlsx", "pdy_se.xlsx", "pdy_dk.xlsx"
]

data_folder = "."
uploaded_files = os.listdir(data_folder)
country_frames = {}

# First pass: process every listed workbook that is actually present in
# data_folder. Files that are missing, or that the processor could not
# handle (it returns None), are left out.
for file_base in file_names:
    if file_base not in uploaded_files:
        continue
    # "pdy_at.xlsx" -> "AT"; a trailing " (1)" from a duplicate download is stripped too.
    file_code = file_base.replace("pdy_", "").replace(".xlsx", "").replace(" (1)", "").upper()
    full_path = os.path.join(data_folder, file_base)
    df = process_ministers_excel_safe(full_path, file_code)
    if df is not None:
        country_frames[file_base] = df

# Standardize and merge.
# Union of all columns in first-seen order: a plain set would give a
# different column order in the output file from one run to the next.
all_columns = list(dict.fromkeys(
    col for frame in country_frames.values() for col in frame.columns
))

aligned_frames = []
for frame in country_frames.values():
    aligned = frame.copy()  # leave the per-country frames in country_frames untouched
    for col in all_columns:
        if col not in aligned.columns:
            aligned[col] = None
    aligned_frames.append(aligned[all_columns].astype("object"))

# Concatenate once instead of growing the frame inside the loop.
combined_df = pd.concat(aligned_frames, ignore_index=True) if aligned_frames else pd.DataFrame()

# Save output
combined_df.to_excel("Ministry_with_labels.xlsx", index=False)

  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
