In [2]:
import os
import glob
import pandas as pd

def detect_header_rows(filepath, sheet_name, anchor=None, keywords=None, lookahead=15):
    """
    Detects header rows in an Excel sheet using either:
    - an exact anchor string (preferred), or
    - a list of keywords (fallback).
    """
    preview = pd.read_excel(filepath, sheet_name=sheet_name, nrows=lookahead, header=None)
    target_row = None

    for i, row in preview.iterrows():
        for cell in row:
            if pd.isna(cell):
                continue
            cell_str = str(cell).lower()

            if anchor and anchor.lower() in cell_str:
                target_row = i
                break

            if keywords and any(k.lower() in cell_str for k in keywords):
                target_row = i
                break

        if target_row is not None:
            break

    if target_row is None:
        raise ValueError("No header row detected with provided anchor or keywords.")

    return list(range(target_row, target_row + 2))  # assume 2-row header


def read_excel_with_detected_header(filepath, sheet_name, anchor=None, keywords=None, flatten=True):
    """
    Reads Excel with auto-detected header rows using an anchor or keywords.
    Cleans multi-index headers if needed.
    """
    header_rows = detect_header_rows(filepath, sheet_name, anchor=anchor, keywords=keywords)
    df = pd.read_excel(filepath, sheet_name=sheet_name, header=header_rows)

    if flatten and isinstance(df.columns, pd.MultiIndex):
        def clean_column(col):
            parts = [str(part).strip() for part in col if pd.notna(part)]
            keep_parts = [p for p in parts if any(unit in p for unit in 
                          ['SINK CATEGORIES', 'CO2', 'CH4', 'N2O', 'SF6', 
                           'HFC', 'PFC', '(kt)', 'NF', 'NO', 'NMVOC', 'CO', 'SO'])]
            return ' '.join(keep_parts).strip()

        df.columns = [clean_column(col) for col in df.columns]

    return df

def extract_year_from_filename(filename):
    """
    Extracts year from filename following pattern like 'GBR-CRT-2025-V0.6-1990-20250415-091720.xlsx'
    Returns: (country_code, year)
    """
    # Remove the file extension and split by '-'
    basename = os.path.splitext(filename)[0]  
    parts = basename.split('-')
    try:
        if len(parts) >= 5:
            country_code = parts[0]
            year = int(parts[4])  # This is the "1990" year
            return country_code, year
        else:
            raise ValueError("Filename format is incorrect")
    except (IndexError, ValueError) as e:
        print(f"Filename parsing error: {filename} -> {e}")
        return None, None
 
    
def normalise_category(category):
    """Normalise category names by removing extra spaces or footnotes
    MAINLY FOR AUTSRALIA"""
    if isinstance(category, str):
        #  
        category = ' '.join(category.split())
        # Remove footnote numbers by splitting on '(' and taking first part
        category = category.split('(')[0].strip()
    return category


In [174]:
filepath = "data/crt/United Kingdom/GBR-CRT-2025-V0.6-1990-20250415-091720_started.xlsx"
daf = read_excel_with_detected_header(filepath, 'Summary2', anchor="GREENHOUSE GAS SOURCE", flatten=True)
daf

Unnamed: 0,Unnamed: 1,GREENHOUSE GAS SOURCE AND SINK CATEGORIES,CO2 (1) CO2 equivalents (kt ) (2),CH4,N2O,HFCs,PFCs,Unspecified mix of HFCs and PFCs,SF6,NF3,Unnamed: 11
0,,Total (net emissions) (1),606725.197571,149040.613915,44262.632074,12061.124964,1483.661309,"NA,NO",1242.919831,0.109854,814816.259517
1,,1. Energy,557802.468901,41864.604382,3171.741143,,,,,,602838.814426
2,,1.A. Fuel combustion,551015.399292,3596.329509,3131.922545,,,,,,557743.651346
3,,1.A.1. Energy industries,238063.887901,244.717063,1277.606964,,,,,,239586.211929
4,,1.A.2. Manufacturing industries and construction,72657.489994,96.833024,222.3663,,,,,,72976.689317
...,...,...,...,...,...,...,...,...,...,...,...
66,,"(2) As per decision 18/CMA.1, annex, para. 37,...",,,,,,,,,
67,,(3) Parties are asked to report emissions fro...,,,,,,,,,
68,,"(4) In accordance with the modalities, proced...",,,,,,,,,
69,,,,,,,,,,,


In [None]:
memo_items = df[category_col].astype(str).str.startswith('Memo items: (8)')
memo_items_df = df[memo_items]

df.drop(memo_items_df)

    # Process memo items
    if memo_mask.any():
        memo_df = df[memo_mask].copy()
        memo_df['Level'] = 'Memo'

In [3]:
def get_category(category_str):
    category_str = str(category_str).strip()

    # Special case for total and memo items header
    if category_str.startswith('Total'):
        return {
            'sector': None,
            'subsector': None,
            'sub_subsector': None,
            'sub_sub_subsector': None,
            'label': category_str,
            'is_memo': False
        }
    
    # Check if this is a memo item
    is_memo = (category_str.startswith('Memo items:') or 
               category_str.startswith('1.D.') or 
               category_str.startswith('5.F.') or 
               category_str.startswith('Indirect N2O') or 
               category_str.startswith('Indirect CO2'))

    try:
        # Try to split on space, if that fails, use the whole string
        split_parts = category_str.split(" ", 1)
        if len(split_parts) == 2:
            level, label = split_parts
        else:
            # If no space found, try to find where numbers/dots end
            for i, character in enumerate(category_str):
                if not (character.isdigit() or character == '.' or character == 'A'):
                    level = category_str[:i].strip()
                    label = category_str[i:].strip()
                    break
            else:
                level = category_str
                label = category_str
    
        parts = level.strip('.').split(".")
        sector = parts[0] if len(parts) > 0 else None
        subsector = parts[1] if len(parts) > 1 else None
        sub_subsector = parts[2] if len(parts) > 2 else None
        sub_sub_subsector = parts[3] if len(parts) > 3 else None

        return {
            'sector': sector,
            'subsector': subsector,
            'sub_subsector': sub_subsector,
            'sub_sub_subsector': sub_sub_subsector,
            'label': label.strip(),
            'is_memo': is_memo
        }
    except Exception as e:
        # If anything goes wrong, return a default structure
        return {
            'sector': None,
            'subsector': None,
            'sub_subsector': None,
            'sub_sub_subsector': None,
            'label': category_str,
            'is_memo': is_memo
        }



In [4]:
def save_gas_level_csv(df, gas_col, output_path, index_col='Year', column_col='Label', value_col=None):
    """
    Converts the dataframe into year x sector format and saves to CSV.
    """
    if value_col is None:
        value_col = gas_col

    if gas_col not in df.columns:
        print(f"Skipping {gas_col} — column not found.")
        return

    # Get the gas type from the gas_col
    gas_type = gas_col.split()[0].lower()  # e.g., 'co2', 'ch4', etc.
    
    # Modify the output path to include the gas type in the filename
    output_dir = os.path.dirname(output_path)
    filename = os.path.basename(output_path)
    filename_without_ext = os.path.splitext(filename)[0]
    new_filename = f"{filename_without_ext}_{gas_type}.parquer"
    new_output_path = os.path.join(output_dir, new_filename)

    pivot_df = df[[index_col, column_col, gas_col]].pivot_table(
        index=index_col, columns=column_col, values=value_col, aggfunc='first'
    ).sort_index()

    pivot_df.to_parquet(new_output_path)

In [5]:
def save_gas_level_parquet(df, gas_col, output_path, index_col='Year', column_col='Label', value_col=None):
    """
    Converts the dataframe into year x sector format and saves to Parquet.
    """
    if value_col is None:
        value_col = gas_col

    if gas_col not in df.columns:
        print(f"Skipping {gas_col} — column not found.")
        return

    # FIXED: Handle Unicode subscripts properly
    gas_name = gas_col.split()[0].lower()
    
    # Convert Unicode subscripts back to regular characters for file naming
    gas_type_mapping = {
        'co₂': 'co2',
        'ch₄': 'ch4', 
        'n₂o': 'n2o',
        'sf₆': 'sf6',
        'nf₃': 'nf3'
    }
    
    gas_type = gas_type_mapping.get(gas_name, gas_name)  # Use mapping or original if not found
    
    # Modify the output path to include the gas type in the filename
    output_dir = os.path.dirname(output_path)
    filename = os.path.basename(output_path)
    filename_without_ext = os.path.splitext(filename)[0]
    new_filename = f"{filename_without_ext}_{gas_type}.parquet"
    new_output_path = os.path.join(output_dir, new_filename)

    # Ensure directory exists
    os.makedirs(output_dir, exist_ok=True)

    pivot_df = df[[index_col, column_col, gas_col]].pivot_table(
        index=index_col, columns=column_col, values=value_col, aggfunc='first'
    ).sort_index()

    pivot_df.to_parquet(new_output_path)
    print(f"Saved {gas_type} data to {new_output_path}")


In [6]:
def process_hierarchical_data(df):
    category_col = None
    for col in df.columns:
        if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
            category_col = col
            break
    if not category_col:
        raise ValueError("Category column not found")

    sectors, subsectors, sub_subsectors, labels, levels, is_memo = [], [], [], [],[],[]
    
    memo_start_idx = None
    for idx, value in enumerate(df[category_col]):
        if 'Memo items:' in str(value):
            memo_start_idx = idx
            break
    
    for cat_str in df[category_col].astype(str):
        categories = get_category(cat_str)

        sectors.append(categories['sector'])
        subsectors.append(categories['subsector'])
        sub_subsectors.append(categories['sub_subsector'])
        labels.append(categories['label'])
        is_memo.append(categories['is_memo'])

        if categories['label'].startswith('Total'):
            levels.append('Total')
        elif categories['sector'] and not categories['subsector']:
            levels.append('Sector')
        elif categories['subsector'] and not categories['sub_subsector']:
            levels.append('Subsector')
        elif categories['sub_subsector']:
            levels.append('Sub-subsector')
        else:
            levels.append('Unknown')

    df = df.copy()
    df['Sector'] = sectors
    df['Subsector'] = subsectors
    df['Sub_subsector'] = sub_subsectors
    df['Label'] = labels
    df['Level'] = levels
    df['Is_Memo'] = is_memo

    main_df = df[~df['Is_Memo']]
    memo_df = df[df['Is_Memo']]
                    
    total_df = main_df[main_df['Level'] == 'Total']
    sector_df = main_df[main_df['Level'] == 'Sector']
    subsector_df = main_df[main_df['Level'] == 'Subsector']
    sub_subsector_df = main_df[main_df['Level'] == 'Sub-subsector']

    # Processing Summary
    print(f"\nProcessing Summary:")
    print(f"Found {0 if total_df.empty else len(total_df)} total rows")
    print(f"Found {0 if sector_df.empty else len(sector_df)} sector rows")
    print(f"Found {0 if subsector_df.empty else len(subsector_df)} subsector rows")
    print(f"Found {0 if sub_subsector_df.empty else len(sub_subsector_df)} sub-subsector rows")
    print(f"Found {0 if memo_df.empty else len(memo_df)} memo items")

    # Detailed breakdown
    print("\nDetailed breakdown:")
    
    if not total_df.empty:
        print("\nTotal categories:")
        print(total_df[category_col].tolist())
    
    if not sector_df.empty:
        print("\nSectors:")
        print(sector_df[category_col].tolist())
    
    if not subsector_df.empty:
        # Group subsectors by their parent sector
        for sector_num in subsector_df['Sector'].unique():
            sector_subsectors = subsector_df[subsector_df['Sector'] == sector_num]
            if not sector_subsectors.empty:
                print(f"\nSubsectors for Sector {sector_num}:")
                print(sector_subsectors[category_col].tolist())
    
    if not sub_subsector_df.empty:
        # Group sub-subsectors by their parent sector and subsector
        for sector_num in sub_subsector_df['Sector'].unique():
            for subsector_code in sub_subsector_df[sub_subsector_df['Sector'] == sector_num]['Subsector'].unique():
                detailed = sub_subsector_df[
                    (sub_subsector_df['Sector'] == sector_num) & 
                    (sub_subsector_df['Subsector'] == subsector_code)
                ]
                if not detailed.empty:
                    print(f"\nDetailed breakdowns for {sector_num}.{subsector_code}:")
                    print(detailed[category_col].tolist())

    if not memo_df.empty:
        print("\nMemo Items:")
        print(memo_df[category_col].tolist())

    return total_df, sector_df, subsector_df, sub_subsector_df, memo_df



In [16]:
'x\u2082'
co2 = 'CO\u2082'
co2

'CO₂'

In [None]:
def process_summary_sheet(sheet_name, folder_path, output_folder):
    """
    Process a specific summary sheet from all Excel files in the folder
    """
    print(f"Processing {sheet_name}...")
    country_name = os.path.basename(folder_path).lower()

    # Define gas types and their folders
    GAS_STANDARD_NAMES = {
        'CO2 (1) CO2 equivalents (kt ) (2)': 'CO\u2082 (kt)',
        'CH4': 'CH\u2084 (kt)',
        'N2O': 'N\u2082O (kt)',
        'HFCs': 'HFCs (kt)',
        'PFCs': 'PFCs (kt)',
        'Unspecified mix of HFCs and PFCs': 'HFC+PFC Mix (kt)',
        'SF6': 'SF\u2086 (kt)',
        'NF3': 'NF\u2083 (kt)',
    }

    # Create base directories for each gas type and level
    levels = ['total', 'sectors', 'subsectors', 'sub_subsectors', 'memo_items']
    for gas in GAS_STANDARD_NAMES.values():
        gas_type = gas.split()[0].lower()
        for level in levels:
            os.makedirs(os.path.join(output_folder, country_name, gas_type, level), exist_ok=True)

    for filepath in glob.glob(os.path.join(folder_path, "*.xlsx")):
        try:
            # Read the specific summary sheet
            df = read_excel_with_detected_header(filepath, sheet_name, anchor='GREENHOUSE GAS SOURCE AND', flatten=True)

            new_columns = []
            for col in df.columns:
                col_clean = col.strip()
                if col_clean in GAS_STANDARD_NAMES:
                    new_columns.append(GAS_STANDARD_NAMES[col_clean])
                else:
                    new_columns.append(col)
            df.columns = new_columns


            # Find the category column
            category_col = None
            for col in df.columns:
                if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
                    category_col = col
                    break
            
            if not category_col:
                raise ValueError("Category column not found")

            # Keep the category column
            categories = df[category_col]

            # Convert other columns to numeric, coercing errors to NaN
            numeric_df = df.drop(columns=[category_col]).apply(pd.to_numeric, errors='coerce')

            # Recombine the category column back
            df = pd.concat([categories, numeric_df], axis=1)

            # Drop rows that are completely empty but keep the category column
            df = df.dropna(axis=0, how='all', subset=numeric_df.columns)
            
            # Drop columns that are completely empty
            df = df.dropna(axis=1, how='all')

            # Define years and GHG keywords
            years_table10s6 = [str(year) for year in range(1990, 2023+1)]
            ghg_keywords = ['CO2', 'CH4', 'N2O', 'SF6', 'HFC', 'PFC',
                          'Base year (1)', 'Change from base to latest'] + years_table10s6

            # Keep category column containing any GHG keywords
            cols_to_keep = [category_col] + [col for col in df.columns 
                                           if any(k in col for k in ghg_keywords)]

            df = df[cols_to_keep]
            
            # Extract year
            country_code, year = extract_year_from_filename(os.path.basename(filepath))
            if not country_code or not year:
                print(f"Skipping {filepath}: Could not extract country code or year")
                continue
            
            # Add sheet name to DataFrame
            df['Sheet'] = sheet_name

            # Create directories using country name from folder
            country_output = os.path.join(output_folder, country_name)
            os.makedirs(os.path.join(country_output, 'total'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'sectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'subsectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'sub_subsectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'memo_items'), exist_ok=True)

            df['Country'] = country_name.upper()
            df['Year'] = year
            
            # Process into hierarchical structure
            total_df, sector_df, subsector_df, sub_subsector_df, memo_df = process_hierarchical_data(df)
             # Create filename with sheet name and year
            base_filename = f"{country_code.lower()}_{sheet_name}_{year}"
            
             # Save files for each gas type separately
            for original_name, standardized_gas in GAS_STANDARD_NAMES.items():
                gas_type = standardized_gas.split()[0].lower()  # e.g., 'co2', 'ch4', etc.
                
                # Define paths for each level
                level_paths = {
                    'total': os.path.join(output_folder, country_name, gas_type, 'total', f"{base_filename}_total.parquet"),
                    'sectors': os.path.join(output_folder, country_name, gas_type, 'sectors', f"{base_filename}_sectors.parquet"),
                    'subsectors': os.path.join(output_folder, country_name, gas_type, 'subsectors', f"{base_filename}_subsectors.parquet"),
                    'sub_subsectors': os.path.join(output_folder, country_name, gas_type, 'sub_subsectors', f"{base_filename}_sub_subsectors.parquet"),
                    'memo_items': os.path.join(output_folder, country_name, gas_type, 'memo_items', f"{base_filename}_memo_items.parquet")
                }

                # Save each level for this gas type
                save_gas_level_csv(total_df, standardized_gas, level_paths['total'])
                save_gas_level_csv(sector_df, standardized_gas, level_paths['sectors'])
                save_gas_level_csv(subsector_df, standardized_gas, level_paths['subsectors'])
                save_gas_level_csv(sub_subsector_df, standardized_gas, level_paths['sub_subsectors'])
                save_gas_level_csv(memo_df, standardized_gas, level_paths['memo_items'])

            print(f"Processed {sheet_name} for year {year}")
            
        except Exception as e:
            print(f"Error processing {sheet_name} in {filepath}: {e}")


            for standardized_gas in GAS_STANDARD_NAMES.values():
                gas_folder = os.path.join(country_output, standardized_gas.split()[0].lower())
                os.makedirs(gas_folder, exist_ok=True)

                save_gas_level_csv(total_df, standardized_gas, os.path.join(gas_folder, f"{base_filename}_total.parquet"))
                save_gas_level_csv(sector_df, standardized_gas, os.path.join(gas_folder, f"{base_filename}_sectors.parquet"))
                save_gas_level_csv(subsector_df, standardized_gas, os.path.join(gas_folder, f"{base_filename}_subsectors.parquet"))
                save_gas_level_csv(sub_subsector_df, standardized_gas, os.path.join(gas_folder, f"{base_filename}_sub_subsectors.parquet"))
                save_gas_level_csv(memo_df, standardized_gas, os.path.join(gas_folder, f"{base_filename}_memo_items.parquet"))
    
    # After processing all files, combine years for each gas type and level
    for gas in GAS_STANDARD_NAMES.values():
        gas_type = gas.split()[0].lower()
        for level in levels:
            level_path = os.path.join(output_folder, country_name, gas_type, level)
            csv_files = glob.glob(os.path.join(level_path, f"*_{gas_type}.parquet"))
            
            if csv_files:
                # Combine all years for this gas type and level
                combined_df = pd.concat([pd.read_csv(f) for f in csv_files])
                combined_path = os.path.join(level_path, f"{country_name}_{level}_{gas_type}_combined.parquety")
                combined_df.to_parquet(combined_path, index=False)
                
                # Optionally remove individual year files
                for f in csv_files:
                    os.remove(f)

    print(f"Completed processing {sheet_name} for {country_name}")


In [None]:
###both versions made here --nonsubsript
def process_summary_sheet(sheet_name, folder_path, output_folder):
    """
    Process a specific summary sheet from all Excel files in the folder
    """
    print(f"Processing {sheet_name}...")
    country_name = os.path.basename(folder_path)

    # Define gas types and their folders
    GAS_STANDARD_NAMES = {
        'CO2 (1) CO2 equivalents (kt ) (2)': 'CO2 (kt)',
        'CH4': 'CH4 (kt)',
        'N2O': 'N2O (kt)',
        'HFCs': 'HFCs (kt)',
        'PFCs': 'PFCs (kt)',
        'Unspecified mix of HFCs and PFCs': 'HFC+PFC Mix (kt)',
        'SF6': 'SF6 (kt)',
        'NF3': 'NF3 (kt)',
    }
    # Create base directories for both approaches
    levels = ['total', 'sectors', 'subsectors', 'sub_subsectors', 'memo_items']
    
    # Create directories for combined approach
    country_output = os.path.join(output_folder, country_name)
    for level in levels:
        os.makedirs(os.path.join(country_output, level), exist_ok=True)

    # Create directories for species-specific approach
    for gas in GAS_STANDARD_NAMES.values():
        gas_type = gas.split()[0].lower()
        for level in levels:
            os.makedirs(os.path.join(output_folder, country_name, gas_type, level), exist_ok=True)
    
    for filepath in glob.glob(os.path.join(folder_path, "*.xlsx")):
        try:
            # Read the specific summary sheet
            df = read_excel_with_detected_header(filepath, sheet_name, anchor='GREENHOUSE GAS SOURCE AND', flatten=True)

            new_columns = []
            for col in df.columns:
                col_clean = col.strip()
                if col_clean in GAS_STANDARD_NAMES:
                    new_columns.append(GAS_STANDARD_NAMES[col_clean])
                else:
                    new_columns.append(col)
            df.columns = new_columns


            # Find the category column
            category_col = None
            for col in df.columns:
                if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
                    category_col = col
                    break
            
            if not category_col:
                raise ValueError("Category column not found")

            # Keep the category column
            categories = df[category_col]

            # Convert other columns to numeric, coercing errors to NaN
            numeric_df = df.drop(columns=[category_col]).apply(pd.to_numeric, errors='coerce')

            # Recombine the category column back
            df = pd.concat([categories, numeric_df], axis=1)

            # Drop rows that are completely empty but keep the category column
            df = df.dropna(axis=0, how='all', subset=numeric_df.columns)
            
            # Drop columns that are completely empty
            df = df.dropna(axis=1, how='all')

            # Define years and GHG keywords
            years_table10s6 = [str(year) for year in range(1990, 2023+1)]
            ghg_keywords = ['CO2', 'CH4', 'N2O', 'SF6', 'HFC', 'PFC',
                          'Base year (1)', 'Change from base to latest'] + years_table10s6

            # Keep category column containing any GHG keywords
            cols_to_keep = [category_col] + [col for col in df.columns 
                                           if any(k in col for k in ghg_keywords)]

            df = df[cols_to_keep]
            
            # Extract year
            country_code, year = extract_year_from_filename(os.path.basename(filepath))
            if not country_code or not year:
                print(f"Skipping {filepath}: Could not extract country code or year")
                continue
            
            # Add sheet name to DataFrame
            df['Sheet'] = sheet_name

            # Create directories using country name from folder
            country_output = os.path.join(output_folder, country_name)
            os.makedirs(os.path.join(country_output, 'total'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'sectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'subsectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'sub_subsectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'memo_items'), exist_ok=True)

            df['Country'] = country_name.upper()
            df['Year'] = year
            
            # Process into hierarchical structure
            total_df, sector_df, subsector_df, sub_subsector_df, memo_df = process_hierarchical_data(df)
             # Create filename with sheet name and year
            base_filename = f"{country_code.lower()}_{sheet_name}_{year}"

            #option: Save combined files (original)
            level_dfs = {
                'total': total_df,
                'sectors': sector_df,
                'subsectors': subsector_df,
                'sub_subsectors': sub_subsector_df,
                'memo_items': memo_df
            }

            for level, df_to_save in level_dfs.items():
                if not df_to_save.empty:
                    output_path = os.path.join(
                        country_output,
                        level,
                        f"{base_filename}_{level}.parquet"
                    )
                    df_to_save.to_parquet(output_path, index=False)

            # option 2: Save species-specific files
            for original_name, standardized_gas in GAS_STANDARD_NAMES.items():
                gas_type = standardized_gas.split()[0].lower()
                
                # Define paths for each level
                level_paths = {
                    'total': os.path.join(output_folder, country_name, gas_type, 'total', f"{base_filename}_total.parquet"),
                    'sectors': os.path.join(output_folder, country_name, gas_type, 'sectors', f"{base_filename}_sectors.parquet"),
                    'subsectors': os.path.join(output_folder, country_name, gas_type, 'subsectors', f"{base_filename}_subsectors.parquet"),
                    'sub_subsectors': os.path.join(output_folder, country_name, gas_type, 'sub_subsectors', f"{base_filename}_sub_subsectors.parquet"),
                    'memo_items': os.path.join(output_folder, country_name, gas_type, 'memo_items', f"{base_filename}_memo_items.parquet")
                }

                # Save each level for this gas type
                save_gas_level_csv(total_df, standardized_gas, level_paths['total'])
                save_gas_level_csv(sector_df, standardized_gas, level_paths['sectors'])
                save_gas_level_csv(subsector_df, standardized_gas, level_paths['subsectors'])
                save_gas_level_csv(sub_subsector_df, standardized_gas, level_paths['sub_subsectors'])
                save_gas_level_csv(memo_df, standardized_gas, level_paths['memo_items'])

            print(f"Processed {sheet_name} for year {year}")
            
        except Exception as e:
            print(f"Error processing {sheet_name} in {filepath}: {e}")

    # After processing all files, combine years for both approaches
    
    # Combine files for original 
    for level in levels:
        level_path = os.path.join(country_output, level)
        csv_files = glob.glob(os.path.join(level_path, "*.parquet"))
        if csv_files:
            combined_df = pd.concat([pd.read_parquet(f) for f in csv_files])
            combined_path = os.path.join(level_path, f"{country_name}_{level}_combined.parquet")
            combined_df.to_parquet(combined_path, index=False)
            
            # Optionally remove individual year files
            for f in csv_files:
                os.remove(f)

    # Combine files for species-specific 
    for gas in GAS_STANDARD_NAMES.values():
        gas_type = gas.split()[0].lower()
        for level in levels:
            level_path = os.path.join(output_folder, country_name, gas_type, level)
            csv_files = glob.glob(os.path.join(level_path, f"*_{gas_type}.parquet"))
            
            if csv_files:
                combined_df = pd.concat([pd.read_csv(f) for f in csv_files])
                combined_path = os.path.join(level_path, f"{country_name}_{level}_{gas_type}_combined.parquet")
                combined_df.to_parquet(combined_path, index=False)
                
                # Optionally remove individual year files
                for f in csv_files:
                    os.remove(f)

    print(f"Completed processing {sheet_name} for {country_name}")


In [11]:
def process_summary_sheet_subs(sheet_name, folder_path, output_folder):
    """
    Process a specific summary sheet from all Excel files in the folder
    """
    print(f"Processing {sheet_name}...")
    
    country_name = os.path.basename(folder_path)

    # Define gas types with Unicode subscripts for better display
    GAS_STANDARD_NAMES = {
        'CO2 (1) CO2 equivalents (kt ) (2)': 'CO\u2082 (kt)',
        'CO2': 'CO\u2082 (kt)',
        'CH4': 'CH\u2084 (kt)',
        'N2O': 'N\u2082O (kt)',
        'HFCs': 'HFCs (kt)',
        'PFCs': 'PFCs (kt)',
        'Unspecified mix of HFCs and PFCs': 'HFC+PFC Mix (kt)',
        'SF6': 'SF\u2086 (kt)',
        'NF3': 'NF\u2083 (kt)',
    }

    levels = ['total', 'sectors', 'subsectors', 'sub_subsectors', 'memo_items']
    
    # ONLY create main country directories (no gas-specific subdirectories)
    country_output = os.path.join(output_folder, country_name)
    for level in levels:
        os.makedirs(os.path.join(country_output, level), exist_ok=True)
    
    for filepath in glob.glob(os.path.join(folder_path, "*.xlsx")):
        try:
            # Read and process data
            df = read_excel_with_detected_header(filepath, sheet_name, anchor='GREENHOUSE GAS SOURCE AND', flatten=True)

            # Apply Unicode subscript column names
            new_columns = []
            for col in df.columns:
                col_clean = col.strip()
                if col_clean in GAS_STANDARD_NAMES:
                    new_columns.append(GAS_STANDARD_NAMES[col_clean])
                else:
                    new_columns.append(col)
            df.columns = new_columns

            # Find the category column
            category_col = None
            for col in df.columns:
                if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
                    category_col = col
                    break
            
            if not category_col:
                raise ValueError("Category column not found")

            # Keep the category column
            categories = df[category_col]

            # Convert other columns to numeric, coercing errors to NaN
            numeric_df = df.drop(columns=[category_col]).apply(pd.to_numeric, errors='coerce')

            # Recombine the category column back
            df = pd.concat([categories, numeric_df], axis=1)

            # Drop rows that are completely empty but keep the category column
            df = df.dropna(axis=0, how='all', subset=numeric_df.columns)
            
            # Drop columns that are completely empty
            df = df.dropna(axis=1, how='all')

            # FIXED: Define years and GHG keywords using the NEW COLUMN NAMES (with subscripts)
            years_table10s6 = [str(year) for year in range(1990, 2023+1)]
            ghg_keywords = ['CO\u2082', 'CH\u2084', 'N\u2082O', 'SF\u2086', 'HFC', 'PFC', 'NF\u2083',
                          'Base year (1)', 'Change from base to latest'] + years_table10s6

            # Keep category column containing any GHG keywords
            cols_to_keep = [category_col] + [col for col in df.columns 
                                           if any(k in col for k in ghg_keywords)]

            df = df[cols_to_keep]
            
            # Extract year
            country_code, year = extract_year_from_filename(os.path.basename(filepath))
            if not country_code or not year:
                print(f"Skipping {filepath}: Could not extract country code or year")
                continue
            
            # Add sheet name to DataFrame
            df['Sheet'] = sheet_name
            df['Country'] = country_name.upper()
            df['Year'] = year
            
            # Process into hierarchical structure
            total_df, sector_df, subsector_df, sub_subsector_df, memo_df = process_hierarchical_data(df)
            
            # Create filename with sheet name and year
            base_filename = f"{country_code.lower()}_{sheet_name}_{year}"

            # Option 1: Save combined files as Parquet
            level_dfs = {
                'total': total_df,
                'sectors': sector_df,
                'subsectors': subsector_df,
                'sub_subsectors': sub_subsector_df,
                'memo_items': memo_df
            }

            for level, df_to_save in level_dfs.items():
                if not df_to_save.empty:
                    output_path = os.path.join(
                        country_output,
                        level,
                        f"{base_filename}_{level}.parquet"
                    )
                    df_to_save.to_parquet(output_path, index=False)

            # Option 2: Save species-specific files as Parquet
            for original_name, standardized_gas in GAS_STANDARD_NAMES.items():
                gas_type = standardized_gas.split()[0].lower()
                
                # Define paths for each level
                level_paths = {
                    'total': os.path.join(output_folder, country_name, gas_type, 'total', f"{base_filename}_total.parquet"),
                    'sectors': os.path.join(output_folder, country_name, gas_type, 'sectors', f"{base_filename}_sectors.parquet"),
                    'subsectors': os.path.join(output_folder, country_name, gas_type, 'subsectors', f"{base_filename}_subsectors.parquet"),
                    'sub_subsectors': os.path.join(output_folder, country_name, gas_type, 'sub_subsectors', f"{base_filename}_sub_subsectors.parquet"),
                    'memo_items': os.path.join(output_folder, country_name, gas_type, 'memo_items', f"{base_filename}_memo_items.parquet")
                }

                # Save each level for this gas type
                save_gas_level_parquet(total_df, standardized_gas, level_paths['total'])
                save_gas_level_parquet(sector_df, standardized_gas, level_paths['sectors'])
                save_gas_level_parquet(subsector_df, standardized_gas, level_paths['subsectors'])
                save_gas_level_parquet(sub_subsector_df, standardized_gas, level_paths['sub_subsectors'])
                save_gas_level_parquet(memo_df, standardized_gas, level_paths['memo_items'])

            print(f"Processed {sheet_name} for year {year}")
            
        except Exception as e:
            print(f"Error processing {sheet_name} in {filepath}: {e}")

    # After processing all files, combine years for both approaches
    
    # Combine files for original approach
    for level in levels:
        level_path = os.path.join(country_output, level)
        parquet_files = glob.glob(os.path.join(level_path, "*.parquet"))
        if parquet_files:
            combined_df = pd.concat([pd.read_parquet(f) for f in parquet_files])
            combined_path = os.path.join(level_path, f"{country_name}_{level}_combined.parquet")
            combined_df.to_parquet(combined_path, index=False)
            
            # Optionally remove individual year files
            for f in parquet_files:
                os.remove(f)

    # Combine files for species-specific approach
    for gas in GAS_STANDARD_NAMES.values():
        gas_type = gas.split()[0].lower()
        for level in levels:
            level_path = os.path.join(output_folder, country_name, gas_type, level)
            parquet_files = glob.glob(os.path.join(level_path, f"*_{gas_type}.parquet"))
            
            if parquet_files:
                combined_df = pd.concat([pd.read_parquet(f) for f in parquet_files])
                combined_path = os.path.join(level_path, f"{country_name}_{level}_{gas_type}_combined.parquet")
                combined_df.to_parquet(combined_path, index=False)
                
                # Optionally remove individual year files
                for f in parquet_files:
                    os.remove(f)

    print(f"Completed processing {sheet_name} for {country_name}")


In [51]:
# Main execution
folder_path = "data/crt/Spain"
output_folder = "data/processed_data"
summary1_as1 = process_summary_sheet_subs("Summary2", folder_path, output_folder)

Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 28 subsector rows
Found 7 sub-subsector rows
Found 4 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use', '2.G.  Other product manufacture and use ', '2.H.  Other ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.C.  Rice cultivation', '3.D.  Agricultural soils', '3.F.  Field burning of agricultural residues', '3.G. Liming', '3.H. Urea application', '3.I. Other carbon-containing fertilizers']

Subsectors for Sector 4:
['4.A. Forest land', '4

In [52]:
# Main execution
base = "data/crt/Sweden"
output_folder = "data/processed_data"
process_summary_sheet_subs("Summary2", base, output_folder)

Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 25 subsector rows
Found 6 sub-subsector rows
Found 5 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use', '2.F.  Product uses as ODS substitutes ', '2.G.  Other product manufacture and use ', '2.H.  Other ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.D.  Agricultural soils', '3.G. Liming', '3.H. Urea application']

Subsectors for Sector 4:
['4.A. Forest land', '4.B. Cropland', '4.C. Grassland', '4.D. Wetlands', '4.E. Settlements ', '4.F.

In [53]:
# Main execution
base_pa = "data/crt/Slovakia"
output_folder = "data/processed_data"
process_summary_sheet_subs("Summary2", base_pa, output_folder)

Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 22 subsector rows
Found 7 sub-subsector rows
Found 6 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use', '2.G.  Other product manufacture and use ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.D.  Agricultural soils', '3.G. Liming', '3.H. Urea application']

Subsectors for Sector 4:
['4.A. Forest land', '4.B. Cropland', '4.C. Grassland', '4.E. Settlements ', '4.F. Other land', '4.G. Harvested wood products']

Subsectors for Sector 5:
['5

In [54]:
# Main execution
bada = "data/crt/Switzerland"
output_folder = "data/processed_data"
process_summary_sheet_subs("Summary2", bada, output_folder)

Processing Summary2...

Processing Summary:
Found 1 total rows
Found 6 sector rows
Found 25 subsector rows
Found 6 sub-subsector rows
Found 7 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste', '6.  Other (as specified in summary 1) ']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use', '2.F.  Product uses as ODS substitutes ', '2.G.  Other product manufacture and use ', '2.H.  Other ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.D.  Agricultural soils', '3.G. Liming', '3.H. Urea application']

Subsectors for Sector 4:
['4.A. Forest land', '4.B. Cropland', '4.C. Grassland', '

In [55]:
# Main execution
basa = "data/crt/Turkiye"
output_folder = "data/processed_data"
process_summary_sheet_subs("Summary2", basa, output_folder)

Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 21 subsector rows
Found 6 sub-subsector rows
Found 4 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.C.  Rice cultivation', '3.D.  Agricultural soils', '3.F.  Field burning of agricultural residues', '3.H. Urea application']

Subsectors for Sector 4:
['4.A. Forest land', '4.B. Cropland', '4.C. Grassland', '4.D. Wetlands', '4.G. Harvested wood products']

Subsectors for Sector 5:
['5.A.  Sol

In [56]:
# Main execution
back = "data/crt/Ukraine"
output_folder = "data/processed_data"
process_summary_sheet_subs("Summary2", back, output_folder)

Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 24 subsector rows
Found 7 sub-subsector rows
Found 6 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use', '2.G.  Other product manufacture and use ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.C.  Rice cultivation', '3.D.  Agricultural soils', '3.G. Liming', '3.H. Urea application']

Subsectors for Sector 4:
['4.A. Forest land', '4.B. Cropland', '4.C. Grassland', '4.D. Wetlands', '4.E. Settlements ', '4.F. Other land', '4.G. Harvested wo

In [58]:
# Main execution
bras = "data/crt/United States of America"
output_folder = "data/processed_data"
process_summary_sheet_subs("Summary2", bras, output_folder)

Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 25 subsector rows
Found 7 sub-subsector rows
Found 4 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.E.  Electronic Industry ', '2.F.  Product uses as ODS substitutes ', '2.G.  Other product manufacture and use ', '2.H.  Other ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.C.  Rice cultivation', '3.D.  Agricultural soils', '3.F.  Field burning of agricultural residues', '3.G. Liming', '3.H. Urea application']

Subsectors for Sector 4:
['4.A. Forest land', '4.B. Cropland', '4.C. Grassla

In [40]:
import os
import glob

# Check if you have parquet files
parquet_files = glob.glob("data/processed_data/*/total/*.parquet")
csv_files = glob.glob("data/processed_data/*/total/*.csv")

print(f"Parquet files found: {len(parquet_files)}")
print(f"CSV files found: {len(csv_files)}")


Parquet files found: 16
CSV files found: 0


In [None]:
import os

# Main execution
base_path = "data/crt"
output_folder = "data/processed_data"

# Loop through all subdirectories in data/crt
for country_dir in os.listdir(base_path):
    folder_path = os.path.join(base_path, country_dir)
    if os.path.isdir(folder_path):
        summary1_as1 = process_summary_sheet("Summary2", folder_path, output_folder)


Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 26 subsector rows
Found 7 sub-subsector rows
Found 5 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use', '2.G.  Other product manufacture and use ', '2.H.  Other ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.C.  Rice cultivation', '3.D.  Agricultural soils', '3.F.  Field burning of agricultural residues', '3.G. Liming', '3.H. Urea application']

Subsectors for Sector 4:
['4.A. Forest land', '4.B. Cropland', '4.C. Grassland', '4.D. Wetla

In [27]:
summary1_as2 = process_summary_sheet("Summary1.As2", folder_path, output_folder)


Processing Summary1.As2...

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 16 subsector rows
Detailed breakdown:

Sectors:
['3.  Agriculture', '4.  Land use, land-use change and forestry  (4)', '5.  Waste']

Subsectors for 3.  Agriculture:
['A.  Enteric fermentation', 'B.  Manure management', 'D.  Agricultural soils', 'F.  Field burning of agricultural residues', 'G. Liming', 'H. Urea application', 'J.  Other ', 'A. Forest land (4)', 'B. Cropland (4)']

Subsectors for 4.  Land use, land-use change and forestry  (4):
['D. Wetlands (4)', 'E. Settlements (4)', 'G. Harvested wood products ', 'H. Other  (4)', 'A.  Solid waste disposal (5)', 'B.  Biological treatment of solid waste (5)', 'C.  Incineration and open burning of waste (5)']
 processed Summary1.As2 for year 1990

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 16 subsector rows
Detailed breakdown:

Sectors:
['3.  Agriculture', '4.  Land use, land-use change and forestry  (4)', '5.  Waste']

Subs

In [30]:
summary1_as3 = process_summary_sheet("Summary1.As3", folder_path, output_folder)


Processing Summary1.As3...

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
 processed Summary1.As3 for year 1990

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
 processed Summary1.As3 for year 1991

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
 processed Summary1.As3 for year 1992

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO

In [None]:
def combine_and_save_data(country_name='united kingdom'):
    """
    Combines yearly CSV files and saves them as combined CSV files
    """
    base_path = f"data/processed_data/{country_name}"
    output_path = f"data/processed_data/{country_name}/combined"
    os.makedirs(output_path, exist_ok=True)

    # Simple loop through each category
    for category in ['total', 'sectors', 'subsectors']:
        print(f"\nProcessing {category}...")
        
        # Get all CSV files for this category
        files = glob.glob(os.path.join(base_path, category, '*.parquet'))
        
        if files:
            # Read and combine all files
            dfs = []
            for file in files:
                print(f"Reading {file}")
                df = pd.read_csv(file)
                dfs.append(df)
            
            # Combine all dataframes
            combined = pd.concat(dfs, ignore_index=True)
            
            # Save combined file
            output_file = os.path.join(output_path, f'{country_name}_{category}_combined.parquet')
            combined.to_parquet(output_file, index=False)
            print(f"Saved combined {category} data to {output_file}")

# Run it
combine_and_save_data('united kingdom')


Processing total...

Processing sectors...

Processing subsectors...


In [15]:
import pandas as pd

def process_extreme_weather_data():
    # Read the data
    extreme_weather = pd.read_excel('data/EM-DATA/public_emdat_custom_request_2025-07-24_dd6057d4-ce3c-459c-871e-5a9e0a29eaf0.xlsx')

    # Define hazards (keeping all countries)
    hazards = ['Wildfire', 'Flood', 'Drought', 'Heatwave', 'Extreme temperature', 'Storm', 'Mass movement (wet)']
    
    # Filter for hazards only (no country filter)
    filtered_weather = extreme_weather[
        extreme_weather['Disaster Type'].isin(hazards)
    ]
    
    # Convert year to integer
    filtered_weather['Year'] = filtered_weather['Start Year'].astype(int)
    filtered_weather['Country'] = filtered_weather['Country'].replace('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom')
    
    # Create summary using 'Country' instead of 'ISO'
    summary = (
        filtered_weather
        .groupby(["Country", "Year", "Disaster Type"])[
            ["Total Deaths", "Total Affected", "Total Damage ('000 US$)"]
        ]
        .sum()
        .reset_index()
    )
    
    # Save to CSV
    summary.to_parquet('data/EM-DATA/summary_extreme_weather_all_countries.parquet', index=False)
    
    return summary

# Process the data
summary_data = process_extreme_weather_data()
summary_data
# Print some basic information
print("Sample of the data:")
print(summary_data.head())
print("\nTotal number of countries:", summary_data['Country'].nunique())
print("\nList of countries:", sorted(summary_data['Country'].unique()))

Sample of the data:
       Country  Year        Disaster Type  Total Deaths  Total Affected  \
0  Afghanistan  1976                Flood          51.0         80000.0   
1  Afghanistan  1978                Flood         120.0        271684.0   
2  Afghanistan  1980                Flood           0.0         30000.0   
3  Afghanistan  1987  Mass movement (wet)          70.0             0.0   
4  Afghanistan  1988                Flood           0.0        161000.0   

   Total Damage ('000 US$)  
0                      0.0  
1                  52000.0  
2                      0.0  
3                      0.0  
4                 260000.0  

Total number of countries: 170

List of countries: ['Afghanistan', 'Albania', 'American Samoa', 'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Azores Islands', 'Bahamas', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bermuda', 'Bhutan', 'Bolivia (Plurinational State of)', 'Bosnia and Herze

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_weather['Year'] = filtered_weather['Start Year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_weather['Country'] = filtered_weather['Country'].replace('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom')


In [14]:
import pandas as pd

# Load your CSV
df = pd.read_csv("data/EM-DATA/GLB.Ts+dSST.csv", skiprows=1) 

# Extract 'Year' and 'J-D' columns (J-D represents the annual mean)
df_filtered = df[['Year', 'J-D']].copy()

# Filter for years 1990-2024 to match emissions data
df_filtered = df_filtered[df_filtered['Year'].between(1990, 2024)]

# Rename column for clarity
df_filtered = df_filtered.rename(columns={'J-D': 'Temperature_Anomaly'})

# Save to new CSV
df_filtered.to_parquet("data/EM-DATA/global_temp_anomalies.parquet", index=False)

In [13]:
import pandas as pd

df = pd.read_csv("data/EM-DATA/owid-co2-data.csv")
global_df = df[df['country'] == 'World'] 
global_emissions = global_df[['year', 'co2', 'co2_including_luc', 'total_ghg']].copy()

global_emissions = global_emissions.dropna(subset=['co2'])
global_emissions.columns = global_emissions.columns.str.capitalize()
cols = global_emissions.columns.str.replace('Co2', 'CO2')
global_emissions.columns = cols  # Apply the column changes first

# FIXED: Reference the correct column name after capitalization
global_emissions['CO\u2082'] = global_emissions['CO2']  # Changed from 'CO22' to 'CO2'

global_emissions.to_parquet("data/EM-DATA/global_emissions.parquet", index=False)


In [30]:
global_emissions

Unnamed: 0,Year,CO2,CO2_including_luc,Total_ghg
49295,1750,9.306,,
49296,1751,9.407,,
49297,1752,9.505,,
49298,1753,9.610,,
49299,1754,9.734,,
...,...,...,...,...
49564,2019,37104.273,40891.020,52796.523
49565,2020,35126.527,38565.520,50792.543
49566,2021,36991.734,40544.516,52938.793
49567,2022,37293.836,40834.871,53327.215


In [None]:
def load_global_emission():
    """
    Load global emissions data
    """
    try:
        global_emissions = pd.read_csv("data/EM-DATA/global_emissions.csv")
        return global_emissions
    except:
        return None

In [54]:
# Main execution
base_folder = "data/crt"
output_folder = "data/processed_data"

# Get all country folders in annex_1
country_folders = [f for f in glob.glob(os.path.join(base_folder, "*")) if os.path.isdir(f)]

# Loop through each country folder
for country_folder in country_folders:
    country_name = os.path.basename(country_folder).lower()
    print(f"\nProcessing country: {country_name}")
    
    # Get the first Excel file to determine sheet names
    excel_files = glob.glob(os.path.join(country_folder, "*.xlsx"))
    if not excel_files:
        print(f"No Excel files found in {country_name}")
        continue
        
    # Get the sheet names from the first file
    try:
        data = pd.ExcelFile(excel_files[0])
        summary_sheets = [s for s in data.sheet_names if s.startswith("Summary2")]
        print(f"Found sheets for {country_name}: {summary_sheets}")
        
        # Process each summary sheet for this country
        for sheet in summary_sheets:
            try:
                print(f"\nProcessing {sheet} for {country_name}")
                process_summary_sheet(sheet, country_folder, output_folder)
            except Exception as e:
                print(f"Error processing {sheet} for {country_name}: {e}")
        
    except Exception as e:
        print(f"Error processing country {country_name}: {e}")

print("\nProcessing complete!")


Processing country: australia
Found sheets for australia: ['Summary2']

Processing Summary2 for australia
Processing Summary2...

Processing Summary:
Found 1 total rows
Found 5 sector rows
Found 26 subsector rows
Found 7 sub-subsector rows
Found 5 memo items

Detailed breakdown:

Total categories:
['Total (net emissions) (1)']

Sectors:
['1. Energy', '2.  Industrial processes and product use', '3.  Agriculture', '4. Land use, land-use change and forestry (1)', '5.  Waste']

Subsectors for Sector 1:
['1.A. Fuel combustion', '1.B. Fugitive emissions from fuels']

Subsectors for Sector 2:
['2.A.  Mineral industry', '2.B.  Chemical industry', '2.C.  Metal industry', '2.D.  Non-energy products from fuels and solvent use', '2.G.  Other product manufacture and use ', '2.H.  Other ']

Subsectors for Sector 3:
['3.A.  Enteric fermentation', '3.B.  Manure management', '3.C.  Rice cultivation', '3.D.  Agricultural soils', '3.F.  Field burning of agricultural residues', '3.G. Liming', '3.H. Urea 

In [55]:
##for subscripst use unicode characters so starting with 0=\2080
#so x1 = x\u2081
x1 = 'x\u2081'
#etc etc