In [2]:
import os
import glob
import pandas as pd

def detect_header_rows(filepath, sheet_name, keywords, lookahead=10):
    """
    Detects header rows in an Excel sheet based on keyword matching.

    """
    # Preview first few rows without assuming header position
    preview = pd.read_excel(filepath, sheet_name=sheet_name, nrows=lookahead, header=None)
    last_header_row = -1
    
    # Scan each row for any header keywords
    for i, row in preview.iterrows():
        for cell in row:
            if pd.notna(cell) and any(keyword.lower() in str(cell).lower() for keyword in keywords):
                last_header_row = max(last_header_row, i)
    
    if last_header_row >= 0:
        # Return all rows up to and including the header
        return list(range(last_header_row + 1))  
    else:
        raise ValueError("Could not detect headers using provided keywords.")

def read_excel_with_detected_header(filepath, sheet_name, keywords=["(kt)", "kt CO2 equivalent", "GREENHOUSE GAS"], flatten=True):
    """
    Reads Excel file with dynamic header detection and column cleaning.
    
    """
    # Detect header rows first
    header_rows = detect_header_rows(filepath, sheet_name, keywords)
    
    # Read with detected headers
    df = pd.read_excel(filepath, sheet_name=sheet_name, header=header_rows)
    
    # Clean multi-index columns if present
    if flatten and isinstance(df.columns, pd.MultiIndex):
        def clean_column(col):
            #Cleans multi-index column by keeping relevant parts"
            parts = [str(part).strip() for part in col if pd.notna(part)]
            # Keep only parts containing key GHG identifiers
            keep_parts = [p for p in parts if any(unit in p for unit in 
                          ['SINK CATEGORIES', 'CO2', 'CH4', 'N2O', 'SF6', 
                           'HFC', 'PFC', '(kt)', 'NF', 'NO', 'NMVOC', 'CO', 'SO'])]
            return ' '.join(keep_parts).strip()
        
        df.columns = [clean_column(col) for col in df.columns]
    
    return df

def extract_year_from_filename(filename):
    """
    Extracts year from filename following 'Report_Country_YYYY.xlsx' pattern.
    
    """
    parts = filename.split('-')
    if len(parts) > 3:

        country_code = parts[0]
        # Expects year in third position
        year = int(parts[4])
        return country_code, year  
    
def normalise_category(category):
    """Normalise category names by removing extra spaces or footnotes
    MAINLY FOR AUTSRALIA"""
    if isinstance(category, str):
        #  
        category = ' '.join(category.split())
        # Remove footnote numbers by splitting on '(' and taking first part
        category = category.split('(')[0].strip()
    return category

In [3]:
def get_category(category_str):
    category_str = category_str.strip()
    level, label = category_str.split(" ", 1)
    
    parts = level.strip('.').split(".")
    sector = parts[0] if len(parts) > 0 else None
    subsector = parts[1] if len(parts) > 1 else None
    sub_subsector = parts[2] if len(parts) > 2 else None
    sub_sub_subsector = parts[3] if len(parts) > 3 else None

    return {
        'sector': sector,
        'subsector': subsector,
        'sub_subsector': sub_subsector,
        'sub_sub_subsector': sub_sub_subsector,
        'label': label.strip()
    }


def process_hierarchical_data(df):
    category_col = None
    for col in df.columns:
        if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
            category_col = col
            break
    if not category_col:
        raise ValueError("Category column not found")

    sectors, subsectors, sub_subsectors, labels, levels = [], [], [],[],[]
    for cat_str in df[category_col].astype(str):
        categories = get_category(cat_str)

        sectors.append(categories['sector'])
        subsectors.append(categories['subsector'])
        sub_subsectors.append(categories['sub_subsector'])
        labels.append(categories['label'])

        if categories['label'].lower == 'total national emissions and removals':
            levels.append('Total')
        elif categories['sector'] and not categories['subsector']:
            levels.append('Sector')
        elif categories['subsector'] and not categories['sub_subsector']:
            levels.append('Subsector')
        elif categories['sub_subsector']:
            levels.append('Sub_subsector')
        else:
            levels.append('Unknown')

    df = df.copy()
    df['Sector'] = sectors
    df['Subsector'] = subsectors
    df['Sub_subsector'] = sub_subsectors
    df['Label'] = labels
    df['Level'] = levels

    total_df = df[df['Level'] == 'Total']
    sector_df = df[df['Level'] == 'Sector']
    subsector_df = df[df['Level'] == 'Subsector']
    sub_subsector_df = df[df['Level'] == 'Sub-subsector']

    # Processing Summary
    print(f"\nProcessing Summary:")
    print(f"Found {0 if total_df.empty else len(total_df)} total rows")
    print(f"Found {0 if sector_df.empty else len(sector_df)} sector rows")
    print(f"Found {0 if subsector_df.empty else len(subsector_df)} subsector rows")
    print(f"Found {0 if sub_subsector_df.empty else len(sub_subsector_df)} sub-subsector rows")

    # Detailed breakdown
    print("\nDetailed breakdown:")
    
    if not total_df.empty:
        print("\nTotal categories:")
        print(total_df[category_col].tolist())
    
    if not sector_df.empty:
        print("\nSectors:")
        print(sector_df[category_col].tolist())
    
    if not subsector_df.empty:
        # Group subsectors by their parent sector
        for sector_num in subsector_df['Sector'].unique():
            sector_subsectors = subsector_df[subsector_df['Sector'] == sector_num]
            if not sector_subsectors.empty:
                print(f"\nSubsectors for Sector {sector_num}:")
                print(sector_subsectors[category_col].tolist())
    
    if not sub_subsector_df.empty:
        # Group sub-subsectors by their parent sector and subsector
        for sector_num in sub_subsector_df['Sector'].unique():
            for subsector_code in sub_subsector_df[sub_subsector_df['Sector'] == sector_num]['Subsector'].unique():
                detailed = sub_subsector_df[
                    (sub_subsector_df['Sector'] == sector_num) & 
                    (sub_subsector_df['Subsector'] == subsector_code)
                ]
                if not detailed.empty:
                    print(f"\nDetailed breakdowns for {sector_num}.{subsector_code}:")
                    print(detailed[category_col].tolist())

                    
    return total_df, sector_df, subsector_df, sub_subsector_df



In [4]:

def process_hierarchical_data(df, categories_dict):
    """
    Processes dataframe into hierarchical structure and saves separate CSVs
    """
    # Find the category column
    category_col = None
    for col in df.columns:
        if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
            category_col = col
            break
    
    if not category_col:
        raise ValueError("Category column not found")

    # empty DataFrames, used none for total nad sector so empty csv is not created
    total_df = None
    sector_df = None
    subsector_dfs = {}

    # Process total emissions if present
    total_mask = df[category_col].isin(categories_dict['total_category'])
    if total_mask.any():
        total_df = df[total_mask].copy()
        total_df['Level'] = 'Total'
    
    # Process sectors and their subsectors
    for sector in categories_dict['sectors']:
        #  find the sector row
        sector_mask = df[category_col] == sector
        if sector_mask.any():
            # Get the sector row
            sector_rows = df[sector_mask].copy()
            sector_rows['Level'] = 'Sector'
            
            # Add to sector_df
            sector_df = pd.concat([sector_df, sector_rows])
            
            #  find all subsectors that belong to this sector
            # Get the index of the sector row
            sector_indices = df[sector_mask].index
            
            if len(sector_indices) > 0:
                sector_idx = sector_indices[0]
                
                # Find the next sector's index (if it exists)
                next_sector_mask = df[category_col].isin(categories_dict['sectors'])
                next_sector_indices = df[next_sector_mask].index
                next_sector_indices = next_sector_indices[next_sector_indices > sector_idx]
                
                end_idx = next_sector_indices[0] if len(next_sector_indices) > 0 else len(df)
                
                # Get all rows between this sector and the next - subsectors
                subsector_slice = df.iloc[sector_idx+1:end_idx].copy()
                
                # Remove any rows that are actually other main sectors
                subsector_slice = subsector_slice[~subsector_slice[category_col].isin(categories_dict['sectors'])]
                
                if not subsector_slice.empty:
                    subsector_slice['Level'] = 'Subsector'
                    subsector_slice['Parent_Sector'] = sector
                    subsector_dfs[sector] = subsector_slice

    print(f"\nProcessing Summary:")
    print(f"Found {0 if total_df is None else len(total_df)} total rows")
    print(f"Found {0 if sector_df is None else len(sector_df)} sector rows")
    print(f"Found {sum(len(df) for df in subsector_dfs.values() if df is not None)} subsector rows")
    
    #  logging
    print("Detailed breakdown:")
    if total_df is not None and not total_df.empty:
        #prevents empty df
        print("Total categories:")
        print(total_df[category_col].tolist())
    
    if sector_df is not None and not sector_df.empty:
        print("\nSectors:")
        print(sector_df[category_col].tolist())
    
    for sector, subsector_df in subsector_dfs.items():
        if subsector_df is not None and not subsector_df.empty:
            print(f"\nSubsectors for {sector}:")
            print(subsector_df[category_col].tolist())

    return total_df, sector_df, subsector_dfs

# Define categories dictionary
categories_dict = {
    'total_category': ["Total national emissions and removals"],
    'sectors': [
        "1. Energy", 
        "2.  Industrial processes and product use", 
        "3.  Agriculture", 
        "4.  Land use, land-use change and forestry  (4)", 
        "5.  Waste", 
        "6.  Other   (please specify)(6)",
        "Memo items:(7)",
        "International bunkers", 
        "Multilateral operations", 
        "CO2 emissions from biomass", 
        "CO2 captured", 
        "Long-term storage of C in waste disposal sites", 
        "Indirect N2O", 
        "Indirect CO2"  
    ],
    'subsectors': {
        '1. Energy': [
            "A. Fuel combustion",
            "                                           Sectoral approach(2)",
            "1.  Energy industries",
            "2.  Manufacturing industries and construction",
            "3.  Transport",
            "4.  Other sectors",
            "5.  Other",
            "B. Fugitive emissions from fuels",
            "1.  Solid fuels"
            "2.  Oil and natural gas and other emissions from energy production",
            "C. CO2 Transport and storage", 
        ],
        '2.  Industrial processes and product use': [
            "A.  Mineral industry",
            "B.  Chemical industry",
            "C.  Metal industry",
            "D.  Non-energy products",
            "E.  Electronic industry",
            "F.  Product uses",
            "G.  Other product manufacture",
            "H.  Other"
        ],
        '3.  Agriculture': [
            "A.  Enteric fermentation",
            "B.  Manure management",
            "C.  Rice cultivation",
            "D.  Agricultural soils",
            "E.  Prescribed burning",
            "F.  Field burning",
            "G. Liming",
            "H. Urea application"
        ],
        "4.  Land use, land-use change and forestry  (4)": [
            "A. Forest land (4)"
            "B. Cropland (4)"
            "C. Grassland (4)"
            "D. Wetlands (4)"
            "E. Settlements (4)"
            "F. Other land (4)"
            "G. Harvested wood products" 
            "H. Other  (4)"
        ],

        "5.  Waste": [ 
        "A.  Solid waste disposal (5)"
        "B.  Biological treatment of solid waste (5)"
        "C.  Incineration and open burning of waste (5)"
        "D.  Wastewater treatment and discharge"
        "E.  Other (5)"
        ],

        "6.  Other   (please specify)(6)": [],
        "Memo items:(7)": [],
        "International bunkers": [
            "Aviation"
            "Navigation"
        ],

        "Multilateral operations": [], 
        "CO2 emissions from biomass":[], 
        "CO2 captured":[],
        "Long-term storage of C in waste disposal sites":[], 
        "Indirect N2O":[],
        "Indirect CO2":[],
        
    }
}


In [4]:
def process_summary_sheet(sheet_name, folder_path, output_folder):
    """
    Process a specific summary sheet from all Excel files in the folder
    """
    print(f"Processing {sheet_name}...")
    
    # Get country name from the folder path
    country_name = os.path.basename(folder_path).lower()

    for filepath in glob.glob(os.path.join(folder_path, "*.xlsx")):
        try:
            # Read the specific summary sheet
            df = read_excel_with_detected_header(filepath, sheet_name)
            
            # Standardise column names
            new_columns = []
            for col in df.columns:
                # Standardise CO2 column names
                if col == 'CO2' or col == 'CO2 (kt)' or col == 'Net CO2 (kt)':
                    new_columns.append('Net CO2 emissions/removals (kt)')
                #other gases just in case
                elif col == 'CH4' or col == 'CH4 (kt)':
                    new_columns.append('CH4 (kt)')
                elif col == 'N2O' or col == 'N2O (kt)':
                    new_columns.append('N2O (kt)')
                else:
                    new_columns.append(col)
            df.columns = new_columns

            # Find the category column
            category_col = None
            for col in df.columns:
                if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
                    category_col = col
                    break
            
            if not category_col:
                raise ValueError("Category column not found")

            # Keep the category column
            categories = df[category_col]

            # Convert other columns to numeric, coercing errors to NaN
            numeric_df = df.drop(columns=[category_col]).apply(pd.to_numeric, errors='coerce')

            # Recombine the category column back
            df = pd.concat([categories, numeric_df], axis=1)

            # Drop rows that are completely empty but keep the category column
            df = df.dropna(axis=0, how='all', subset=numeric_df.columns)
            
            # Drop columns that are completely empty
            df = df.dropna(axis=1, how='all')

            # Define years and GHG keywords
            years_table10s6 = [str(year) for year in range(1990, 2023+1)]
            ghg_keywords = ['CO2', 'CH4', 'N2O', 'SF6', 'HFC', 'PFC',
                          'Base year (1)', 'Change from base to latest'] + years_table10s6

            # Keep category column containing any GHG keywords
            cols_to_keep = [category_col] + [col for col in df.columns 
                                           if any(k in col for k in ghg_keywords)]

            df = df[cols_to_keep]
            
            # Extract year
            country_code, year = extract_year_from_filename(os.path.basename(filepath))
            if not country_code or not year:
                print(f"Skipping {filepath}: Could not extract country code or year")
                continue
            
            # Add sheet name to DataFrame
            df['Sheet'] = sheet_name

            # Create directories using country name from folder
            country_output = os.path.join(output_folder, country_name)
            os.makedirs(os.path.join(country_output, 'total'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'sectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'subsectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'sub_subsectors'), exist_ok=True)

            df['Country'] = country_name.upper()
            df['Year'] = year
            
            # Process into hierarchical structure
            total_df, sector_df, subsector_df, sub_subsector_df = process_hierarchical_data(df)
            
            # Create filename with sheet name and year
            base_filename = f"{country_code.lower()}_{sheet_name}_{year}"
            
            # Save files
            if not total_df.empty:
                total_df.to_csv(
                    os.path.join(country_output, 'total', f"{base_filename}_total.csv"), 
                    index=False
                )
            
            if not sector_df.empty:
                sector_df.to_csv(
                    os.path.join(country_output, 'sectors', f"{base_filename}_sectors.csv"), 
                    index=False
                )
            
            if not subsector_df.empty:
                subsector_df.to_csv(
                    os.path.join(country_output, 'subsectors', f"{base_filename}_subsectors.csv"),
                    index=False
                )

            if not sub_subsector_df.empty:
                sub_subsector_df.to_csv(
                    os.path.join(country_output, 'sub_subsectors', f"{base_filename}_sub_subsectors.csv"),
                    index=False
                )

            print(f"Processed {sheet_name} for year {year}")
            
        except Exception as e:
            print(f"Error processing {sheet_name} in {filepath}: {e}")

In [None]:
# Main execution
folder_path = "data/crt/United Kingdom"
output_folder = "data/processed_data"
summary1_as1 = process_summary_sheet("Summary1", folder_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 17 subsector rows
Detailed breakdown:
Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'E.  Electronic industry ', 'F.  Product uses as substitutes for ODS', 'G.  Other product manufacture and use ', 'H.  Other(3)']
 processed Summary1.As1 for year

In [27]:
summary1_as2 = process_summary_sheet("Summary1.As2", folder_path, output_folder)


Processing Summary1.As2...

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 16 subsector rows
Detailed breakdown:

Sectors:
['3.  Agriculture', '4.  Land use, land-use change and forestry  (4)', '5.  Waste']

Subsectors for 3.  Agriculture:
['A.  Enteric fermentation', 'B.  Manure management', 'D.  Agricultural soils', 'F.  Field burning of agricultural residues', 'G. Liming', 'H. Urea application', 'J.  Other ', 'A. Forest land (4)', 'B. Cropland (4)']

Subsectors for 4.  Land use, land-use change and forestry  (4):
['D. Wetlands (4)', 'E. Settlements (4)', 'G. Harvested wood products ', 'H. Other  (4)', 'A.  Solid waste disposal (5)', 'B.  Biological treatment of solid waste (5)', 'C.  Incineration and open burning of waste (5)']
 processed Summary1.As2 for year 1990

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 16 subsector rows
Detailed breakdown:

Sectors:
['3.  Agriculture', '4.  Land use, land-use change and forestry  (4)', '5.  Waste']

Subs

In [30]:
summary1_as3 = process_summary_sheet("Summary1.As3", folder_path, output_folder)


Processing Summary1.As3...

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
 processed Summary1.As3 for year 1990

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
 processed Summary1.As3 for year 1991

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
 processed Summary1.As3 for year 1992

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows
Detailed breakdown:

Sectors:
['International bunkers', 'CO

In [6]:
def combine_and_save_data(country_name='united kingdom'):
    """
    Combines yearly CSV files and saves them as combined CSV files
    """
    base_path = f"data/processed_data/{country_name}"
    output_path = f"data/processed_data/{country_name}/combined"
    os.makedirs(output_path, exist_ok=True)

    # Simple loop through each category
    for category in ['total', 'sectors', 'subsectors']:
        print(f"\nProcessing {category}...")
        
        # Get all CSV files for this category
        files = glob.glob(os.path.join(base_path, category, '*.csv'))
        
        if files:
            # Read and combine all files
            dfs = []
            for file in files:
                print(f"Reading {file}")
                df = pd.read_csv(file)
                dfs.append(df)
            
            # Combine all dataframes
            combined = pd.concat(dfs, ignore_index=True)
            
            # Save combined file
            output_file = os.path.join(output_path, f'{country_name}_{category}_combined.csv')
            combined.to_csv(output_file, index=False)
            print(f"Saved combined {category} data to {output_file}")

# Run it
combine_and_save_data('united kingdom')


Processing total...

Processing sectors...

Processing subsectors...


In [14]:
#test diff countries
# Main execution
aut_path = "data/aut"
output_folder = "data/processed_data"
summary1_as1 = process_summary_sheet("Summary1.As1", aut_path, output_folder)
summary1_as2 = process_summary_sheet("Summary1.As2", aut_path, output_folder)
summary1_as3 = process_summary_sheet("Summary1.As3", aut_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 16 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'E.  Electronic industry ', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Fo

In [15]:
combine_and_save_data('aut')


Processing total...
Reading data/processed_data/aut\total\aut_Summary1.As1_1990_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1991_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1992_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1993_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1994_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1995_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1996_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1997_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1998_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1999_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2000_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2001_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2002_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2003_total.csv

In [16]:
#test diff countries
# Main execution
blr_path = "data/blr"
output_folder = "data/processed_data"
blr_summary1_as1 = process_summary_sheet("Summary1.As1", blr_path, output_folder)
blr_summary1_as2 = process_summary_sheet("Summary1.As2", blr_path, output_folder)
blr_summary1_as3 = process_summary_sheet("Summary1.As3", blr_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 14 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '2.  Oil and natural gas and other emissions from energy production', 'A.  Mineral industry']

Subsectors for 2.  Industrial processes and product use:
['C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 14 su

In [17]:
combine_and_save_data('blr')


Processing total...
Reading data/processed_data/blr\total\blr_Summary1.As1_1990_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1991_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1992_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1993_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1994_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1995_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1996_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1997_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1998_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1999_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2000_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2001_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2002_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2003_total.csv

In [18]:
#test diff countries
# Main execution
can_path = "data/can"
output_folder = "data/processed_data"
can_summary1_as1 = process_summary_sheet("Summary1.As1", can_path, output_folder)
can_summary1_as2 = process_summary_sheet("Summary1.As2", can_path, output_folder)
can_summary1_as3 = process_summary_sheet("Summary1.As3", can_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 15 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'E.  Electronic industry ', 'G.  Other product manufacture and use ']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total rows

In [19]:
combine_and_save_data('can')


Processing total...
Reading data/processed_data/can\total\can_Summary1.As1_1990_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1991_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1992_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1993_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1994_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1995_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1996_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1997_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1998_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1999_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2000_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2001_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2002_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2003_total.csv

In [20]:
#test diff countries
# Main execution
est_path = "data/est"
output_folder = "data/processed_data"
est_summary1_as1 = process_summary_sheet("Summary1.As1", est_path, output_folder)
est_summary1_as2 = process_summary_sheet("Summary1.As2", est_path, output_folder)
est_summary1_as3 = process_summary_sheet("Summary1.As3", est_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 13 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', 'B. Fugitive emissions from fuels', '2.  Oil and natural gas and other emissions from energy production', 'A.  Mineral industry', 'B.  Chemical industry']

Subsectors for 2.  Industrial processes and product use:
['D.  Non-energy products from fuels and solvent use ', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 13 subsector ro

In [21]:
combine_and_save_data('est')


Processing total...
Reading data/processed_data/est\total\est_Summary1.As1_1990_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1991_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1992_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1993_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1994_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1995_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1996_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1997_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1998_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1999_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2000_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2001_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2002_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2003_total.csv

In [22]:
#test diff countries
# Main execution
usa_path = "data/usa"
output_folder = "data/processed_data"
usa_summary1_as1 = process_summary_sheet("Summary1.As1", usa_path, output_folder)
usa_summary1_as2 = process_summary_sheet("Summary1.As2", usa_path, output_folder)
usa_summary1_as3 = process_summary_sheet("Summary1.As3", usa_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 16 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'E.  Electronic industry ', 'F.  Product uses as substitutes for ODS', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total 

In [23]:
combine_and_save_data('usa')


Processing total...
Reading data/processed_data/usa\total\usa_Summary1.As1_1990_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1991_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1992_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1993_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1994_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1995_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1996_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1997_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1998_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1999_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2000_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2001_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2002_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2003_total.csv

In [None]:
import pandas as pd

def process_extreme_weather_data():
    # Read the data
    extreme_weather = pd.read_excel('data/EM-DATA/public_emdat_custom_request_2025-06-26_9e81f8a0-ef5b-49db-a30f-1546c51d9c78.xlsx')

    # Define countries and hazards
    countries = ['EST', 'AUT', 'CAN', 'BLR', 'USA', 'GBR']
    hazards = ['Wildfire', 'Flood', 'Drought', 'Heatwave', 'Extreme temperature', 'Storm']
    
    # Filter for our countries and hazards
    filtered_weather = extreme_weather[
        (extreme_weather['ISO'].isin(countries)) & 
        (extreme_weather['Disaster Type'].isin(hazards))
    ]
    
    # Convert year to integer
    filtered_weather['year'] = filtered_weather['Start Year'].astype(int)
    
    # Create summary
    summary = (
    filtered_weather
    .groupby(["ISO", "year", "Disaster Type"])[
        ["Total Deaths", "Total Affected", "Total Damage ('000 US$)"]
    ]
    .sum()
    .reset_index()
)
    # Save to CSV
    summary.to_csv('data/EM-DATA/summary_extreme_weather_all_countries.csv', index=False)
    
    return summary

# Process the data
summary_data = process_extreme_weather_data()

# Print some basic information
print("of the data:")
print(summary_data.head())



  print("\ of the data:")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_weather['year'] = filtered_weather['Start Year'].astype(int)


\ of the data:
   ISO  year Disaster Type  Total Deaths  Total Affected  \
0  AUT  1990         Storm           3.0             0.0   
1  AUT  1991         Flood           5.0             0.0   
2  AUT  1991         Storm           0.0             0.0   
3  AUT  1993         Storm           0.0             0.0   
4  AUT  1994         Storm           0.0             0.0   

   Total Damage ('000 US$)  
0                 250000.0  
1                  82000.0  
2                  28000.0  
3                   1000.0  
4                   2000.0  

UK data sample:
     ISO  year        Disaster Type  Total Deaths  Total Affected  \
125  GBR  1990                Storm          83.0             0.0   
126  GBR  1991  Extreme temperature           0.0             0.0   
127  GBR  1991                Storm          48.0             0.0   
128  GBR  1993                Flood           4.0          6000.0   
129  GBR  1994                Flood           7.0           700.0   

     Total Damage 

In [11]:
import pandas as pd

# Load your CSV
df = pd.read_csv("data/EM-DATA/GLB.Ts+dSST.csv", skiprows=1) 

# Extract 'Year' and 'J-D' columns (J-D represents the annual mean)
df_filtered = df[['Year', 'J-D']].copy()

# Filter for years 1990-2024 to match emissions data
df_filtered = df_filtered[df_filtered['Year'].between(1990, 2024)]

# Rename column for clarity
df_filtered = df_filtered.rename(columns={'J-D': 'Temperature_Anomaly'})

# Save to new CSV
df_filtered.to_csv("data/EM-DATA/global_temp_anomalies.csv", index=False)

In [31]:
import pandas as pd

df = pd.read_csv("data/EM-DATA/owid-co2-data.csv")
global_df = df[df['country'] == 'World'] 
global_emissions = global_df[['year', 'co2', 'co2_including_luc', 'total_ghg']].copy()
global_emissions = global_emissions.dropna(subset=['co2'])
global_emissions.columns = global_emissions.columns.str.capitalize()
cols = global_emissions.columns.str.replace('Co2', 'CO2')
global_emissions.columns = cols
global_emissions.to_csv("data/EM-DATA/global_emissions.csv", index=False)


In [30]:
global_emissions

Unnamed: 0,Year,CO2,CO2_including_luc,Total_ghg
49295,1750,9.306,,
49296,1751,9.407,,
49297,1752,9.505,,
49298,1753,9.610,,
49299,1754,9.734,,
...,...,...,...,...
49564,2019,37104.273,40891.020,52796.523
49565,2020,35126.527,38565.520,50792.543
49566,2021,36991.734,40544.516,52938.793
49567,2022,37293.836,40834.871,53327.215


In [None]:
def load_global_emission():
    """
    Load global emissions data
    """
    try:
        global_emissions = pd.read_csv("data/EM-DATA/global_emissions.csv")
        return global_emissions
    except:
        return None

In [None]:
# Main execution
base_folder = "data/annex_1"
output_folder = "data/processed_data"

# Get all country folders in annex_1
country_folders = [f for f in glob.glob(os.path.join(base_folder, "*")) if os.path.isdir(f)]

# Loop through each country folder
for country_folder in country_folders:
    country_name = os.path.basename(country_folder).lower()
    print(f"\nProcessing country: {country_name}")
    
    # Get the first Excel file to determine sheet names
    excel_files = glob.glob(os.path.join(country_folder, "*.xlsx"))
    if not excel_files:
        print(f"No Excel files found in {country_name}")
        continue
        
    # Get the sheet names from the first file
    try:
        data = pd.ExcelFile(excel_files[0])
        summary_sheets = [s for s in data.sheet_names if s.startswith("Summary1")]
        print(f"Found sheets for {country_name}: {summary_sheets}")
        
        # Process each summary sheet for this country
        for sheet in summary_sheets:
            try:
                print(f"\nProcessing {sheet} for {country_name}")
                process_summary_sheet(sheet, country_folder, output_folder)
            except Exception as e:
                print(f"Error processing {sheet} for {country_name}: {e}")
        
        # After processing all sheets for a country, combine the data
        try:
            print(f"\nCombining data for {country_name}")
            combine_and_save_data(country_name)
        except Exception as e:
            print(f"Error combining data for {country_name}: {e}")
            
    except Exception as e:
        print(f"Error processing country {country_name}: {e}")

print("\nProcessing complete!")


Processing country: austria
Found sheets for austria: ['Summary1.As1', 'Summary1.As2', 'Summary1.As3']

Processing Summary1.As1 for austria
Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 16 subsector rows
Detailed breakdown:
Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'E.  Electronic indu