In [None]:
import os
import glob
import pandas as pd

def detect_header_rows(filepath, sheet_name, keywords, lookahead=10):
    """
    Detects header rows in an Excel sheet based on keyword matching.

    """
    # Preview first few rows without assuming header position
    preview = pd.read_excel(filepath, sheet_name=sheet_name, nrows=lookahead, header=None)
    last_header_row = -1
    
    # Scan each row for any header keywords
    for i, row in preview.iterrows():
        for cell in row:
            if pd.notna(cell) and any(keyword.lower() in str(cell).lower() for keyword in keywords):
                last_header_row = max(last_header_row, i)
    
    if last_header_row >= 0:
        # Return all rows up to and including the header
        return list(range(last_header_row + 1))  
    else:
        raise ValueError("Could not detect headers using provided keywords.")

def read_excel_with_detected_header(filepath, sheet_name, keywords=["(kt)", "kt CO2 equivalent", "GREENHOUSE GAS"], flatten=True):
    """
    Reads Excel file with dynamic header detection and column cleaning.
    
    """
    # Detect header rows first
    header_rows = detect_header_rows(filepath, sheet_name, keywords)
    
    # Read with detected headers
    df = pd.read_excel(filepath, sheet_name=sheet_name, header=header_rows)
    
    # Clean multi-index columns if present
    if flatten and isinstance(df.columns, pd.MultiIndex):
        def clean_column(col):
            #Cleans multi-index column by keeping relevant parts"
            parts = [str(part).strip() for part in col if pd.notna(part)]
            # Keep only parts containing key GHG identifiers
            keep_parts = [p for p in parts if any(unit in p for unit in 
                          ['SINK CATEGORIES', 'CO2', 'CH4', 'N2O', 'SF6', 
                           'HFC', 'PFC', '(kt)', 'NF', 'NO', 'NMVOC', 'CO', 'SO'])]
            return ' '.join(keep_parts).strip()
        
        df.columns = [clean_column(col) for col in df.columns]
    
    return df

def extract_year_from_filename(filename):
    """
    Extracts year from filename following 'Report_Country_YYYY.xlsx' pattern.
    
    """
    parts = filename.split('_')
    if len(parts) > 3:

        country_code = parts[0]
        # Expects year in third position
        year = int(parts[2])
        return country_code, year  


In [None]:

def process_hierarchical_data(df, categories_dict):
    """
    Processes dataframe into hierarchical structure and saves separate CSVs
    """
    # Find the category column
    category_col = None
    for col in df.columns:
        if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
            category_col = col
            break
    
    if not category_col:
        raise ValueError("Category column not found")

    # empty DataFrames, used none for total nad sector so empty csv is not created
    total_df = None
    sector_df = None
    subsector_dfs = {}

    # Process total emissions if present
    total_mask = df[category_col].isin(categories_dict['total_category'])
    if total_mask.any():
        total_df = df[total_mask].copy()
        total_df['Level'] = 'Total'
    
    # Process sectors and their subsectors
    for sector in categories_dict['sectors']:
        #  find the sector row
        sector_mask = df[category_col] == sector
        if sector_mask.any():
            # Get the sector row
            sector_rows = df[sector_mask].copy()
            sector_rows['Level'] = 'Sector'
            
            # Add to sector_df
            sector_df = pd.concat([sector_df, sector_rows])
            
            #  find all subsectors that belong to this sector
            # Get the index of the sector row
            sector_indices = df[sector_mask].index
            
            if len(sector_indices) > 0:
                sector_idx = sector_indices[0]
                
                # Find the next sector's index (if it exists)
                next_sector_mask = df[category_col].isin(categories_dict['sectors'])
                next_sector_indices = df[next_sector_mask].index
                next_sector_indices = next_sector_indices[next_sector_indices > sector_idx]
                
                end_idx = next_sector_indices[0] if len(next_sector_indices) > 0 else len(df)
                
                # Get all rows between this sector and the next - subsectors
                subsector_slice = df.iloc[sector_idx+1:end_idx].copy()
                
                # Remove any rows that are actually other main sectors
                subsector_slice = subsector_slice[~subsector_slice[category_col].isin(categories_dict['sectors'])]
                
                if not subsector_slice.empty:
                    subsector_slice['Level'] = 'Subsector'
                    subsector_slice['Parent_Sector'] = sector
                    subsector_dfs[sector] = subsector_slice

    print(f"\nProcessing Summary:")
    print(f"Found {0 if total_df is None else len(total_df)} total rows")
    print(f"Found {0 if sector_df is None else len(sector_df)} sector rows")
    print(f"Found {sum(len(df) for df in subsector_dfs.values() if df is not None)} subsector rows")
    
    #  logging
    print("Detailed breakdown:")
    if total_df is not None and not total_df.empty:
        #prevents empty df
        print("Total categories:")
        print(total_df[category_col].tolist())
    
    if sector_df is not None and not sector_df.empty:
        print("\nSectors:")
        print(sector_df[category_col].tolist())
    
    for sector, subsector_df in subsector_dfs.items():
        if subsector_df is not None and not subsector_df.empty:
            print(f"\nSubsectors for {sector}:")
            print(subsector_df[category_col].tolist())

    return total_df, sector_df, subsector_dfs

# Define categories dictionary
categories_dict = {
    'total_category': ["Total national emissions and removals"],
    'sectors': [
        "1. Energy", 
        "2.  Industrial processes and product use", 
        "3.  Agriculture", 
        "4.  Land use, land-use change and forestry  (4)", 
        "5.  Waste", 
        "6.  Other   (please specify)(6)",
        "Memo items:(7)",
        "International bunkers", 
        "Multilateral operations", 
        "CO2 emissions from biomass", 
        "CO2 captured", 
        "Long-term storage of C in waste disposal sites", 
        "Indirect N2O", 
        "Indirect CO2"  
    ],
    'subsectors': {
        '1. Energy': [
            "A. Fuel combustion",
            "                                           Sectoral approach(2)",
            "1.  Energy industries",
            "2.  Manufacturing industries and construction",
            "3.  Transport",
            "4.  Other sectors",
            "5.  Other",
            "B. Fugitive emissions from fuels",
            "1.  Solid fuels"
            "2.  Oil and natural gas and other emissions from energy production",
            "C. CO2 Transport and storage", 
        ],
        '2.  Industrial processes and product use': [
            "A.  Mineral industry",
            "B.  Chemical industry",
            "C.  Metal industry",
            "D.  Non-energy products",
            "E.  Electronic industry",
            "F.  Product uses",
            "G.  Other product manufacture",
            "H.  Other"
        ],
        '3.  Agriculture': [
            "A.  Enteric fermentation",
            "B.  Manure management",
            "C.  Rice cultivation",
            "D.  Agricultural soils",
            "E.  Prescribed burning",
            "F.  Field burning",
            "G. Liming",
            "H. Urea application"
        ],
        "4.  Land use, land-use change and forestry  (4)": [
            "A. Forest land (4)"
            "B. Cropland (4)"
            "C. Grassland (4)"
            "D. Wetlands (4)"
            "E. Settlements (4)"
            "F. Other land (4)"
            "G. Harvested wood products" 
            "H. Other  (4)"
        ],

        "5.  Waste": [ 
        "A.  Solid waste disposal (5)"
        "B.  Biological treatment of solid waste (5)"
        "C.  Incineration and open burning of waste (5)"
        "D.  Wastewater treatment and discharge"
        "E.  Other (5)"
        ],

        "6.  Other   (please specify)(6)": [],
        "Memo items:(7)": [],
        "International bunkers": [
            "Aviation"
            "Navigation"
        ],

        "Multilateral operations": [], 
        "CO2 emissions from biomass":[], 
        "CO2 captured":[],
        "Long-term storage of C in waste disposal sites":[], 
        "Indirect N2O":[],
        "Indirect CO2":[],
        
    }
}


In [None]:
def process_summary_sheet(sheet_name, folder_path, output_folder):
    """
    Process a specific summary sheet from all Excel files in the folder
    """
    print(f"Processing {sheet_name}...")
    
    for filepath in glob.glob(os.path.join(folder_path, "*.xlsx")):
        try:
            # Read the specific summary sheet
            df = read_excel_with_detected_header(filepath, sheet_name)
            
            # Standardise column names
            new_columns = []
            for col in df.columns:
                # Standardise CO2 column names
                if col == 'CO2' or col == 'CO2 (kt)' or col == 'Net CO2 (kt)':
                    new_columns.append('Net CO2 emissions/removals (kt)')
                #other gases just in case
                elif col == 'CH4' or col == 'CH4 (kt)':
                    new_columns.append('CH4 (kt)')
                elif col == 'N2O' or col == 'N2O (kt)':
                    new_columns.append('N2O (kt)')
                else:
                    new_columns.append(col)
            df.columns = new_columns

            # Find the category column
            category_col = None
            for col in df.columns:
                if 'GREENHOUSE GAS SOURCE AND SINK CATEGORIES' in col:
                    category_col = col
                    break
            
            if not category_col:
                raise ValueError("Category column not found")

            # Keep the category column
            categories = df[category_col]

            # Convert other columns to numeric, coercing errors to NaN
            numeric_df = df.drop(columns=[category_col]).apply(pd.to_numeric, errors='coerce')

            # Recombine the category column back
            df = pd.concat([categories, numeric_df], axis=1)

            # Drop rows that are completely empty but keep the category column
            df = df.dropna(axis=0, how='all', subset=numeric_df.columns)
            
            # Drop columns that are completely empty
            df = df.dropna(axis=1, how='all')

            # Define years and GHG keywords
            years_table10s6 = [str(year) for year in range(1990, 2023+1)]
            ghg_keywords = ['CO2', 'CH4', 'N2O', 'SF6', 'HFC', 'PFC',
                          'Base year (1)', 'Change from base to latest'] + years_table10s6

            # Keep category column containing any GHG keywords
            cols_to_keep = [category_col] + [col for col in df.columns 
                                           if any(k in col for k in ghg_keywords)]

            df = df[cols_to_keep]
            
            # Extract year
            country_code, year = extract_year_from_filename(os.path.basename(filepath))
            if not country_code or not year:
                print(f"Skipping {filepath}: Could not extract country code or year")
                continue
            
            # Add sheet name to DataFrame
            df['Sheet'] = sheet_name

            # create  directories
            country_output = os.path.join(output_folder, country_code.lower())
            os.makedirs(os.path.join(country_output, 'total'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'sectors'), exist_ok=True)
            os.makedirs(os.path.join(country_output, 'subsectors'), exist_ok=True)

            df['Country'] = country_code.upper()
            df['Year'] = year
            
            # process into hierarchical structure
            total_df, sector_df, subsector_dfs = process_hierarchical_data(df, categories_dict)
            
            # create filename with sheet name and year
            base_filename = f"{country_code.lower()}_{sheet_name}_{year}"
            
            # save files with easier names
            if total_df is not None:
                total_df.to_csv(
                    os.path.join(country_output, 'total', f"{base_filename}_total.csv"), 
                    index=False
                )
            
            if sector_df is not None:
                sector_df.to_csv(
                    os.path.join(country_output, 'sectors', f"{base_filename}_sectors.csv"), 
                    index=False
                )
            
            for sector, subsector_df in subsector_dfs.items():
                if subsector_df is not None:
                    safe_sector_name = sector.replace(".", "").replace(" ", "_")
                    subsector_df.to_csv(
                        os.path.join(country_output, 'subsectors', 
                                f"{base_filename}_{safe_sector_name}_subsectors.csv"),
                        index=False
                    )

            print(f" processed {sheet_name} for year {year}")
            
        except Exception as e:
            print(f"Error processing {sheet_name} in {filepath}: {e}")

# Main execution
folder_path = "data/gbr"
output_folder = "data/processed_data"





In [None]:
# Main execution
folder_path = "data/gbr"
output_folder = "data/processed_data"
summary_sheets = [s for s in data.sheet_names if s.startswith("Summary1")]
summary1_as1 = process_summary_sheet("Summary1.As1", folder_path, output_folder)

In [7]:
summary1_as2 = process_summary_sheet("Summary1.As2", folder_path, output_folder)


Processing Summary1.As2...

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 16 subsector rows

Detailed breakdown:

Sectors:
['3.  Agriculture', '4.  Land use, land-use change and forestry  (4)', '5.  Waste']

Subsectors for 3.  Agriculture:
['A.  Enteric fermentation', 'B.  Manure management', 'D.  Agricultural soils', 'F.  Field burning of agricultural residues', 'G. Liming', 'H. Urea application', 'J.  Other ', 'A. Forest land (4)', 'B. Cropland (4)']

Subsectors for 4.  Land use, land-use change and forestry  (4):
['D. Wetlands (4)', 'E. Settlements (4)', 'G. Harvested wood products ', 'H. Other  (4)', 'A.  Solid waste disposal (5)', 'B.  Biological treatment of solid waste (5)', 'C.  Incineration and open burning of waste (5)']
Successfully processed Summary1.As2 for year 1990

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 16 subsector rows

Detailed breakdown:

Sectors:
['3.  Agriculture', '4.  Land use, land-use change and forestry  (4)', '5. 

In [8]:
summary1_as3 = process_summary_sheet("Summary1.As3", folder_path, output_folder)


Processing Summary1.As3...

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows

Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
Successfully processed Summary1.As3 for year 1990

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows

Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
Successfully processed Summary1.As3 for year 1991

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows

Detailed breakdown:

Sectors:
['International bunkers', 'CO2 emissions from biomass', 'Indirect N2O']

Subsectors for International bunkers:
['Navigation']
Successfully processed Summary1.As3 for year 1992

Processing Summary:
Found 0 total rows
Found 3 sector rows
Found 1 subsector rows

Detailed breakdown:

In [12]:
def combine_and_save_data(country_code='gbr'):
    """
    Combines yearly CSV files and saves them as combined CSV files
    """
    base_path = f"data/processed_data/{country_code}"
    output_path = f"data/processed_data/{country_code}/combined"
    os.makedirs(output_path, exist_ok=True)

    # Simple loop through each category
    for category in ['total', 'sectors', 'subsectors']:
        print(f"\nProcessing {category}...")
        
        # Get all CSV files for this category
        files = glob.glob(os.path.join(base_path, category, '*.csv'))
        
        if files:
            # Read and combine all files
            dfs = []
            for file in files:
                print(f"Reading {file}")
                df = pd.read_csv(file)
                dfs.append(df)
            
            # Combine all dataframes
            combined = pd.concat(dfs, ignore_index=True)
            
            # Save combined file
            output_file = os.path.join(output_path, f'{country_code}_{category}_combined.csv')
            combined.to_csv(output_file, index=False)
            print(f"Saved combined {category} data to {output_file}")

# Run it
combine_and_save_data('gbr')


Processing total...
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1990_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1991_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1992_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1993_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1994_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1995_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1996_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1997_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1998_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_1999_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_2000_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_2001_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_2002_total.csv
Reading data/processed_data/gbr\total\gbr_Summary1.As1_2003_total.csv

In [14]:
#test diff countries
# Main execution
aut_path = "data/aut"
output_folder = "data/processed_data"
summary1_as1 = process_summary_sheet("Summary1.As1", aut_path, output_folder)
summary1_as2 = process_summary_sheet("Summary1.As2", aut_path, output_folder)
summary1_as3 = process_summary_sheet("Summary1.As3", aut_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 16 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'E.  Electronic industry ', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Fo

In [15]:
combine_and_save_data('aut')


Processing total...
Reading data/processed_data/aut\total\aut_Summary1.As1_1990_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1991_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1992_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1993_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1994_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1995_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1996_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1997_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1998_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_1999_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2000_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2001_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2002_total.csv
Reading data/processed_data/aut\total\aut_Summary1.As1_2003_total.csv

In [16]:
#test diff countries
# Main execution
blr_path = "data/blr"
output_folder = "data/processed_data"
blr_summary1_as1 = process_summary_sheet("Summary1.As1", blr_path, output_folder)
blr_summary1_as2 = process_summary_sheet("Summary1.As2", blr_path, output_folder)
blr_summary1_as3 = process_summary_sheet("Summary1.As3", blr_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 14 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '2.  Oil and natural gas and other emissions from energy production', 'A.  Mineral industry']

Subsectors for 2.  Industrial processes and product use:
['C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 14 su

In [17]:
combine_and_save_data('blr')


Processing total...
Reading data/processed_data/blr\total\blr_Summary1.As1_1990_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1991_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1992_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1993_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1994_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1995_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1996_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1997_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1998_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_1999_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2000_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2001_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2002_total.csv
Reading data/processed_data/blr\total\blr_Summary1.As1_2003_total.csv

In [18]:
#test diff countries
# Main execution
can_path = "data/can"
output_folder = "data/processed_data"
can_summary1_as1 = process_summary_sheet("Summary1.As1", can_path, output_folder)
can_summary1_as2 = process_summary_sheet("Summary1.As2", can_path, output_folder)
can_summary1_as3 = process_summary_sheet("Summary1.As3", can_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 15 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'D.  Non-energy products from fuels and solvent use ', 'E.  Electronic industry ', 'G.  Other product manufacture and use ']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total rows

In [19]:
combine_and_save_data('can')


Processing total...
Reading data/processed_data/can\total\can_Summary1.As1_1990_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1991_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1992_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1993_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1994_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1995_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1996_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1997_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1998_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_1999_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2000_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2001_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2002_total.csv
Reading data/processed_data/can\total\can_Summary1.As1_2003_total.csv

In [20]:
#test diff countries
# Main execution
est_path = "data/est"
output_folder = "data/processed_data"
est_summary1_as1 = process_summary_sheet("Summary1.As1", est_path, output_folder)
est_summary1_as2 = process_summary_sheet("Summary1.As2", est_path, output_folder)
est_summary1_as3 = process_summary_sheet("Summary1.As3", est_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 13 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', 'B. Fugitive emissions from fuels', '2.  Oil and natural gas and other emissions from energy production', 'A.  Mineral industry', 'B.  Chemical industry']

Subsectors for 2.  Industrial processes and product use:
['D.  Non-energy products from fuels and solvent use ', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 13 subsector ro

In [21]:
combine_and_save_data('est')


Processing total...
Reading data/processed_data/est\total\est_Summary1.As1_1990_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1991_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1992_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1993_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1994_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1995_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1996_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1997_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1998_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_1999_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2000_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2001_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2002_total.csv
Reading data/processed_data/est\total\est_Summary1.As1_2003_total.csv

In [22]:
#test diff countries
# Main execution
usa_path = "data/usa"
output_folder = "data/processed_data"
usa_summary1_as1 = process_summary_sheet("Summary1.As1", usa_path, output_folder)
usa_summary1_as2 = process_summary_sheet("Summary1.As2", usa_path, output_folder)
usa_summary1_as3 = process_summary_sheet("Summary1.As3", usa_path, output_folder)

Processing Summary1.As1...

Processing Summary:
Found 1 total rows
Found 2 sector rows
Found 16 subsector rows

Detailed breakdown:

Total categories:
['Total national emissions and removals']

Sectors:
['1. Energy', '2.  Industrial processes and product use']

Subsectors for 1. Energy:
['A. Fuel combustion     Reference approach(2)', '                                           Sectoral approach(2)', '1.  Energy industries', '2.  Manufacturing industries and construction                          ', '3.  Transport', '4.  Other sectors', '5.  Other', 'B. Fugitive emissions from fuels', '1.  Solid fuels', '2.  Oil and natural gas and other emissions from energy production']

Subsectors for 2.  Industrial processes and product use:
['B.  Chemical industry', 'C.  Metal industry', 'E.  Electronic industry ', 'F.  Product uses as substitutes for ODS', 'G.  Other product manufacture and use ', 'H.  Other(3)']
Successfully processed Summary1.As1 for year 1990

Processing Summary:
Found 1 total 

In [23]:
combine_and_save_data('usa')


Processing total...
Reading data/processed_data/usa\total\usa_Summary1.As1_1990_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1991_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1992_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1993_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1994_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1995_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1996_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1997_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1998_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_1999_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2000_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2001_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2002_total.csv
Reading data/processed_data/usa\total\usa_Summary1.As1_2003_total.csv