In [5]:
import pandas as pd
from pathlib import Path

# Folder containing the 5 NGFS Phase-5 Excel files
# Example structure (adjust if your folder name is different):
# Datasets/NGFS-Phase-5/
ngfs_folder = Path("Datasets") / "NGFS-Phase-5"

# Excel writes a hidden "~$<name>.xlsx" owner/lock file next to every workbook
# that is currently open. Those files match "*.xlsx" but are not readable
# workbooks, so they are skipped here. sorted() keeps the order deterministic.
ngfs_files = sorted(
    path for path in ngfs_folder.glob("*.xlsx") if not path.name.startswith("~$")
)
print(f"Found {len(ngfs_files)} NGFS files:")
for f in ngfs_files:
    print(" -", f.name)

Found 5 NGFS files:
 - Downscaled_GCAM 6.0 NGFS_data.xlsx
 - Downscaled_MESSAGEix-GLOBIOM 2.0-M-R12-NGFS_data.xlsx
 - Downscaled_REMIND-MAgPIE 3.3-4.8_data.xlsx
 - IAM_data.xlsx
 - NiGEM_data.xlsx


In [6]:
def melt_ngfs_years(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide NGFS table into one row per series and year.

    Every column whose label consists solely of digits is treated as a year
    column. The identifier columns Model, Scenario, Region, Variable and Unit
    must be present; any other non-year column is not carried over.

    Returns a new frame holding the identifier columns, an integer ``Year``
    column and the matching ``Value`` column.
    """
    identifier_columns = ["Model", "Scenario", "Region", "Variable", "Unit"]

    # Collect the year columns in their original left-to-right order.
    year_columns = []
    for column in df.columns:
        if str(column).isdigit():
            year_columns.append(column)

    melted = pd.melt(
        df,
        id_vars=identifier_columns,
        value_vars=year_columns,
        var_name="Year",
        value_name="Value",
    )
    # The melted labels may still be strings; store them as integers.
    return melted.assign(Year=melted["Year"].astype(int))

In [7]:
def clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column labels.

    Every label is converted to ``str``. Each run of whitespace (spaces, tabs,
    line breaks) is collapsed into a single space, and leading/trailing
    whitespace is removed. The input frame is left unmodified.
    """
    cleaned = df.copy()
    cleaned.columns = (
        cleaned.columns.astype(str)
        # A single regex pass collapses whitespace runs of any length. A
        # literal "two spaces -> one space" replacement would leave a double
        # space behind for labels such as "A \n B".
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    return cleaned

In [8]:
# Read every discovered workbook, tidy its header labels and tag each row
# with the provider before stacking everything into one frame.
ngfs_frames = []
for f in ngfs_files:
    print(f"Reading: {f.name}")
    df_raw = clean_cols(pd.read_excel(f))

    # Provider metadata column (same value for every NGFS file)
    df_raw["Provider"] = "NGFS-Phase-5"

    # melt_ngfs_years is not applied here, so the frames keep their wide
    # one-column-per-year layout.
    ngfs_frames.append(df_raw)

ngfs_data = pd.concat(ngfs_frames, ignore_index=True)
ngfs_data.head()

Reading: Downscaled_GCAM 6.0 NGFS_data.xlsx
Reading: Downscaled_MESSAGEix-GLOBIOM 2.0-M-R12-NGFS_data.xlsx
Reading: Downscaled_REMIND-MAgPIE 3.3-4.8_data.xlsx
Reading: IAM_data.xlsx
Reading: NiGEM_data.xlsx


Unnamed: 0,Model,Scenario,Region,Variable,Unit,2020,2021,2022,2023,2024,...,2092,2093,2094,2095,2096,2097,2098,2099,2100,Provider
0,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Carbon Sequestration|CCS,Mt CO2/yr,0.0,,,,,...,,,,,,,,,,NGFS-Phase-5
1,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Carbon Sequestration|CCS|Biomass,Mt CO2/yr,0.0,,,,,...,,,,,,,,,,NGFS-Phase-5
2,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Carbon Sequestration|CCS|Fossil,Mt CO2/yr,0.0,,,,,...,,,,,,,,,,NGFS-Phase-5
3,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Carbon Sequestration|CCS|Industrial Processes,Mt CO2/yr,0.0,,,,,...,,,,,,,,,,NGFS-Phase-5
4,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Emissions|CO2,Mt CO2/yr,148.1421,,,,,...,,,,,,,,,,NGFS-Phase-5


In [17]:
# Export a sorted helper list of the distinct NGFS variable names
# (similar style as AR6 section)
variables_list_path = Path("Datasets/processed/NGFS-Phase-5/Variables_list.csv")
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
variables_list_path.parent.mkdir(parents=True, exist_ok=True)

# dropna(): a missing Variable would make sorted() fail, because NaN (a float)
# cannot be ordered against the string names.
ngfs_variables = pd.DataFrame(
    {"Variable": sorted(ngfs_data["Variable"].dropna().unique())}
)
ngfs_variables.to_csv(variables_list_path, index=False)

In [18]:
# Export a sorted helper list of the distinct NGFS scenario names.
scenarios_list_path = Path("Datasets/processed/NGFS-Phase-5/Scenarios_list.csv")
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
scenarios_list_path.parent.mkdir(parents=True, exist_ok=True)

# dropna(): a missing Scenario would make sorted() fail, because NaN (a float)
# cannot be ordered against the string names.
ngfs_scenarios = pd.DataFrame(
    {"Scenario": sorted(ngfs_data["Scenario"].dropna().unique())}
)
ngfs_scenarios.to_csv(scenarios_list_path, index=False)

In [19]:
# Size of the concatenated NGFS frame as (rows, columns).
ngfs_data.shape

(1281478, 87)

In [20]:
# Save the full merged NGFS Phase-5 extract.
# NOTE: despite the "_long" suffix in the file name, ngfs_data is still in wide
# format (one column per year) because the melt step is not applied when the
# workbooks are loaded. The file name is left unchanged so the output location
# stays the same.
# Path mirrors the AR6 export style used above.
output_path = Path("Datasets/processed/NGFS-Phase-5/Merged_NGFS_Phase-5_long.csv")
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
ngfs_data.to_csv(output_path, index=False)
output_path

WindowsPath('Datasets/processed/NGFS-Phase-5/Extracted_NGFS_Phase-5_long.csv')

In [28]:
# Distinct, non-missing variable names, wrapped in a Series so the
# vectorised string matching below can be used.
ngfs_vars = ngfs_data["Variable"].dropna().unique()
ngfs_series = pd.Series(ngfs_vars)

# One case-insensitive pattern per category; every variable name containing
# the pattern lands in that category.
ch4_vars, co2_vars, n2o_vars, aerosol_vars = (
    ngfs_series[ngfs_series.str.contains(pattern, case=False, na=False)]
    for pattern in ("CH4", "CO2", "N2O", "Aerosol|Aerosols|SO2|sulf")
)

# Keep at most the first 20 matches per category, in the order in which the
# names first appear in ngfs_data (this is not a ranking).
top20_ch4, top20_co2, top20_n2o, top20_aerosol = (
    matches.head(20).tolist()
    for matches in (ch4_vars, co2_vars, n2o_vars, aerosol_vars)
)

top20_ch4, top20_co2, top20_n2o, top20_aerosol

(['Emissions|CH4',
  'Emissions|CH4|AFOLU',
  'Emissions|CH4|Energy',
  'Emissions|CH4|Energy|Demand|Industry',
  'Emissions|CH4|Energy|Demand|Residential and Commercial',
  'Emissions|CH4|Energy|Demand|Transportation',
  'Emissions|CH4|Energy|Supply',
  'Emissions|CH4|Other',
  'AR6 climate diagnostics|Effective Radiative Forcing|CH4|MAGICCv7.5.3|50.0th Percentile',
  'AR6 climate diagnostics|Harmonized|Emissions|CH4',
  'AR6 climate diagnostics|Infilled|Emissions|CH4',
  'Emissions|CH4|Waste',
  'Emissions|CH4|Energy|Supply|Gases|Extraction',
  'Emissions|CH4|Energy|Supply|Liquids|Extraction',
  'Emissions|CH4|Energy|Supply|Solids|Extraction'],
 ['Emissions|CO2',
  'Emissions|CO2|Energy',
  'Emissions|CO2|Energy|Demand|Industry',
  'Emissions|CO2|Energy|Demand|Residential and Commercial',
  'Emissions|CO2|Energy|Demand|Transportation',
  'Emissions|CO2|Industrial Processes',
  'Emissions|CO2|LULUCF Direct+Indirect',
  'Emissions|CO2|LULUCF Indirect',
  'Emissions|Total Non-CO2',
  'S

In [29]:
# One subset of the merged NGFS frame per category: the rows whose Variable
# is in that category's selected list.
co2_data, ch4_data, n2o_data, aerosols_data = (
    ngfs_data[ngfs_data["Variable"].isin(selected)]
    for selected in (top20_co2, top20_ch4, top20_n2o, top20_aerosol)
)

In [38]:
# Preview the first rows of the CH4 subset.
ch4_data.head()

Unnamed: 0,Model,Scenario,Region,Variable,Unit,2020,2021,2022,2023,2024,...,2092,2093,2094,2095,2096,2097,2098,2099,2100,Provider
776152,GCAM 6.0 NGFS,Delayed transition,Asia (R5),Emissions|CH4,Mt CH4/yr,137.70151,,,,,...,,,,56.970154,,,,,54.666463,NGFS-Phase-5
776153,GCAM 6.0 NGFS,Delayed transition,Asia (R5),Emissions|CH4|AFOLU,Mt CH4/yr,67.558401,,,,,...,,,,42.53125,,,,,41.156466,NGFS-Phase-5
776154,GCAM 6.0 NGFS,Delayed transition,Asia (R5),Emissions|CH4|Energy,Mt CH4/yr,39.16487,,,,,...,,,,2.065091,,,,,1.955014,NGFS-Phase-5
776155,GCAM 6.0 NGFS,Delayed transition,Asia (R5),Emissions|CH4|Energy|Demand|Industry,Mt CH4/yr,0.627787,,,,,...,,,,0.028091,,,,,0.031641,NGFS-Phase-5
776156,GCAM 6.0 NGFS,Delayed transition,Asia (R5),Emissions|CH4|Energy|Demand|Residential and Co...,Mt CH4/yr,4.895683,,,,,...,,,,0.154129,,,,,0.162463,NGFS-Phase-5


In [57]:
# Stack the per-category subsets (CO2, CH4, N2O, aerosols) into one frame.
final_data = pd.concat([co2_data, ch4_data, n2o_data, aerosols_data], axis=0)

# A variable name can match more than one category pattern, in which case its
# rows appear in several subsets and would be stacked more than once. The
# subsets still carry their row labels at this point, so a repeated label
# marks such a row; keep only its first occurrence.
# NOTE(review): assumes the four subsets are direct slices of the same frame
# with unique row labels - confirm if they are built differently.
final_data = final_data[~final_data.index.duplicated(keep="first")]

# CLEANING
final_data = final_data.replace("", pd.NA)         # empty strings → missing
final_data = final_data.dropna(axis=1, how="all")  # remove columns with no values at all

# OPTIONAL: remove columns of ALL zeros
# final_data = final_data.loc[:, (final_data != 0).any(axis=0)]

# Renumber the rows 0..n-1 now that the original labels are no longer needed.
final_data = final_data.reset_index(drop=True)

In [58]:
# Preview the first rows of the combined, cleaned selection.
final_data.head()

Unnamed: 0,Model,Scenario,Region,Variable,Unit,2020,2025,2030,2035,2040,...,2060,2065,2070,2075,2080,2085,2090,2095,2100,Provider
0,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Emissions|CO2,Mt CO2/yr,148.1421,90.3493,129.774,156.4094,184.2701,...,,,,,,,,,,NGFS-Phase-5
1,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Emissions|CO2|Energy,Mt CO2/yr,16.0474,17.1107,18.8339,19.8972,22.25,...,,,,,,,,,,NGFS-Phase-5
2,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Emissions|CO2|Energy|Demand|Industry,Mt CO2/yr,5.6282,6.6025,7.5784,8.2838,9.7327,...,,,,,,,,,,NGFS-Phase-5
3,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Emissions|CO2|Energy|Demand|Residential and Co...,Mt CO2/yr,4.8467,4.9683,4.5109,3.7087,3.3818,...,,,,,,,,,,NGFS-Phase-5
4,Downscaling[GCAM 6.0 NGFS],Delayed transition,AGO,Emissions|CO2|Energy|Demand|Transportation,Mt CO2/yr,11.504,11.169,11.8655,10.9031,10.3385,...,,,,,,,,,,NGFS-Phase-5


In [59]:
# Distinct scenario names present in the combined selection.
scenario_list = final_data["Scenario"].unique()
scenario_list

array(['Delayed transition', 'Fragmented World', 'Current Policies',
       'Nationally Determined Contributions (NDCs)', 'Net Zero 2050',
       'Below 2°C', 'Low demand'], dtype=object)

In [60]:
# Distinct variable names present in the combined selection.
variable_list = final_data["Variable"].unique()
variable_list

array(['Emissions|CO2', 'Emissions|CO2|Energy',
       'Emissions|CO2|Energy|Demand|Industry',
       'Emissions|CO2|Energy|Demand|Residential and Commercial',
       'Emissions|CO2|Energy|Demand|Transportation',
       'Emissions|CO2|Industrial Processes',
       'Emissions|CO2|LULUCF Direct+Indirect',
       'Emissions|CO2|LULUCF Indirect', 'Emissions|Total Non-CO2',
       'Statistical Difference|Emissions|CO2',
       'Statistical Difference|Emissions|CO2|Energy',
       'Statistical Difference|Emissions|CO2|Industrial Processes',
       'Statistical Difference|Emissions|CO2|LULUCF Direct+Indirect',
       'Statistical Difference|Emissions|Total Non-CO2',
       'Emissions|CO2|Energy|Supply|Other Sector',
       'Emissions|CO2|Energy|Supply|Solids',
       'Investment|Energy Supply|CO2 Transport and Storage',
       'Emissions|CO2|AFOLU',
       'Emissions|CO2|Energy and Industrial Processes',
       'Emissions|CO2|Energy|Demand', 'Emissions|CH4',
       'Emissions|CH4|AFOLU', 'Emi

In [63]:
# Write the combined, cleaned selection to CSV without the row index.
final_data.to_csv("Datasets/processed/NGFS-Phase-5/Extracted_NGFS_Phase-5.csv",index=False)