In [1]:
pip install -rq requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'q'


In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
# import qgrid

In [6]:
# Load the two raw inputs: df_1 is the emissions profile table, df_2 the historic table.
df_1, df_2 = (
    pd.read_csv(csv_path)
    for csv_path in ('emissions_profile_data.csv', 'historic_emissions_data.csv')
)
# Show both shapes as the cell output
df_1.shape, df_2.shape

((41, 23), (120, 11))

In [None]:
"""
This is a description of the steps to be taken to clean and transform the data beforehand.

1. Combine emissions data with the historic data to work on one dataframe
1.5 Rename certain columns to make them easier to work with

Data Cleaning Steps:

2. Replace all terms that represent missing data with NaN
3. Check all year columns are 4 numbers long, if not replace with -1 for now
4. Check all emissions columns are numbers and positive

Data Transformation Steps:

5. Replace Net Zero target year with 2050 if it is not present
6. Replace baseline year of emissions with most recent year of emissions if it is not present
7. Create new columns called Net Zero target == 0.99*Baseline year emissions for each scope
8. Create new columns called scope 1, scope 2 and scope 3 emissions on the interim year == (1-interim target)*baseline year emissions

9. Remove rows where there is no baseline emission or target information for any of the scopes


From here we can carry out each question in the assignment

a) Design a data pipeline to cleanse the raw data and produce a forecast for yearly
carbon emissions from Baseline year to 2050. The forecast should follow a linear
yearly decrease between baseline emissions and both interim target years, and
ultimate net zero year. The forecast should be produced for each firm for each scope,
as displayed in example output (A).

"""

In [7]:
""" 1. Combine emissions data with the historic data to work on one dataframe """

# Collapse the historic table to one row per verdantix_id; every other column
# becomes a list holding that firm's values in their original row order.
df_2_agg = df_2.groupby('verdantix_id').agg(list).reset_index()

# Spread each list column into numbered columns: <col>_1, <col>_2, ...
expanded_columns = {}
for col in df_2_agg.columns.drop('verdantix_id'):
    width = df_2_agg[col].map(len).max()  # longest list in this column
    numbered_names = [f"{col}_{position}" for position in range(1, width + 1)]
    expanded_columns[col] = pd.DataFrame(df_2_agg[col].tolist(), columns=numbered_names)

# Put the id column back in front of all the numbered columns
expanded_data = pd.concat(
    [df_2_agg[['verdantix_id']], *expanded_columns.values()],
    axis=1,
)

# Left-join onto the profile table so every profile row is kept
merged_df = df_1.merge(expanded_data, on='verdantix_id', how='left')

print('Stage 1 - Merging the two dataframes - Done')

# NOTE(review): the column list printed below shows 'Scope 2 ' rather than
# 'Scope 2  Emissions' at this point, so this rename appears to match nothing here.
merged_df.rename(columns={'Scope 2  Emissions': 'Scope 2 Emissions'}, inplace=True)

# There is no Scope 3 emissions-year column in the input; reuse the Scope 1 value
merged_df['Scope 3 Emissions Year'] = merged_df['Scope 1 Emissions Year']

# Show the merged column names
print(merged_df.columns)

Stage 1 - Merging the two dataframes - Done
Index(['verdantix_id', 'Scope 1 Emissions Year', 'Scope 1 Emissions',
       'Scope 2 Emissions Year', 'Scope 2 - Location', 'Scope 2 ',
       'Scope 2 - market', 'Scope 3', 'Baseline Year (Scope 1)',
       'Base Year Emissions - Scope 1', 'Baseline Year (Scope 2)',
       'Base Year Emissions - Scope 2', 'Baseline Year (Scope 3)',
       'Base Year Emissions - Scope 3', 'NZ Target Year: Scope 1',
       'NZ Target Year: Scope 2', 'NZ Target Year: Scope 3',
       'Interim Target Year 1: Scope 1', 'Interim Target Year 2: Scope 2',
       'Interim Target Year 2: Scope 3', 'Interim Target % 1: Scope 1',
       'Interim Target % 1: Scope 2', 'Interim Target % 1: Scope 3',
       'lookup_1', 'lookup_2', 'lookup_3', 'Scope _1', 'Scope _2', 'Scope _3',
       'Actual/Projection_1', 'Actual/Projection_2', 'Actual/Projection_3',
       '2016_1', '2016_2', '2016_3', '2017_1', '2017_2', '2017_3', '2018_1',
       '2018_2', '2018_3', '2019_1', '2019_2

In [8]:
""" 1.5 Rename certain columns to make them easier to work with """

# Columns that hold emission figures but do not say so in their name:
# the four profile columns, then one column per historic year (2016-2022)
# and per numbered slot (_1, _2, _3).
column_name_change_list = [
    'Scope 2 - Location',
    'Scope 2 ',
    'Scope 2 - market',
    'Scope 3',
] + [
    f'{year}_{slot}'
    for year in range(2016, 2023)
    for slot in (1, 2, 3)
]


# Append ' Emissions' to every listed column that is actually present,
# applied as a single rename instead of one rename per column.
emissions_suffix_map = {
    col: col + ' Emissions'
    for col in column_name_change_list
    if col in merged_df.columns
}
merged_df.rename(columns=emissions_suffix_map, inplace=True)

In [9]:
""" 2. Replace all terms that represent missing data with NaN """

def replace_values(df, old_value, new_value):
    """Return a new frame in which string cells containing ``old_value`` are replaced.

    Matching is case-insensitive and substring-based, so a cell only needs to
    contain ``old_value`` somewhere to be replaced. Non-string cells are
    returned unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    old_value : str
        Text to look for (case-insensitive).
    new_value : object
        Replacement written to every matching cell.

    Returns
    -------
    pandas.DataFrame
        A frame of the same shape with matching cells replaced.
    """
    # Lower-case the search term once instead of once per cell
    needle = old_value.lower()

    def _swap(cell):
        # Containment already covers exact equality, so one test is enough
        if isinstance(cell, str) and needle in cell.lower():
            return new_value
        return cell

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
    # Look the method up on the class so a column named 'map' cannot shadow it.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    return elementwise(df, _swap)

# Blank out every textual "missing" marker in merged_df, one marker at a time
for missing_marker in ('Not stated', 'Not disclosed', 'None'):
    merged_df = replace_values(merged_df, missing_marker, None)
#print(merged_df)

print('Stage 2 - Missing values replaced with NaN - Done')

Stage 2 - Missing values replaced with NaN - Done


In [175]:

# test_1=merged_df[year_columns]

# for column_name in test_1.columns:
#     unique_values = test_1[column_name].unique()
#     print(f"Unique values in {column_name}: {unique_values}")

In [324]:
""" 3. Check all year columns are 4 numbers long, if not change to -1 for now """

# Columns with 'year' in their name...
year_columns = [col for col in merged_df.columns if 'year' in col.lower()]
# ...keeping a name that also mentions 'emissions' only when 'year' comes after it
# (keeps 'Scope 1 Emissions Year', drops 'Base Year Emissions - Scope 1')
year_columns = [col for col in year_columns if 'emissions' not in col.lower() or 'year' in col.lower().split('emissions')[1]]

# Missing years become the -1 sentinel; everything is then handled as integers
years = merged_df[year_columns].fillna(-1).astype(int)

# Two-digit years between 15 and 50 are read as 2015-2050
years = years.mask((years >= 15) & (years <= 50), years + 2000)

# Anything that is not a four-digit year becomes -1. A numeric range check is
# used because a string-length check also accepts values such as -123.
years = years.where((years >= 1000) & (years <= 9999), -1)

# Store the cleaned years back as object dtype
merged_df[year_columns] = years.astype(object)

print('Stage 3 - Year columns are 4 numbers long, if not change to -1 for now - Done')

# # Check if all year columns are 4 numbers long
# invalid_year_counts = merged_df[year_columns].applymap(lambda x: len(str(x)) != 4 if x is not None else False).sum()
# print(invalid_year_counts)


Stage 3 - Year columns are 4 numbers long, if not change to -1 for now - Done


In [325]:
""" 4. Check all emissions columns are numbers and positive, if not change to -1 for now """

# Columns with 'emissions' in their name...
emissions_columns = [col for col in merged_df.columns if 'emissions' in col.lower()]

# ...keeping a name that also mentions 'year' only when 'emissions' comes after it
# (keeps 'Base Year Emissions - Scope 1', drops 'Scope 1 Emissions Year')
emissions_columns = [col for col in emissions_columns if 'year' not in col.lower() or 'emissions' in col.lower().split('year')[1]]

# Coerce to numbers first so None and stray text become NaN instead of raising
# on the comparison below.
emissions = merged_df[emissions_columns].apply(pd.to_numeric, errors='coerce')

# Negative and missing values become the -1 sentinel. Vectorised, which also
# avoids DataFrame.applymap (deprecated since pandas 2.1).
# NOTE(review): the integer cast truncates any fractional emissions - confirm that is intended.
merged_df[emissions_columns] = emissions.where(emissions >= 0, -1).astype(int)

print('Stage 4 - Emissions columns are numbers and positive - Done')
#print(emissions_columns)
#print(emissions_columns)

Stage 4 - Emissions columns are numbers and positive - Done


In [326]:
""" 5. Replace Net Zero target year with 2050 if it is not present """

# One net-zero target-year column per scope
nz_target_columns = [f'NZ Target Year: Scope {scope}' for scope in (1, 2, 3)]

# A -1 sentinel means no target year was given; default those to 2050
nz_target_years = merged_df[nz_target_columns]
merged_df[nz_target_columns] = nz_target_years.replace(to_replace=-1, value=2050)

print('Stage 5 - Net Zero target year replaced with 2050 if not present - Done')


Stage 5 - Net Zero target year replaced with 2050 if not present - Done


In [327]:
# # Unique values in each column of df_1
# for column_name in nz_target_columns:
#     unique_values = merged_df[column_name].unique()
#     print(f"Unique values in {column_name}: {unique_values}")

In [328]:
""" 6. Replace baseline year of emmissons with most recent year of emmissions if it is not present """

# This step used to be fully commented out while still printing "Done", which
# left -1 baseline years in the data for the forecast step to expand from.
#
# For each scope, a row whose baseline year is the -1 sentinel gets the most
# recent historic year (2022 back to 2016) that has a non-negative emissions
# figure. When that row's baseline emissions are also -1, they are taken from
# that same historic column so the baseline year has a value to start from.
#
# NOTE(review): this pairs the historic '<year>_<n> Emissions' columns with
# Scope n, the same pairing the later per-scope collapse relies on - confirm
# the historic rows are ordered by scope for every firm.
historic_years_newest_first = range(2022, 2015, -1)

for scope in (1, 2, 3):
    baseline_year_col = f'Baseline Year (Scope {scope})'
    baseline_emissions_col = f'Base Year Emissions - Scope {scope}'
    needs_year = merged_df[baseline_year_col] == -1

    for year in historic_years_newest_first:
        historic_col = f'{year}_{scope} Emissions'
        if historic_col not in merged_df.columns:
            continue

        # ">= 0" rather than "!= -1" so NaN is also treated as "no figure"
        has_figure = pd.to_numeric(merged_df[historic_col], errors='coerce') >= 0
        fill_year = needs_year & has_figure
        merged_df.loc[fill_year, baseline_year_col] = year

        fill_emissions = fill_year & (merged_df[baseline_emissions_col] == -1)
        merged_df.loc[fill_emissions, baseline_emissions_col] = merged_df.loc[fill_emissions, historic_col]

        # Rows filled from a newer year must not be overwritten by an older one
        needs_year = needs_year & ~fill_year

print('Stage 6 - Baseline year of emmissons with most recent year of emmissions if it is not present - Done')


Stage 6 - Baseline year of emmissons with most recent year of emmissions if it is not present - Done


In [330]:
""" 7. Create a new columns called Net Zero target == 0.99*Baseline year emmissions for each scope """

base_year_emissions_columns = ['Base Year Emissions - Scope 1', 'Base Year Emissions - Scope 2', 'Base Year Emissions - Scope 3']

# Net-zero reduction target per scope = 99% of the baseline emissions.
# A negative baseline is the "missing" sentinel (-1); scaling it would give a
# bogus -0.99, so those rows are left as NaN instead.
for i, col in enumerate(base_year_emissions_columns):
    baseline = merged_df[col]
    merged_df[f'Net Zero Reduction Target Emissions - Scope {i+1}'] = (0.99 * baseline).where(baseline >= 0)

print('Stage 7 - New columns called Net Zero target == 0.99*Baseline year emmissions for each scope - Done')

Stage 7 - New columns called Net Zero target == 0.99*Baseline year emmissions for each scope - Done


In [331]:
""" 8. Create new columns called scope 1, scope 2 and scope 3 emissions on the interim year == (1-interim target)*baseline year emissions """

# The Scope 2 and Scope 3 interim-year columns are labelled "Year 2" in the
# input; rename them so all three scopes share the "Year 1" naming.
merged_df.rename(columns={
    'Interim Target Year 2: Scope 2': 'Interim Target Year 1: Scope 2',
    'Interim Target Year 2: Scope 3': 'Interim Target Year 1: Scope 3'
}, inplace=True)

interim_target_columns = ['Interim Target Year 1: Scope 1', 'Interim Target Year 1: Scope 2', 'Interim Target Year 1: Scope 3']

# Fill a missing (-1) interim year from the first other scope that has one.
# A year that is already present is kept: the previous version replaced it
# with another scope's year whenever one existed.
for col in interim_target_columns:
    other_cols = [c for c in interim_target_columns if c != col]
    merged_df[col] = merged_df.apply(
        lambda row: row[col] if row[col] != -1
        else next((row[other_col] for other_col in other_cols if row[other_col] != -1), -1),
        axis=1,
    )

# Interim target emissions = (1 - interim target %) * baseline emissions
# NOTE(review): this assumes the interim % is stored as a fraction (0.3, not 30) - confirm.
base_year_emissions_columns = ['Base Year Emissions - Scope 1', 'Base Year Emissions - Scope 2', 'Base Year Emissions - Scope 3']

for i, col in enumerate(base_year_emissions_columns):
    interim_target_col = f'Interim Target % 1: Scope {i+1}'
    baseline = merged_df[col]
    # A negative baseline is the "missing" sentinel; leave the target as NaN
    # rather than computing a negative emissions figure from it.
    merged_df[f'Interim Target Emissions - Scope {i+1}'] = ((1 - merged_df[interim_target_col]) * baseline).where(baseline >= 0)

#print('Stage 8 - New columns called interim target emissions for each scope - Done')

#print('Stage 8 - New columns called interim target emissions for each scope - Done')

In [332]:
""" 9. Remove rows where there is no baseline emmission or target information for any of the scopes """

baseline_emissions_columns = [f'Base Year Emissions - Scope {scope}' for scope in (1, 2, 3)]
target_info_columns = [f'Interim Target Year 1: Scope {scope}' for scope in (1, 2, 3)]

# A row is dropped only when every baseline-emissions column AND every
# interim-target-year column holds the -1 sentinel.
no_baseline = merged_df[baseline_emissions_columns].eq(-1).all(axis=1)
no_target = merged_df[target_info_columns].eq(-1).all(axis=1)
merged_df.drop(index=merged_df.index[no_baseline & no_target], inplace=True)

print('Stage 9 - Remove rows where there is no baseline emission or target information for any of the scopes - Done')


Stage 9 - Remove rows where there is no baseline emission or target information for any of the scopes - Done


In [333]:
""" 
Task
a) Design a data pipeline to cleanse the raw data and produce a forecast for yearly
carbon emissions from Baseline year to 2050. The forecast should follow a linear
yearly decrease between baseline emissions and both interim target years, and
ultimate net zero year. The forecast should be produced for each firm for each scope,
as displayed in example output (A).

b) Alongside this forecast data, produce ‘actual data’ to display the distance that each
firm is from their current target.

In our example output we have the following columns: 

-Lookup
-verdantix_id
-Actual/Projected
-Year
-Emmissions(tCO2e)

Steps:
1. Combine columns for each scope into one column
2. Create a new dataframe to store the forecast emissions
3. Create a new dataframe to store the actual emissions ( b )
4. Merge the forecast and actual emissions dataframes


"""

' \nTask\na) Design a data pipeline to cleanse the raw data and produce a forecast for yearly\ncarbon emissions from Baseline year to 2050. The forecast should follow a linear\nyearly decrease between baseline emissions and both interim target years, and\nultimate net zero year. The forecast should be produced for each firm for each scope,\nas displayed in example output (A).\n'

In [559]:
# Working frame for the task: drop the two alternative Scope 2 figures, tidy
# the Scope 2 column name and give Scope 3 the Scope 1 emissions year.
df_task = (
    merged_df
    .drop(columns=['Scope 2 - Location Emissions', 'Scope 2 - market Emissions'])
    .rename(columns={'Scope 2  Emissions': 'Scope 2 Emissions'})
    .assign(**{'Scope 3 Emissions Year': lambda frame: frame['Scope 1 Emissions Year']})
)

In [560]:
""" 1. Combine columns for each scope into one column """

# Turn each firm row into three rows, one per scope slot, carrying every other
# column along unchanged.
scope_slot_columns = ['Scope _1', 'Scope _2', 'Scope _3']
carried_columns = [col for col in df_task.columns if col not in scope_slot_columns]

reduced_df_1 = df_task.melt(
    id_vars=carried_columns,
    value_vars=scope_slot_columns,
    var_name='Scope',
    value_name='Scope Value',
)

# The slot name becomes the scope number 1, 2 or 3
reduced_df_1['Scope'] = reduced_df_1['Scope'].map(dict(zip(scope_slot_columns, (1, 2, 3))))

# The melted value itself is not used afterwards
reduced_df_1 = reduced_df_1.drop(columns='Scope Value')


In [561]:
# For every historic year 2016-2022, keep only the figure belonging to the
# row's own scope and drop the three per-slot columns.

def _emissions_for_own_scope(row, year):
    """Return the row's '<year>_<Scope> Emissions' value."""
    return row[f'{year}_{row["Scope"]} Emissions']

for year in range(2016, 2023):
    reduced_df_1[f'{year} Emissions'] = reduced_df_1.apply(_emissions_for_own_scope, axis=1, args=(year,))
    per_slot_columns = [f'{year}_{slot} Emissions' for slot in (1, 2, 3)]
    reduced_df_1.drop(columns=per_slot_columns, inplace=True)

#reduced_df_1[reduced_df_1['verdantix_id'] == 'V000001']

In [562]:
def combine_column_on_scope_number(df, new_col_name, old_col_names):
    """Collapse ``<prefix>_1``, ``<prefix>_2``, ... columns into one column.

    The prefix is taken from the first name in ``old_col_names`` (everything
    before its last underscore). Each row receives the value of the
    ``<prefix>_<Scope>`` column matching its own ``Scope`` value.

    ``df`` is modified in place: ``new_col_name`` is added, every column in
    ``old_col_names`` is removed, and the same frame is returned.
    """
    prefix = old_col_names[0].rsplit('_', 1)[0]

    def _pick_for_scope(row):
        return row[f'{prefix}_{row["Scope"]}']

    df[new_col_name] = df.apply(_pick_for_scope, axis=1)
    df.drop(columns=old_col_names, inplace=True)
    return df

def combine_column_on_scope(df, new_col_name, old_col_names):
    """Collapse ``<prefix> - Scope 1/2/3`` columns into one column.

    The prefix is taken from the first name in ``old_col_names`` (everything
    before its last hyphen, with surrounding whitespace removed). Each row
    receives the value of the ``<prefix> - Scope <Scope>`` column matching its
    own ``Scope`` value.

    ``df`` is modified in place: ``new_col_name`` is added, every column in
    ``old_col_names`` is removed, and the same frame is returned, matching
    ``combine_column_on_scope_number``.

    Two defects are fixed here. The prefix used to keep its trailing space, so
    the lookup key had a double space ('X  - Scope 1') and could not match a
    column named 'X - Scope 1'. The function also returned the prefix string
    instead of the frame.
    """
    # 'Base Year Emissions - Scope 1' -> 'Base Year Emissions'
    common_part = old_col_names[0].rsplit('-', 1)[0].strip()

    def _pick_for_scope(row):
        return row[f'{common_part} - Scope {row["Scope"]}']

    df[new_col_name] = df.apply(_pick_for_scope, axis=1)
    df.drop(columns=old_col_names, inplace=True)
    return df

In [563]:
# Each entry maps a combined column name to the naming pattern of its three
# per-scope source columns. For every entry the row's own scope picks the
# value, then the three source columns are dropped. Order matches the order in
# which the combined columns are appended to the frame.
scope_column_templates = {
    'Emissions': 'Scope {scope} Emissions',
    'Emission Year': 'Scope {scope} Emissions Year',
    'Baseline Year': 'Baseline Year (Scope {scope})',
    'Net Zero Reduction Target Emissions': 'Net Zero Reduction Target Emissions - Scope {scope}',
    'Interim Target Emissions': 'Interim Target Emissions - Scope {scope}',
    'Interim Target % 1': 'Interim Target % 1: Scope {scope}',
    'Base Year Emissions': 'Base Year Emissions - Scope {scope}',
    'NZ Target Year': 'NZ Target Year: Scope {scope}',
    'Interim Target Year 1': 'Interim Target Year 1: Scope {scope}',
}


def _value_for_own_scope(row, template):
    """Return the value of the per-scope column matching the row's Scope."""
    return row[template.format(scope=row['Scope'])]


for combined_name, template in scope_column_templates.items():
    reduced_df_1[combined_name] = reduced_df_1.apply(_value_for_own_scope, axis=1, args=(template,))
    per_scope_columns = [template.format(scope=scope) for scope in (1, 2, 3)]
    reduced_df_1.drop(columns=per_scope_columns, inplace=True)


# The historic columns use the '<name>_<n>' pattern, which the helper handles
reduced_df_1 = combine_column_on_scope_number(
    reduced_df_1,
    'Actual/Projection',
    ['Actual/Projection_1', 'Actual/Projection_2', 'Actual/Projection_3'],
)

reduced_df_1 = combine_column_on_scope_number(
    reduced_df_1,
    'lookup',
    ['lookup_1', 'lookup_2', 'lookup_3'],
)


In [564]:
# Forecast input: the per-scope frame without the historic year figures and
# the other columns that only describe reported emissions.
historic_only_columns = [f'{year} Emissions' for year in range(2016, 2023)]
historic_only_columns += ['Actual/Projection', 'Emissions', 'Emission Year']
reduced_df_2 = reduced_df_1.drop(columns=historic_only_columns)

In [565]:
""" 2. Create a new dataframe to store the forecast emissions """

# Build one output row per firm/scope and per year from the baseline year to
# the net-zero target year (inclusive).
expanded_rows = []

for record in reduced_df_2.to_dict('records'):
    base_year = record['Baseline Year']
    net_zero_year = record['NZ Target Year']

    # A missing baseline year carries the -1 sentinel; expanding from it would
    # create rows for years -1, 0, 1, ... so such rows are skipped.
    if pd.isna(base_year) or pd.isna(net_zero_year) or base_year < 0:
        continue

    for year in range(int(base_year), int(net_zero_year) + 1):
        expanded_rows.append({**record, 'Year': year})

# Explicit columns keep the expected layout even if no row qualified
expanded_df = pd.DataFrame(expanded_rows, columns=[*reduced_df_2.columns, 'Year'])

In [566]:
# Fill 'Emissions (tCO2e)' at the anchor years only; every other year gets the
# -1 placeholder, which the next step turns into NaN and interpolates.

def _anchor_emissions(row):
    """Return the known emissions for an anchor year, or -1 for any other year.

    Anchors are checked in this order: the baseline year (baseline emissions),
    the interim target year (interim target emissions) and the net-zero year
    (1% of baseline emissions). A missing or negative baseline returns -1 for
    every year, because 1% of the -1 sentinel would otherwise leave a -0.01
    that the later -1 replacement does not catch.
    """
    baseline = row['Base Year Emissions']
    if pd.isna(baseline) or baseline < 0:
        return -1
    if row['Baseline Year'] == row['Year']:
        return baseline
    if row['Interim Target Year 1'] == row['Year']:
        return row['Interim Target Emissions']
    if row['NZ Target Year'] == row['Year']:
        return baseline * 0.01
    return -1


expanded_df['Emissions (tCO2e)'] = expanded_df.apply(_anchor_emissions, axis=1)

In [567]:
# Positional index: rows expanded from the same source row can share a label
expanded_df = expanded_df.reset_index(drop=True)

# Every negative value is a placeholder or derives from the -1 sentinel
# (for example -0.01 at the net-zero year), so all of them become NaN.
anchor_values = pd.to_numeric(expanded_df['Emissions (tCO2e)'], errors='coerce')
expanded_df['Emissions (tCO2e)'] = anchor_values.mask(anchor_values < 0)

# Interpolate linearly within each firm/scope. Interpolating the whole column
# at once lets values bleed from the end of one firm's series into the start
# of the next when that next series has no baseline.
expanded_df['Emissions (tCO2e)'] = (
    expanded_df
    .groupby(['verdantix_id', 'Scope'], sort=False)['Emissions (tCO2e)']
    .transform(lambda series: series.interpolate(method='linear'))
    .to_numpy()
)

# Every row in this frame is a forecast value
expanded_df['Actual/Projection'] = 'Projection'

# Drop the helper columns; errors='ignore' keeps the cell re-runnable
expanded_df.drop(
    columns=['Net Zero Reduction Target Emissions', 'Interim Target Emissions', 'Interim Target % 1', 'Base Year Emissions', 'NZ Target Year', 'Interim Target Year 1', 'Baseline Year'],
    inplace=True,
    errors='ignore',
)

# Output column order
expanded_df_projected = expanded_df[['lookup', 'verdantix_id', 'Scope', 'Actual/Projection', 'Year', 'Emissions (tCO2e)']]


In [571]:
""" 3. Create a new dataframe to store the actual emissions """
# Reshape the historic figures into the same long layout as the forecast.

# Columns that only describe the forecast are not needed here
forecast_only_columns = ['Net Zero Reduction Target Emissions', 'Interim Target Emissions', 'Interim Target % 1', 'Base Year Emissions', 'NZ Target Year', 'Interim Target Year 1', 'Emissions', 'Emission Year', 'Baseline Year']

expanded_df_actual = (
    reduced_df_1
    .drop(columns=forecast_only_columns)
    .melt(id_vars=['verdantix_id', 'Scope', 'Actual/Projection', 'lookup'], var_name='Year', value_name='Emissions (tCO2e)')
)

# '2016 Emissions' -> 2016. Stored as an integer so it matches the forecast
# frame's 'Year' column; it used to be left as a string.
expanded_df_actual['Year'] = expanded_df_actual['Year'].str.extract(r'(\d{4})', expand=False).astype(int)

# Remove rows where emissions are -1 (no figure); copy so the assignment below
# does not write to a slice of the melted frame
expanded_df_actual = expanded_df_actual[expanded_df_actual['Emissions (tCO2e)'] != -1].copy()

# Output column order
expanded_df_actual = expanded_df_actual[['lookup', 'verdantix_id', 'Scope', 'Actual/Projection', 'Year', 'Emissions (tCO2e)']]

expanded_df_actual['Emissions (tCO2e)'] = expanded_df_actual['Emissions (tCO2e)'].astype(float)

In [573]:
""" 4. Merge the forecast and actual emissions dataframes """
# Stack the forecast rows on top of the actual rows
output_frames = [expanded_df_projected, expanded_df_actual]
final_df = pd.concat(output_frames, axis='index')


In [574]:
# Write the combined forecast + actual table to 'final_df.csv' without the index column
final_df.to_csv('final_df.csv', index=False)

In [4]:
""" 
Task 

c) Produce an ‘implied overshoot’ forecast which plots the projected net zero year for
corporates whose actual emissions are higher than forecast emissions for the most
recent year.


"""
# Displays final_df; the cells above that build it must have run in this
# kernel session first, otherwise this raises NameError.
final_df

NameError: name 'final_df' is not defined