In [119]:
import pandas as pd

# Load the 5 datasets
df1 = pd.read_csv('DBassign1/2016-Cities.csv')
df2 = pd.read_csv('DBassign1/2016-GHG_Emissions.csv')
df3 = pd.read_csv('DBassign1/2017-Community_Emissions.csv')
df4 = pd.read_csv('DBassign1/2017-Cities_Emission_Reduction_Targets.csv')
df5 = pd.read_csv('DBassign1/2023-Cities_Climate_Risk.csv')

# Load base (org no and name)
dfbase = pd.read_csv('DBassign1/combined_org_no_and_name.csv')

# Print rows and columns for the 5 datasets
dataframes = [df1, df2, df3, df4, df5, dfbase]
for i, df in enumerate(dataframes, start=1):
    print(f"DataFrame {i}: {df.shape[0]} rows, {df.shape[1]} columns")

DataFrame 1: 280 rows, 15 columns
DataFrame 2: 187 rows, 27 columns
DataFrame 3: 229 rows, 31 columns
DataFrame 4: 406 rows, 21 columns
DataFrame 5: 1370 rows, 20 columns
DataFrame 6: 869 rows, 2 columns


In [120]:
# Variables for column names
# old
org_no = 'Organization_id'
org_name = 'Organization_name'
# new
intensity = 'intensity_unit'
gases = 'gases_included'

protocol = 'Protocol'
protocol_col = 'Protocol_column'

repyear = 'reporting_year'
sector = 'sector'
baseyear = 'baseline_year'
baseemissions = 'baseline_emissions'


df1.rename(columns={'Account No': org_no,'Organisation': org_name}, inplace=True)
df2.rename(columns={'Account Number': org_no,'City Name': org_name}, inplace=True)
df3.rename(columns={'Account number': org_no,'Organization': org_name}, inplace=True)
df4.rename(columns={'Account No': org_no,'Organisation': org_name}, inplace=True)
df5.rename(columns={'Organization Number': org_no,'Organization Name': org_name}, inplace=True)

## Normalisation of data for Emissions table

#### Intensity unit data (NOTE: Dropped, too few values)

In [121]:
# Column only exists in df4, specific name of it can be found with the following 2 lines
#cols_df4 = set(df4.columns)
#cols_df4

# Renaming the column, column name found from first running the above
df4.rename(columns={'\u200bIntensity unit (emissions per)': intensity}, inplace=True)

print(df4[intensity].head(3))
print(df4[intensity].unique())
print('----------------------------------------------------')
print('--------------------divider line--------------------')
print('----------------------------------------------------')
print(df4[intensity].value_counts(dropna=False))

0    NaN
1    NaN
2    NaN
Name: intensity_unit, dtype: object
[nan 'Metric tonnes of CO2e per capita'
 'The overall target is -13% total CO2e reduction off business-as-usual scenario by 2022'
 'The overall target is -13% total CO2e reduction off business-as-usual scenario by 2022. The target for transport only is -3.2% CO2e reduction off business-as-usual scenario.'
 'This is for the entire Durham County community, not just the city boundary, and includes residential/commercial/industrial buildings and transportation.'
 'Metric tonnes of CO2e per unit GDP'
 'Chu Chen, Mayor of Kaohsiung City had set up short term GHG reduction target, decreasing 20% by 2020 compared with the emissions in 2005. Therefore, city has to build up its future BAU for comparison with baseline emissions, to estimate how many should city reduce in order to reach the target.'
 'It is a material tom include the agricultural sector due to non-availability of land.']
------------------------------------------------

In [122]:
# From the above, it doesn't make sense to include this column
# (not that relevant)

#### Gasses included data

In [123]:
# Column only exists in df2 and df3

# Renaming the column, column name found from first running the above
df2.rename(columns={'Gases included': gases}, inplace=True)
df3.rename(columns={'Gases included': gases}, inplace=True)

print(df2[gases].head(3))
print(df2[gases].unique())
print('----------------------------------------------------')
print('--------------------divider line--------------------')
print('----------------------------------------------------')
print(df2[gases].value_counts(dropna=False))

print(df3[gases].head(3))
print(df3[gases].unique())
print('----------------------------------------------------')
print('--------------------divider line--------------------')
print('----------------------------------------------------')
print(df3[gases].value_counts(dropna=False))

0                          CO2; CH4; N2O
1    CO2; PFCs; CH4; SF6; N2O; NF3; HFCs
2                          CO2; CH4; N2O
Name: gases_included, dtype: object
['CO2; CH4; N2O' 'CO2; PFCs; CH4; SF6; N2O; NF3; HFCs' 'CO2'
 'CO2; PFCs; CH4; SF6; N2O; HFCs' 'CO2; CH4; SF6; N2O' nan
 'CO2; PFCs; CH4; N2O; HFCs' 'CO2; CH4; N2O; HFCs' 'CO2; CH4'
 'CO2; CH4; SF6' 'CO2; CH4; SF6; N2O; HFCs' 'CH4; N2O'
 'CO2; PFCs; CH4; SF6; N2O' 'CO2; PFCs; CH4; N2O']
----------------------------------------------------
--------------------divider line--------------------
----------------------------------------------------
CO2; CH4; N2O                          82
CO2                                    46
CO2; PFCs; CH4; SF6; N2O; HFCs         24
CO2; PFCs; CH4; SF6; N2O; NF3; HFCs    13
NaN                                     6
CO2; CH4; N2O; HFCs                     4
CO2; CH4                                4
CO2; CH4; SF6; N2O                      2
CO2; PFCs; CH4; N2O; HFCs               1
CO2; CH4; SF6   

In [124]:
# Now we need to add the C40 column to the 'final_df'
# So we need to check if ther are inconsistencies in the C40 value between datasets (say it might be false in 2017 reduction target but true in 
# 2023 climate risk for the same org number)


# Assuming the 'org id' column and 'C40' column are named 'org id' and 'C40' respectively in each df
dataframes = [df2, df3]

# Combine all 'org id' and 'C40' into one DataFrame
combined_gases = pd.concat([df[[org_no, gases]] for df in dataframes], ignore_index=True)

# Check if there are inconsistencies in 'C40' values for the same 'org id'
inconsistent_gases = combined_gases.groupby(org_no).filter(lambda x: x[gases].nunique() > 1)

if not inconsistent_gases.empty:
    print("Warning: Inconsistencies found in 'gases_included' values for the same 'org id'.")
    print(inconsistent_gases)
else:
    print("No inconsistencies in 'gases_included' values found.")

# Find the unique org numbers
inconsistent_gases[org_no].unique()

     Organization_id                  gases_included
33             31180                   CO2; CH4; N2O
39             35853                             CO2
53             31148                             CO2
97             31154              CO2; CH4; SF6; N2O
128            58621  CO2; PFCs; CH4; SF6; N2O; HFCs
130            31113        CO2; PFCs; CH4; SF6; N2O
148            43905                   CO2; CH4; N2O
210            35853                   CO2; CH4; N2O
227            31154  CO2; PFCs; CH4; SF6; N2O; HFCs
231            58621                   CO2; CH4; N2O
243            31180  CO2; PFCs; CH4; SF6; N2O; HFCs
318            43905              CO2; CH4; SF6; N2O
381            31113   CO2; PFCs; CH4; SF6; N2O; NF3
413            31148                        CO2; N2O


array([31180, 35853, 31148, 31154, 58621, 31113, 43905], dtype=int64)

In [125]:
# check if the above unique exists in df3 (it's 2017 vs 2016 of df2)
df3_inconsistent_gases = df3[df3[org_no].isin(inconsistent_gases[org_no].unique())]

print(f"DataFrame df3_inconsistent_gases: {df3_inconsistent_gases.shape[0]} rows, {df3_inconsistent_gases.shape[1]} columns")
df3_inconsistent_gases[[org_no,gases]]


DataFrame df3_inconsistent_gases: 7 rows, 31 columns


Unnamed: 0,Organization_id,gases_included
23,35853,CO2; CH4; N2O
40,31154,CO2; PFCs; CH4; SF6; N2O; HFCs
44,58621,CO2; CH4; N2O
56,31180,CO2; PFCs; CH4; SF6; N2O; HFCs
131,43905,CO2; CH4; SF6; N2O
194,31113,CO2; PFCs; CH4; SF6; N2O; NF3
226,31148,CO2; N2O


In [126]:
# now, we remove the above 7 org ids from combined_gases
print(f"DataFrame combined_gases: {combined_gases.shape[0]} rows, {combined_gases.shape[1]} columns")
combined_gases_without_inconsistent = combined_gases[~combined_gases[org_no].isin(inconsistent_gases[org_no].unique())]

# Should have 14 less rows
print(f"DataFrame combined_gases_without_inconsistent: {combined_gases_without_inconsistent.shape[0]} rows, {combined_gases_without_inconsistent.shape[1]} columns")


# now we add the above df3_inconsistent_gases to combined_gases_without_inconsistent
# while also dropping duplicates rows where org_no are the same
combined_unique_gases = pd.concat([combined_gases_without_inconsistent, df3_inconsistent_gases[[org_no,gases]]]).drop_duplicates(subset=[org_no])
print(f"DataFrame combined_unique_gases: {combined_unique_gases.shape[0]} rows, {combined_unique_gases.shape[1]} columns")
print(combined_unique_gases.nunique())
combined_unique_gases


DataFrame combined_gases: 416 rows, 2 columns
DataFrame combined_gases_without_inconsistent: 402 rows, 2 columns
DataFrame combined_unique_gases: 263 rows, 2 columns
Organization_id    263
gases_included      14
dtype: int64


Unnamed: 0,Organization_id,gases_included
0,35894,CO2; CH4; N2O
1,35898,CO2; PFCs; CH4; SF6; N2O; NF3; HFCs
2,54128,CO2; CH4; N2O
3,35879,CO2; CH4; N2O
4,50558,CO2; CH4; N2O
...,...,...
44,58621,CO2; CH4; N2O
56,31180,CO2; PFCs; CH4; SF6; N2O; HFCs
131,43905,CO2; CH4; SF6; N2O
194,31113,CO2; PFCs; CH4; SF6; N2O; NF3


### Final dataframe for emissions

In [127]:
final_gases_df = combined_unique_gases

print(f"DataFrame final_gases_df: {final_gases_df.shape[0]} rows, {final_gases_df.shape[1]} columns")
print(final_gases_df.nunique())
final_gases_df

DataFrame final_gases_df: 263 rows, 2 columns
Organization_id    263
gases_included      14
dtype: int64


Unnamed: 0,Organization_id,gases_included
0,35894,CO2; CH4; N2O
1,35898,CO2; PFCs; CH4; SF6; N2O; NF3; HFCs
2,54128,CO2; CH4; N2O
3,35879,CO2; CH4; N2O
4,50558,CO2; CH4; N2O
...,...,...
44,58621,CO2; CH4; N2O
56,31180,CO2; PFCs; CH4; SF6; N2O; HFCs
131,43905,CO2; CH4; SF6; N2O
194,31113,CO2; PFCs; CH4; SF6; N2O; NF3


In [128]:
# need to fill in all the other org ids, but they will have NaN in gases_included
import numpy as np

missing_org_ids = dfbase[~dfbase['Organization_id'].isin(final_gases_df['Organization_id'])]

new_rows = pd.DataFrame({
    'Organization_id': missing_org_ids['Organization_id'],
    'gases_included': np.nan  # Set NaN for the gasses column
})

final_emission_df = pd.concat([final_gases_df, new_rows], ignore_index=True)

In [129]:
# Resetting the index first, if needed
final_emission_df.reset_index(drop=True, inplace=True)
final_emission_df['emission_id'] = final_emission_df.index + 1

final_emission_df = final_emission_df[['emission_id', 'Organization_id', 'gases_included']]

final_emission_df


Unnamed: 0,emission_id,Organization_id,gases_included
0,1,35894,CO2; CH4; N2O
1,2,35898,CO2; PFCs; CH4; SF6; N2O; NF3; HFCs
2,3,54128,CO2; CH4; N2O
3,4,35879,CO2; CH4; N2O
4,5,50558,CO2; CH4; N2O
...,...,...,...
864,865,58424,
865,866,60229,
866,867,60577,
867,868,60588,


In [130]:
final_emission_df.to_csv('DBassign1/emission_table.csv', sep=';', index=False)

## Protocal table

#### Protocol

In [131]:
df3.rename(columns={'Protocol': protocol}, inplace=True)

print('--------------------Header--------------------')
print(df3[protocol].head(3))
print('--------------------Unique--------------------')
print(df3[protocol].unique())
print('--------------------Unique value count, also counts NaN--------------------')
print(df3[protocol].value_counts(dropna=False))
print('--------------------Number of uniques--------------------')
print(df3[protocol].nunique())

--------------------Header--------------------
0    Global Protocol for Community-Scale Greenhouse...
1    Global Protocol for Community-Scale Greenhouse...
2    Global Protocol for Community-Scale Greenhouse...
Name: Protocol, dtype: object
--------------------Unique--------------------
['Global Protocol for Community-Scale Greenhouse Gas Emissions Inventories (GPC), (WRI, C40 and ICLEI)'
 'U.S. Community Protocol for Accounting and Reporting of Greenhouse Gas Emissions (ICLEI)'
 'International Emissions Analysis Protocol (ICLEI)'
 '2006 IPCC Guidelines for National Greenhouse Gas Inventories'
 'Other: IPCC' 'Other: Hestia Project' 'Other'
 'Other: We compute scope 1 emissions for city and port and also scope 2 for the built environment'
 'Other: Excel' 'Other: Inventario de emisiones simplificado 2014'
 'Other: GHG Protocols'
 'Other: New York Community and Regional GHG Inventory Guidance'
 'Other: COPERT IV computer programme within the EMEP/CORINAR methodology'
 'Other: IFEU' 'Othe

#### Protocol_column

In [132]:
df3.rename(columns={'Protocol column': protocol_col}, inplace=True)

print('--------------------Header--------------------')
print(df3[protocol_col].head(3))
print('--------------------Unique--------------------')
print(df3[protocol_col].unique())
print('--------------------Unique value count, also counts NaN--------------------')
print(df3[protocol_col].value_counts(dropna=False))
print('--------------------Number of uniques--------------------')
print(df3[protocol_col].nunique())

--------------------Header--------------------
0                                                  NaN
1    Adaptation from Madrid Air Quality Inventory e...
2    ConEdison (ConEd) provided data on use of city...
Name: Protocol_column, dtype: object
--------------------Unique--------------------
[nan
 'Adaptation from Madrid Air Quality Inventory elaborated under the frame of EMEP CORINAIR METHOLOGY'
 'ConEdison (ConEd) provided data on use of citywide electricityand steam, and natural gas in the Bronx, Manhattan, and parts ofQueens. National Grid reported natural gas use data for Brooklyn,parts of Queens, and Staten Island. The Long Island Power Author-ity (LIPA) reported electricity use data for the Rockaways area ofQueens. Fuel oil use was provided by private fuel oil suppliers, perLocal Law 43 of 2013, which requires fuel oil providers to reportfuel oil deliveries by fuel type to the City on an annual basis. TheNew York Metropolitan Transportation Council (NYMTC) providedon-road tra

#### Combining the above 2 columns

In [133]:
protocol_table = df3[[org_no, protocol, protocol_col]]

print(f"DataFrame protocol_table: {protocol_table.shape[0]} rows, {protocol_table.shape[1]} columns")

protocol_table_no_dupes = df3[[org_no, protocol, protocol_col]].drop_duplicates(subset=[org_no])
print(f"DataFrame protocol_table_no_dupes: {protocol_table_no_dupes.shape[0]} rows, {protocol_table_no_dupes.shape[1]} columns")

protocol_table_no_dupes

DataFrame protocol_table: 229 rows, 3 columns
DataFrame protocol_table_no_dupes: 229 rows, 3 columns


Unnamed: 0,Organization_id,Protocol,Protocol_column
0,49363,Global Protocol for Community-Scale Greenhouse...,
1,31171,Global Protocol for Community-Scale Greenhouse...,Adaptation from Madrid Air Quality Inventory e...
2,3417,Global Protocol for Community-Scale Greenhouse...,ConEdison (ConEd) provided data on use of city...
3,59537,U.S. Community Protocol for Accounting and Rep...,"Since 2006, the City has utilized software dev..."
4,35894,Global Protocol for Community-Scale Greenhouse...,
...,...,...,...
224,19233,International Emissions Analysis Protocol (ICLEI),ICLEI - CCP Protocol (Cities for Climate Prote...
225,60273,Global Protocol for Community-Scale Greenhouse...,Para o primeiro inventário de emissões de gase...
226,31148,Other,"Local standard based on electricity, gas, dist..."
227,54119,U.S. Community Protocol for Accounting and Rep...,TCR/LGOP


In [134]:
final_protocol_df = protocol_table_no_dupes
final_protocol_df

Unnamed: 0,Organization_id,Protocol,Protocol_column
0,49363,Global Protocol for Community-Scale Greenhouse...,
1,31171,Global Protocol for Community-Scale Greenhouse...,Adaptation from Madrid Air Quality Inventory e...
2,3417,Global Protocol for Community-Scale Greenhouse...,ConEdison (ConEd) provided data on use of city...
3,59537,U.S. Community Protocol for Accounting and Rep...,"Since 2006, the City has utilized software dev..."
4,35894,Global Protocol for Community-Scale Greenhouse...,
...,...,...,...
224,19233,International Emissions Analysis Protocol (ICLEI),ICLEI - CCP Protocol (Cities for Climate Prote...
225,60273,Global Protocol for Community-Scale Greenhouse...,Para o primeiro inventário de emissões de gase...
226,31148,Other,"Local standard based on electricity, gas, dist..."
227,54119,U.S. Community Protocol for Accounting and Rep...,TCR/LGOP


## Base Emissions table

So, it's basically data from df1 and df4 (2016-Cities and 2017-Cities_Emission_Reduction_Targets)

So the division is by df

#### df1

In [135]:
set(df1.columns)    # this to get column names, to copy paste below

# reduce df1 to the columns we're interested in, .loc to prevent SettingWithCopyWarning error
df1_base = df1.loc[:, (org_no,'Reporting Year', 'Sector', 'Baseline year', 'Baseline emissions (metric tonnes CO2e)')]

# rename
df1_base.rename(columns={'Reporting Year': repyear, 'Sector': sector, 
                    'Baseline year': baseyear, 
                    'Baseline emissions (metric tonnes CO2e)': baseemissions,}, inplace=True)

# display
df1_base

Unnamed: 0,Organization_id,reporting_year,sector,baseline_year,baseline_emissions
0,58796,2016,Total,2010,6136.00
1,36158,2016,Total,2005,2913434.00
2,62855,2016,Total,2009,268000.00
3,61753,2016,Total,2009,3804493.00
4,61790,2016,Total,2004,178832.00
...,...,...,...,...,...
275,60680,2016,Total,2009,198408.23
276,31174,2016,Total,1990,63443619.00
277,31165,2016,Total,1990,
278,59552,2016,Total,2010,348437.00


#### df2

In [136]:
set(df4.columns)    # this to get column names, to copy paste below

# reduce df4 to the columns we're interested in, .loc to prevent SettingWithCopyWarning error
df4_base = df4.loc[:, (org_no,'Reporting year', 'Sector', 'Baseline year', 'Baseline emissions (metric tonnes CO2e)')]

# rename
df4_base.rename(columns={'Reporting year': repyear, 'Sector': sector, 
                    'Baseline year': baseyear, 
                    'Baseline emissions (metric tonnes CO2e)': baseemissions,}, inplace=True)

# display
df4_base

Unnamed: 0,Organization_id,reporting_year,sector,baseline_year,baseline_emissions
0,54408,2017,,2008,
1,63616,2017,Buildings,2010,18320.0
2,63616,2017,Transport,2010,6893.0
3,1499,2017,Total,2008,4053.0
4,1499,2017,Buildings,2008,2164.0
...,...,...,...,...,...
401,43937,2017,Total,2001,1105654.0
402,43937,2017,Total,2001,1105654.0
403,43937,2017,Total,2001,1105654.0
404,43937,2017,Other: Corporate,2014,92832.0


#### Combining the above 2 into one dataframe

In [137]:
# combine the 2
base_emissions_table = pd.concat([df1_base, df4_base], ignore_index=True)

print(base_emissions_table.nunique())
base_emissions_table

Organization_id       267
reporting_year          2
sector                 38
baseline_year          26
baseline_emissions    374
dtype: int64


Unnamed: 0,Organization_id,reporting_year,sector,baseline_year,baseline_emissions
0,58796,2016,Total,2010,6136.0
1,36158,2016,Total,2005,2913434.0
2,62855,2016,Total,2009,268000.0
3,61753,2016,Total,2009,3804493.0
4,61790,2016,Total,2004,178832.0
...,...,...,...,...,...
681,43937,2017,Total,2001,1105654.0
682,43937,2017,Total,2001,1105654.0
683,43937,2017,Total,2001,1105654.0
684,43937,2017,Other: Corporate,2014,92832.0


In [138]:
# Looking at the above table, something doesn't seem right

# so a check is done on the org_no 43937
res = base_emissions_table[base_emissions_table[org_no] == 43937]
print(res.nunique())
res

Organization_id       1
reporting_year        2
sector                2
baseline_year         2
baseline_emissions    3
dtype: int64


Unnamed: 0,Organization_id,reporting_year,sector,baseline_year,baseline_emissions
182,43937,2016,Total,,1310705.0
245,43937,2016,Total,2001.0,1310705.0
246,43937,2016,Total,2001.0,1310705.0
247,43937,2016,Total,2001.0,1310705.0
680,43937,2017,Total,2001.0,1105654.0
681,43937,2017,Total,2001.0,1105654.0
682,43937,2017,Total,2001.0,1105654.0
683,43937,2017,Total,2001.0,1105654.0
684,43937,2017,Other: Corporate,2014.0,92832.0


In [139]:
# Look at the above, it becomes clear that there are some complete duplicate rows

# checking df4, it becomes clear, that the rows are different in some other columns
# but not in the 4 columns for the baseline emission table
res = df4[df4[org_no] == 43937]
res

Unnamed: 0,Organization_id,Organization_name,City,Country,Region,Access,C40,Reporting year,Type of target,Sector,...,Baseline emissions (metric tonnes CO2e),Percentage reduction target,Target date,Estimated business as usual absolute emissions in target year (metric tonnes CO2e),intensity_unit,Comment,Population,Population Year,City Location,Country Location
400,43937,Wellington City Council,Wellington,New Zealand,South Asia and Oceania,Public,,2017,Absolute target,Total,...,1105654.0,10.0,2020,,,,209102.0,2017.0,"(26.661763, -80.268357)","(-40.900557, 174.885971)"
401,43937,Wellington City Council,Wellington,New Zealand,South Asia and Oceania,Public,,2017,Absolute target,Total,...,1105654.0,40.0,2030,,,,209102.0,2017.0,"(26.661763, -80.268357)","(-40.900557, 174.885971)"
402,43937,Wellington City Council,Wellington,New Zealand,South Asia and Oceania,Public,,2017,Absolute target,Total,...,1105654.0,65.0,2040,,,,209102.0,2017.0,"(26.661763, -80.268357)","(-40.900557, 174.885971)"
403,43937,Wellington City Council,Wellington,New Zealand,South Asia and Oceania,Public,,2017,Absolute target,Total,...,1105654.0,80.0,2050,,,,209102.0,2017.0,"(26.661763, -80.268357)","(-40.900557, 174.885971)"
404,43937,Wellington City Council,Wellington,New Zealand,South Asia and Oceania,Public,,2017,Absolute target,Other: Corporate,...,92832.0,80.0,2050,,,,209102.0,2017.0,"(26.661763, -80.268357)","(-40.900557, 174.885971)"


In [140]:
# So removing of duplicates are necessary
final_base_emission_df = base_emissions_table.drop_duplicates(
    subset=[org_no, repyear, sector, baseyear, baseemissions])

# Also make sure that rows with all NaN are dropped (even if org_no has a value)
final_base_emission_df = final_base_emission_df.dropna(
    subset=[repyear, sector, baseyear, baseemissions])

# Resetting the index
final_base_emission_df = final_base_emission_df.reset_index(drop=True)

# Display info about it
print(final_base_emission_df.nunique())
final_base_emission_df

Organization_id       218
reporting_year          2
sector                 37
baseline_year          25
baseline_emissions    363
dtype: int64


Unnamed: 0,Organization_id,reporting_year,sector,baseline_year,baseline_emissions
0,58796,2016,Total,2010,6136.0
1,36158,2016,Total,2005,2913434.0
2,62855,2016,Total,2009,268000.0
3,61753,2016,Total,2009,3804493.0
4,61790,2016,Total,2004,178832.0
...,...,...,...,...,...
405,35894,2017,Total,1990,14786.0
406,59160,2017,Stationary energy (buildings),2016,3.0
407,36154,2017,Total,2003,1734000.0
408,43937,2017,Total,2001,1105654.0


In [141]:
# Resetting the index first, if needed
final_base_emission_df.reset_index(drop=True, inplace=True)
final_base_emission_df['base_emission_id'] = final_base_emission_df.index + 1

final_base_emission_df = final_base_emission_df[['base_emission_id', 'reporting_year', 'sector', 'baseline_year', 'baseline_emissions', 'Organization_id']]

final_base_emission_df


Unnamed: 0,base_emission_id,reporting_year,sector,baseline_year,baseline_emissions,Organization_id
0,1,2016,Total,2010,6136.0,58796
1,2,2016,Total,2005,2913434.0,36158
2,3,2016,Total,2009,268000.0,62855
3,4,2016,Total,2009,3804493.0,61753
4,5,2016,Total,2004,178832.0,61790
...,...,...,...,...,...,...
405,406,2017,Total,1990,14786.0,35894
406,407,2017,Stationary energy (buildings),2016,3.0,59160
407,408,2017,Total,2003,1734000.0,36154
408,409,2017,Total,2001,1105654.0,43937


In [142]:
# final_base_emission_df.to_csv('DBassign1/base_emission_table.csv', sep=';', index=False)

## Other

In [143]:
print(f"Any duplicates in Organization_id? Answer: {final_emission_df.duplicated().any()}")
final_emission_df

Any duplicates in Organization_id? Answer: False


Unnamed: 0,emission_id,Organization_id,gases_included
0,1,35894,CO2; CH4; N2O
1,2,35898,CO2; PFCs; CH4; SF6; N2O; NF3; HFCs
2,3,54128,CO2; CH4; N2O
3,4,35879,CO2; CH4; N2O
4,5,50558,CO2; CH4; N2O
...,...,...,...
864,865,58424,
865,866,60229,
866,867,60577,
867,868,60588,


In [144]:
final_base_emission_df

Unnamed: 0,base_emission_id,reporting_year,sector,baseline_year,baseline_emissions,Organization_id
0,1,2016,Total,2010,6136.0,58796
1,2,2016,Total,2005,2913434.0,36158
2,3,2016,Total,2009,268000.0,62855
3,4,2016,Total,2009,3804493.0,61753
4,5,2016,Total,2004,178832.0,61790
...,...,...,...,...,...,...
405,406,2017,Total,1990,14786.0,35894
406,407,2017,Stationary energy (buildings),2016,3.0,59160
407,408,2017,Total,2003,1734000.0,36154
408,409,2017,Total,2001,1105654.0,43937


In [145]:
# now need to have final_base_emission_df point to emission_id and remove Organization_id

# Assuming emission_df_updated has 'org_id' and 'emission_id' columns
emission_mapping = final_emission_df[['Organization_id', 'emission_id']].set_index('Organization_id')['emission_id']

# Map 'org_id' in base_emission_df to 'emission_id' using the mapping
# The 'map' function replaces each 'org_id' with its corresponding 'emission_id'
final_base_emission_df['emission_id'] = final_base_emission_df['Organization_id'].map(emission_mapping)

# drop Organization_id column
final_base_emission_df.drop(columns=['Organization_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_base_emission_df['emission_id'] = final_base_emission_df['Organization_id'].map(emission_mapping)


In [146]:
final_base_emission_df

Unnamed: 0,base_emission_id,reporting_year,sector,baseline_year,baseline_emissions,emission_id
0,1,2016,Total,2010,6136.0,264
1,2,2016,Total,2005,2913434.0,265
2,3,2016,Total,2009,268000.0,236
3,4,2016,Total,2009,3804493.0,161
4,5,2016,Total,2004,178832.0,172
...,...,...,...,...,...,...
405,406,2017,Total,1990,14786.0,1
406,407,2017,Stationary energy (buildings),2016,3.0,225
407,408,2017,Total,2003,1734000.0,47
408,409,2017,Total,2001,1105654.0,162


In [147]:
final_base_emission_df.to_csv('DBassign1/base_emission_table.csv', sep=';', index=False)