In [2]:
import pandas as pd

# Load the 5 datasets
df1 = pd.read_csv('DBassign1/2016-Cities.csv')
df2 = pd.read_csv('DBassign1/2016-GHG_Emissions.csv')
df3 = pd.read_csv('DBassign1/2017-Community_Emissions.csv')
df4 = pd.read_csv('DBassign1/2017-Cities_Emission_Reduction_Targets.csv')
df5 = pd.read_csv('DBassign1/2023-Cities_Climate_Risk.csv')

# Print rows and columns for the 5 datasets
dataframes = [df1, df2, df3, df4, df5]
for i, df in enumerate(dataframes, start=1):
    print(f"DataFrame {i}: {df.shape[0]} rows, {df.shape[1]} columns")

DataFrame 1: 280 rows, 15 columns
DataFrame 2: 187 rows, 27 columns
DataFrame 3: 229 rows, 31 columns
DataFrame 4: 406 rows, 21 columns
DataFrame 5: 1370 rows, 20 columns


## Normalisation of data

### Org Table

In [3]:
# variables for column names
org_no = 'Organization_id'
org_name = 'Organization_name'
c40 = 'C40_member'
GCoM = 'GCoM_member'
acc_year = 'accounting_year'
org_bound = 'Organization_boundary'

## The dataframe will be called 'final_df' for the dataframe with the above NOTE: naming should be improved for the other tables

#### Org number and name normalisation

In [4]:
df1.head(1)
df1.rename(columns={'Account No': org_no,'Organisation': org_name}, inplace=True)
df1.head(1)


Unnamed: 0,Organization_name,Organization_id,Country,City Short Name,C40,Reporting Year,Sector,Target boundary,Baseline year,Baseline emissions (metric tonnes CO2e),Percentage reduction target,Target date,Comment,City Location,Country Location
0,Odder Kommune,58796,Denmark,Odder Kommune,,2016,Total,,2010,6136.0,2.0,,,"(55.975718, 10.149958)","(56.26392, 9.501785)"


In [5]:
df2.head(1)
df2.rename(columns={'Account Number': org_no,'City Name': org_name}, inplace=True)
df2.head(1)

Unnamed: 0,Organization_id,Organization_name,Country,City Short Name,C40,Reporting Year,Measurement Year,Boundary,Primary Methodology,Methodology Details,...,Current Population,City GDP,GDP Currency,Year of GDP,GDP Source,Average annual temperature (in Celsius)​,​Land area (in square km),Average altitude (m),City Location,Country Location
0,35894,Ville de Montreal,Canada,Montreal,,2016,12/31/2009 12:00:00 AM,Other: The regional entity that constitutes th...,2006 IPCC Guidelines for National Greenhouse G...,2006 IPCC Guidelines for National Greenhouse G...,...,1886481.0,120118000000.0,CAD Canadian Dollar,2014.0,"Ville de Montréal, Montréal en statistiques, h...",6.8,500.0,17.0,"(45.5086699, -73.5539925)","(56.130366, -106.346771)"


In [6]:
df3.head(1)
df3.rename(columns={'Account number': org_no,'Organization': org_name}, inplace=True)
df3.head(1)

Unnamed: 0,Organization_id,Organization_name,City,Country,Region,C40,Access,Reporting year,Accounting year,Boundary,...,Population year,GDP,GDP Currency,GDP Year,GDP Source,Average annual temperature (in Celsius)​,​Average altitude (m),​Land area (in square km),City Location,Country Location
0,49363,Nelson Mandela Bay Municipality,Nelson Mandela Bay,South Africa,Africa,,Public,2017,2013-07-01 - 2014-06-30,A metropolitan area,...,2011,52147.0,ZAR South African Rand,2010.0,Built Environment Performance Plan(BEPP),17.5,69.0,1950.0,"(-33.745241, 25.568108)","(-30.559482, 22.937506)"


In [7]:
df4.head(1)
df4.rename(columns={'Account No': org_no,'Organisation': org_name}, inplace=True)
df4.head(1)

Unnamed: 0,Organization_id,Organization_name,City,Country,Region,Access,C40,Reporting year,Type of target,Sector,...,Baseline emissions (metric tonnes CO2e),Percentage reduction target,Target date,Estimated business as usual absolute emissions in target year (metric tonnes CO2e),​Intensity unit (emissions per),Comment,Population,Population Year,City Location,Country Location
0,54408,Aarhus Kommune,Aarhus,Denmark,Europe,Public,,2017,Absolute target,,...,,100.0,2030,,,,336000.0,2017.0,"(56.168393, 10.137373)","(56.26392, 9.501785)"


In [8]:
df5.head(1)
df5.rename(columns={'Organization Number': org_no,'Organization Name': org_name}, inplace=True)
df5.head(1)

Unnamed: 0,Questionnaire,Organization_id,Organization_name,City,Country/Area,CDP Region,C40 City,GCoM City,Access,Assessment attachment and/or direct link,Confirm attachment/link provided,Boundary of assessment relative to jurisdiction boundary,Year of publication or approval,Factors considered in assessment,Primary author(s) of assessment,Does the city have adaptation goal(s) and/or an adaptation plan?,Population,Population Year,City Location,Last update
0,Cities 2023,840926,Prefeitura de Serra Talhada,,Brazil,Latin America,False,True,public,https://drive.google.com/file/d/19DMxxK532IQSL...,The assessment can be accessed (unrestricted) ...,Same - covers entire jurisdiction and nothing ...,2022.0,Assessment considers vulnerable populations; A...,Dedicated team within jurisdiction; Relevant d...,Adaptation goal(s) and adaptation plan,92228,2023.0,,02/07/2024 04:14:16 AM


#### Combining org number and name into a new dataframe

In [9]:
# This takes all rows of all 5 datasets, but only adds the org name and number
combined_df = pd.concat([
    df1[[org_no, org_name]],
    df2[[org_no, org_name]],
    df3[[org_no, org_name]],
    df4[[org_no, org_name]],
    df5[[org_no, org_name]],
])

# Then all duplicates are dropped to get unique pairs
newdf = combined_df.drop_duplicates().reset_index(drop=True)
newdf.nunique()

Organization_id      869
Organization_name    983
dtype: int64

In [10]:
# Group by 'Organization Number' and filter to find duplicates with different 'Organization Name'
duplicate_org_numbers = newdf.groupby(org_no).filter(lambda x: x[org_name].nunique() > 1)

# To see each 'Organization Number' with its associated 'Organization Names'
duplicates_summary = duplicate_org_numbers.groupby(org_no)[org_name].unique()
print(duplicates_summary)


Organization_id
1093                [City of Atlanta, City of Atlanta, GA]
1184                  [City of Austin, City of Austin, TX]
1499          [Ajuntament de Barcelona, City of Barcelona]
3203                [City of Chicago, City of Chicago, IL]
3417                    [New York City, New York City, NY]
                               ...                        
60577    [Frederikshavn Kommune, Frederikshavn Municipa...
60588             [City of Alba-Iulia, City of Alba Iulia]
61753              [Yilan County, Yilan County Government]
63543      [Fredensborg Kommune, Fredensborg Municipality]
64014           [City of Cupertino, City of Cupertino, CA]
Name: Organization_name, Length: 114, dtype: object


In [11]:
# Function to return the row with the longest 'Organization Name' for each 'Organization Number'
def keep_longest_name(group):
    lengths = group[org_name].str.len()  # Get the length of each organization name
    return group.loc[lengths.idxmax()]  # Return the row with the longest organization name

# Apply the function to each group of duplicates
longest_names_df = duplicate_org_numbers.groupby(org_no).apply(keep_longest_name).reset_index(drop=True)
print(longest_names_df)
# Now, longest_names_df contains only the rows with the longest 'Organization Name' for each 'Organization Number'


     Organization_id           Organization_name
0               1093         City of Atlanta, GA
1               1184          City of Austin, TX
2               1499     Ajuntament de Barcelona
3               3203         City of Chicago, IL
4               3417           New York City, NY
..               ...                         ...
109            60577  Frederikshavn Municipality
110            60588          City of Alba-Iulia
111            61753     Yilan County Government
112            63543    Fredensborg Municipality
113            64014       City of Cupertino, CA

[114 rows x 2 columns]


  longest_names_df = duplicate_org_numbers.groupby(org_no).apply(keep_longest_name).reset_index(drop=True)


In [12]:
# Step 1: Drop rows in newdf that have a matching 'Organization Number' in longest_names_df
org_numbers_to_remove = longest_names_df[org_no].unique()
newdf_filtered = newdf[~newdf[org_no].isin(org_numbers_to_remove)]

# Step 2: Concatenate newdf_filtered and longest_names_df
final_df = pd.concat([newdf_filtered, longest_names_df], ignore_index=True)
final_df

# final_df now contains the original data from newdf with duplicates removed, 
# and replaced by the rows with the longest 'Organization Name' for each 'Organization Number'


Unnamed: 0,Organization_id,Organization_name
0,58796,Odder Kommune
1,36158,Comune di Napoli
2,62855,Egedal Municipality
3,61790,"City of Emeryville, CA"
4,62180,Communauté urbaine du Grand Nancy
...,...,...
864,60577,Frederikshavn Municipality
865,60588,City of Alba-Iulia
866,61753,Yilan County Government
867,63543,Fredensborg Municipality


In [13]:
# To make sure that there are zero nulls in Organization Number and Name column
num_nulls_num = final_df[org_no].isnull().sum()
print(f"Number of nulls in {org_no}: {num_nulls_num}")
num_nulls_name = final_df[org_no].isnull().sum()
print(f"Number of nulls in {org_name}: {num_nulls_name}")

# To check if there are duplicates
print({final_df[org_no].nunique})
print({final_df[org_name].nunique})

Number of nulls in Organization_id: 0
Number of nulls in Organization_name: 0
{<bound method IndexOpsMixin.nunique of 0      58796
1      36158
2      62855
3      61790
4      62180
       ...  
864    60577
865    60588
866    61753
867    63543
868    64014
Name: Organization_id, Length: 869, dtype: int64>}
{<bound method IndexOpsMixin.nunique of 0                          Odder Kommune
1                       Comune di Napoli
2                    Egedal Municipality
3                 City of Emeryville, CA
4      Communauté urbaine du Grand Nancy
                     ...                
864           Frederikshavn Municipality
865                   City of Alba-Iulia
866              Yilan County Government
867             Fredensborg Municipality
868                City of Cupertino, CA
Name: Organization_name, Length: 869, dtype: object>}


#### Normalisation of C40 

In [14]:
# Need to change df5 'C40 City'
df5['C40 City'].head(2)

0    False
1     True
Name: C40 City, dtype: bool

In [15]:
# Renaming the column
df5.rename(columns={'C40 City': c40}, inplace=True)
df5[c40].head(2)

0    False
1     True
Name: C40_member, dtype: bool

In [16]:
# Df1 to 4 all have the 'C40' column, and should only have the unique values 'C40' and null
dfs = [df1, df2, df3, df4]

# Iterate through each DataFrame and print unique values in the "C40" column
for i, df in enumerate(dfs, start=1):
    unique_values = df['C40'].unique()
    print(f"Unique values in C40 column of DataFrame {i}: {unique_values}")

Unique values in C40 column of DataFrame 1: [nan 'C40']
Unique values in C40 column of DataFrame 2: [nan 'C40']
Unique values in C40 column of DataFrame 3: [nan 'C40']
Unique values in C40 column of DataFrame 4: [nan 'C40']


In [17]:
# Assuming your DataFrames are named df1, df2, df3, df4
dataframes = [df1, df2, df3, df4]

# Iterate through each DataFrame and update the "C40" column
for df in dataframes:
    df['C40'] = df['C40'] == 'C40'

# At this point, each DataFrame will have the "C40" column with True where the value was 'C40', and False otherwise (including null values)

In [18]:
# Running this again to prove that it's only false and true now
# Iterate through each DataFrame and print unique values in the "C40" column
for i, df in enumerate(dfs, start=1):
    unique_values = df['C40'].unique()
    print(f"Unique values in C40 column of DataFrame {i}: {unique_values}")

Unique values in C40 column of DataFrame 1: [False  True]
Unique values in C40 column of DataFrame 2: [False  True]
Unique values in C40 column of DataFrame 3: [False  True]
Unique values in C40 column of DataFrame 4: [False  True]


In [19]:
df1.rename(columns={'C40': c40}, inplace=True)
df2.rename(columns={'C40': c40}, inplace=True)
df3.rename(columns={'C40': c40}, inplace=True)
df4.rename(columns={'C40': c40}, inplace=True)

df3[c40].head(2)

0    False
1     True
Name: C40_member, dtype: bool

In [20]:
# Now we need to add the C40 column to the 'final_df'
# So we need to check if ther are inconsistencies in the C40 value between datasets (say it might be false in 2017 reduction target but true in 
# 2023 climate risk for the same org number)


# Assuming the 'org id' column and 'C40' column are named 'org id' and 'C40' respectively in each df
dataframes = [df1, df2, df3, df4, df5]

# Combine all 'org id' and 'C40' into one DataFrame
combined_c40 = pd.concat([df[[org_no, c40]] for df in dataframes], ignore_index=True)

# Check if there are inconsistencies in 'C40' values for the same 'org id'
inconsistent_c40 = combined_c40.groupby(org_no).filter(lambda x: x[c40].nunique() > 1)

if not inconsistent_c40.empty:
    print("Warning: Inconsistencies found in 'C40' values for the same 'org id'.")
    print(inconsistent_c40)
else:
    print("No inconsistencies in 'C40' values found.")

      Organization_id  C40_member
22              36159       False
74              31151        True
75              31151        True
92              36254        True
93              36254        True
...               ...         ...
2223            35885        True
2226            31151       False
2312            35870        True
2318            36254       False
2376            35885        True

[61 rows x 2 columns]


In [21]:
# Find the unique org numbers
inconsistent_c40[org_no].unique()

array([36159, 31151, 36254, 35894, 31186, 35874, 35885, 35870])

In [22]:
# 
unique_inconsistent_c40 = df5.loc[df5[org_no].isin([36159, 31151, 36254, 35894, 31186, 35874, 35885, 35870])]
# Select the 'org no' and 'C40' columns, then drop duplicates to get unique pairs
unique_pairs = unique_inconsistent_c40[[org_no, c40]].drop_duplicates()

# Display the unique pairs
print(unique_pairs)


      Organization_id  C40_member
93              35894        True
111             31186       False
135             35885        True
167             35870        True
421             35874        True
477             36159        True
636             31151       False
1216            36254       False


In [23]:
# Assuming 'org_no' is the column name in both combined_c40 and unique_pairs DataFrames
# Get a list of org_no values to remove from combined_c40
org_no_to_remove = unique_pairs[org_no].unique()

# Filter out rows from combined_c40 where org_no matches any value in org_no_to_remove
combined_c40_filtered = combined_c40[~combined_c40[org_no].isin(org_no_to_remove)]

# combined_c40_filtered now contains rows from combined_c40 excluding those with org_no in unique_pairs
unique_pairs_combined = combined_c40_filtered[[org_no, c40]].drop_duplicates()

# Display the unique pairs
print(unique_pairs_combined)


      Organization_id  C40_member
0               58796       False
1               36158       False
2               62855       False
3               61753       False
4               61790       False
...               ...         ...
2454            73671       False
2457            54306       False
2461            31178        True
2466           863001       False
2471           924874       False

[861 rows x 2 columns]


In [24]:
# Combine the 2 unique pairs dfs into 1
combined_unique_pairs = pd.concat([unique_pairs_combined, unique_pairs]).drop_duplicates(subset=[org_no])
combined_unique_pairs

Unnamed: 0,Organization_id,C40_member
0,58796,False
1,36158,False
2,62855,False
3,61753,False
4,61790,False
...,...,...
167,35870,True
421,35874,True
477,36159,True
636,31151,False


In [25]:
final_df_with_c40 = pd.merge(final_df, combined_unique_pairs, on=org_no, how='left')
final_df_with_c40

Unnamed: 0,Organization_id,Organization_name,C40_member
0,58796,Odder Kommune,False
1,36158,Comune di Napoli,False
2,62855,Egedal Municipality,False
3,61790,"City of Emeryville, CA",False
4,62180,Communauté urbaine du Grand Nancy,False
...,...,...,...
864,60577,Frederikshavn Municipality,False
865,60588,City of Alba-Iulia,False
866,61753,Yilan County Government,False
867,63543,Fredensborg Municipality,False


In [26]:
final_df_with_c40[c40].unique()
final_df_with_c40_falseTrue = df5.loc[df5[org_no].isin([58796, 36158, 62855, 61790, 62180, 60577, 60588, 61753, 63543, 64014])]
print(final_df_with_c40_falseTrue[[org_no, c40]])

      Organization_id  C40_member
296             64014       False
673             64014       False
675             60577       False
873             60588       False
985             63543       False
1114            60577       False


#### Normalisation of GCoM

In [27]:
# Need to change df5 'C40 City'
df5['GCoM City'].head(3)

0     True
1    False
2    False
Name: GCoM City, dtype: bool

In [28]:
# Renaming the column
df5.rename(columns={'GCoM City': GCoM}, inplace=True)
df5[GCoM].head(2)

0     True
1    False
Name: GCoM_member, dtype: bool

In [29]:
# Since only df5 has GCoM, we only need to get unique pairs between org_no and GCoM for df5
unique_pairs_gcom = df5[[org_no, GCoM]].drop_duplicates()
unique_pairs_gcom


Unnamed: 0,Organization_id,GCoM_member
0,840926,True
1,51075,False
2,863190,False
3,930366,True
4,60236,True
...,...,...
1356,31187,True
1357,44191,True
1359,31178,True
1364,863001,False


In [30]:
# Now we merge the unique_pairs_gcom into our final_df(_with_c40)
final_df_with_gcom = pd.merge(final_df_with_c40, unique_pairs_gcom, on=org_no, how='left')
final_df_with_gcom

Unnamed: 0,Organization_id,Organization_name,C40_member,GCoM_member
0,58796,Odder Kommune,False,
1,36158,Comune di Napoli,False,
2,62855,Egedal Municipality,False,
3,61790,"City of Emeryville, CA",False,
4,62180,Communauté urbaine du Grand Nancy,False,
...,...,...,...,...
864,60577,Frederikshavn Municipality,False,True
865,60588,City of Alba-Iulia,False,True
866,61753,Yilan County Government,False,
867,63543,Fredensborg Municipality,False,True


In [31]:
# We don't change the NaN values in GCoM_member to false, because they might be 'True' in reality, so we keep them as NaN meaning don't know

### xxx