In [183]:
import pandas as pd

# Load the 5 datasets
df1 = pd.read_csv('DBassign1/2016-Cities.csv')
df2 = pd.read_csv('DBassign1/2016-GHG_Emissions.csv')
df3 = pd.read_csv('DBassign1/2017-Community_Emissions.csv')
df4 = pd.read_csv('DBassign1/2017-Cities_Emission_Reduction_Targets.csv')
df5 = pd.read_csv('DBassign1/2023-Cities_Climate_Risk.csv')

# Load base (org no and name)
dfbase = pd.read_csv('DBassign1/combined_org_no_and_name.csv')
# Load emission table
dfemission = pd.read_csv('DBassign1/emission_table.csv', sep=';')

# Print rows and columns for the 5 datasets
dataframes = [df1, df2, df3, df4, df5, dfbase, dfemission]
for i, df in enumerate(dataframes, start=1):
    print(f"DataFrame {i}: {df.shape[0]} rows, {df.shape[1]} columns")

DataFrame 1: 280 rows, 15 columns
DataFrame 2: 187 rows, 27 columns
DataFrame 3: 229 rows, 31 columns
DataFrame 4: 406 rows, 21 columns
DataFrame 5: 1370 rows, 20 columns
DataFrame 6: 869 rows, 2 columns
DataFrame 7: 869 rows, 3 columns


In [184]:
# Variables for column names
# old
org_no = 'Organization_id'
org_name = 'Organization_name'
# new
reduc_target = 'pct_reduction_target'
target_year = 'target_year'
estimated = 'Estimated_business_as_usual'
target_bound = 'Target_boundary'
emission_id = 'emission_id'
sector = 'sector'

type_target = 'type_of_target'


df1.rename(columns={'Account No': org_no,'Organisation': org_name}, inplace=True)
df2.rename(columns={'Account Number': org_no,'City Name': org_name}, inplace=True)
df3.rename(columns={'Account number': org_no,'Organization': org_name}, inplace=True)
df4.rename(columns={'Account No': org_no,'Organisation': org_name}, inplace=True)
df5.rename(columns={'Organization Number': org_no,'Organization Name': org_name}, inplace=True)

## Normalisation for Target Emission table

Data is from df1 and df4 ('DBassign1/2016-Cities.csv') and ('DBassign1/2017-Cities_Emission_Reduction_Targets.csv')


#### Data from df1

In [185]:
set(df1.columns)    # this to get column names, to copy paste below

# reduce df1 to the columns we're interested in, .loc to prevent SettingWithCopyWarning error
df1_base = df1.loc[:, (org_no,'Percentage reduction target', 'Target date', 'Target boundary', 'Sector')]

# rename
df1_base.rename(columns={'Percentage reduction target': reduc_target, 'Target date': target_year, 
                    'Target boundary': target_bound, 'Sector': sector,}, inplace=True)

# display
df1_base

Unnamed: 0,Organization_id,pct_reduction_target,target_year,Target_boundary,sector
0,58796,2.0,,,Total
1,36158,25.0,2020.0,,Total
2,62855,7.0,2020.0,,Total
3,61753,35.0,2020.0,,Total
4,61790,40.0,2030.0,Overall community emissions,Total
...,...,...,...,...,...
275,60680,20.0,2020.0,Community sector:\n- Waste management\n- road ...,Total
276,31174,25.0,2020.0,Total emissions within the inventory boundary,Total
277,31165,95.0,2050.0,,Total
278,59552,100.0,2050.0,Municipal boundary/city limits,Total


#### Data from df4

In [186]:
set(df4.columns)    # this to get column names, to copy paste below

# Type of target is also included, will be separated out later down the line

# reduce df1 to the columns we're interested in, .loc to prevent SettingWithCopyWarning error
df4_base = df4.loc[:, (org_no,'Percentage reduction target', 'Target date', 'Estimated business as usual absolute emissions in target year (metric tonnes CO2e)', 'Type of target', 'Sector')]

# rename
df4_base.rename(columns={'Percentage reduction target': reduc_target, 'Target date': target_year, 
                    'Estimated business as usual absolute emissions in target year (metric tonnes CO2e)': estimated, 
                    'Type of target': type_target, 'Sector': sector,}, inplace=True)

# display
df4_base

Unnamed: 0,Organization_id,pct_reduction_target,target_year,Estimated_business_as_usual,type_of_target,sector
0,54408,100.0,2030,,Absolute target,
1,63616,19.0,2020,,Absolute target,Buildings
2,63616,6.0,2020,,Absolute target,Transport
3,1499,18.0,2020,,Absolute target,Total
4,1499,1.0,2020,,Absolute target,Buildings
...,...,...,...,...,...,...
401,43937,40.0,2030,,Absolute target,Total
402,43937,65.0,2040,,Absolute target,Total
403,43937,80.0,2050,,Absolute target,Total
404,43937,80.0,2050,,Absolute target,Other: Corporate


#### Combining the above 2 into one dataframe

In [187]:
# combine the 2
target_emissions_table = pd.concat([df1_base, df4_base], ignore_index=True)

print(target_emissions_table.nunique())
target_emissions_table

Organization_id                267
pct_reduction_target            83
target_year                     39
Target_boundary                152
sector                          38
Estimated_business_as_usual     30
type_of_target                   3
dtype: int64


Unnamed: 0,Organization_id,pct_reduction_target,target_year,Target_boundary,sector,Estimated_business_as_usual,type_of_target
0,58796,2.0,,,Total,,
1,36158,25.0,2020.0,,Total,,
2,62855,7.0,2020.0,,Total,,
3,61753,35.0,2020.0,,Total,,
4,61790,40.0,2030.0,Overall community emissions,Total,,
...,...,...,...,...,...,...,...
681,43937,40.0,2030,,Total,,Absolute target
682,43937,65.0,2040,,Total,,Absolute target
683,43937,80.0,2050,,Total,,Absolute target
684,43937,80.0,2050,,Other: Corporate,,Absolute target


In [188]:
# Lets see how many NaN there are in each column

# Calculate the number of NaN values in each column of the DataFrame
nan_counts = target_emissions_table.isna().sum()

# Print the number of NaN values for each column
print(nan_counts)

# It's definitely a choice to drop Target_boundary or Estimated_business_as_usual

Organization_id                  0
pct_reduction_target            74
target_year                     66
Target_boundary                464
sector                         120
Estimated_business_as_usual    655
type_of_target                 280
dtype: int64


In [189]:
# Find duplicate rows in the DataFrame
duplicate_rows = target_emissions_table[target_emissions_table.duplicated()]

# Display the duplicate rows
print(duplicate_rows)

duplicate_rows[duplicate_rows['Organization_id'] == 60416]

     Organization_id  pct_reduction_target target_year Target_boundary sector  \
328            54102                  10.0        2030             NaN    NaN   
598            60416                   NaN         NaN             NaN    NaN   
599            60416                   NaN         NaN             NaN    NaN   

     Estimated_business_as_usual   type_of_target  
328                          NaN  Absolute target  
598                          NaN  Absolute target  
599                          NaN  Absolute target  


Unnamed: 0,Organization_id,pct_reduction_target,target_year,Target_boundary,sector,Estimated_business_as_usual,type_of_target
598,60416,,,,,,Absolute target
599,60416,,,,,,Absolute target


In [190]:
print(target_emissions_table[(target_emissions_table[org_no] == 54102) & (target_emissions_table[reduc_target] == 10.0) & (target_emissions_table[type_target] == 'Absolute target')])
print(target_emissions_table[(target_emissions_table[org_no] == 60416)])

# there are 1 duplicates for 54102 and 2 for 60416 with the index numbers 328, 598, 599

     Organization_id  pct_reduction_target target_year Target_boundary sector  \
325            54102                  10.0        2030             NaN    NaN   
328            54102                  10.0        2030             NaN    NaN   

     Estimated_business_as_usual   type_of_target  
325                          NaN  Absolute target  
328                          NaN  Absolute target  
     Organization_id  pct_reduction_target target_year Target_boundary sector  \
597            60416                   NaN         NaN             NaN    NaN   
598            60416                   NaN         NaN             NaN    NaN   
599            60416                   NaN         NaN             NaN    NaN   

     Estimated_business_as_usual   type_of_target  
597                          NaN  Absolute target  
598                          NaN  Absolute target  
599                          NaN  Absolute target  


In [191]:
# Assuming df is your DataFrame
target_emissions_table = target_emissions_table.drop(index=[328, 598, 599])

target_emissions_table = target_emissions_table.reset_index(drop=True)


# check after droping
print(target_emissions_table[(target_emissions_table[org_no] == 54102) & (target_emissions_table[reduc_target] == 10.0) & (target_emissions_table[type_target] == 'Absolute target')])
print(target_emissions_table[(target_emissions_table[org_no] == 60416)])

     Organization_id  pct_reduction_target target_year Target_boundary sector  \
325            54102                  10.0        2030             NaN    NaN   

     Estimated_business_as_usual   type_of_target  
325                          NaN  Absolute target  
     Organization_id  pct_reduction_target target_year Target_boundary sector  \
596            60416                   NaN         NaN             NaN    NaN   

     Estimated_business_as_usual   type_of_target  
596                          NaN  Absolute target  


#### Getting foreign key for the target emission table

In [192]:
# now need to have final_base_emission_df point to emission_id and remove Organization_id

# Assuming emission_df_updated has 'org_id' and 'emission_id' columns
emission_mapping = dfemission[['Organization_id', 'emission_id']].set_index('Organization_id')['emission_id']

# Map 'org_id' in base_emission_df to 'emission_id' using the mapping
# The 'map' function replaces each 'org_id' with its corresponding 'emission_id'
target_emissions_table['emission_id'] = target_emissions_table['Organization_id'].map(emission_mapping)

# drop Organization_id column
target_emissions_table.drop(columns=['Organization_id'], inplace=True)

target_emissions_table

Unnamed: 0,pct_reduction_target,target_year,Target_boundary,sector,Estimated_business_as_usual,type_of_target,emission_id
0,2.0,,,Total,,,264
1,25.0,2020.0,,Total,,,265
2,7.0,2020.0,,Total,,,236
3,35.0,2020.0,,Total,,,161
4,40.0,2030.0,Overall community emissions,Total,,,172
...,...,...,...,...,...,...,...
678,40.0,2030,,Total,,Absolute target,162
679,65.0,2040,,Total,,Absolute target,162
680,80.0,2050,,Total,,Absolute target,162
681,80.0,2050,,Other: Corporate,,Absolute target,162


In [193]:
# Adding the target_emission_id primary key
target_emissions_table.reset_index(drop=True, inplace=True)
target_emissions_table['target_emission_id'] = target_emissions_table.index + 1

final_target_emissions_df = target_emissions_table[['target_emission_id', 'pct_reduction_target', 'target_year', 'Target_boundary', 'Estimated_business_as_usual', 'emission_id', 'sector', 'type_of_target']]

final_target_emissions_df

Unnamed: 0,target_emission_id,pct_reduction_target,target_year,Target_boundary,Estimated_business_as_usual,emission_id,sector,type_of_target
0,1,2.0,,,,264,Total,
1,2,25.0,2020.0,,,265,Total,
2,3,7.0,2020.0,,,236,Total,
3,4,35.0,2020.0,,,161,Total,
4,5,40.0,2030.0,Overall community emissions,,172,Total,
...,...,...,...,...,...,...,...,...
678,679,40.0,2030,,,162,Total,Absolute target
679,680,65.0,2040,,,162,Total,Absolute target
680,681,80.0,2050,,,162,Total,Absolute target
681,682,80.0,2050,,,162,Other: Corporate,Absolute target


In [194]:
# Check to see if there are rows where reduc_target, target_year, target_bound, sector, estimated are all NaN

rows_with_all_nan = final_target_emissions_df[[reduc_target, target_year, target_bound, sector, estimated]].isna().all(axis=1)

# To see if there are any such rows
any_rows_with_all_nan = rows_with_all_nan.any()

print(f"Are there any rows where columns reduc_target, target_year, target_bound, sector, estimated, type_target are all NaN? {any_rows_with_all_nan}")

print("Rows where columns reduc_target, target_year, target_bound, sector, estimated, type_target are all NaN:")
final_target_emissions_df[rows_with_all_nan]


Are there any rows where columns reduc_target, target_year, target_bound, sector, estimated, type_target are all NaN? True
Rows where columns reduc_target, target_year, target_bound, sector, estimated, type_target are all NaN:


Unnamed: 0,target_emission_id,pct_reduction_target,target_year,Target_boundary,Estimated_business_as_usual,emission_id,sector,type_of_target
287,288,,,,,217,,Absolute target
290,291,,,,,290,,Absolute target
303,304,,,,,24,,Base year intensity target
305,306,,,,,154,,Baseline scenario (business as usual) target
308,309,,,,,129,,Absolute target
349,350,,,,,241,,Baseline scenario (business as usual) target
364,365,,,,,230,,Baseline scenario (business as usual) target
369,370,,,,,241,,Base year intensity target
380,381,,,,,292,,Absolute target
381,382,,,,,292,,Baseline scenario (business as usual) target


In [195]:
final_target_emissions_df = final_target_emissions_df[~rows_with_all_nan]
final_target_emissions_df[rows_with_all_nan]

  final_target_emissions_df[rows_with_all_nan]


Unnamed: 0,target_emission_id,pct_reduction_target,target_year,Target_boundary,Estimated_business_as_usual,emission_id,sector,type_of_target


In [196]:
# Separating out the type of target table

final_target_type_df =  final_target_emissions_df[['type_of_target', 'target_emission_id']]
final_target_type_df

Unnamed: 0,type_of_target,target_emission_id
0,,1
1,,2
2,,3
3,,4
4,,5
...,...,...
677,Absolute target,678
678,Absolute target,679
679,Absolute target,680
680,Absolute target,681


In [197]:
# drop all rows wehre type_of_target is NaN
final_target_type_df = final_target_type_df.dropna(subset=['type_of_target'])

print(final_target_type_df['type_of_target'].unique())
final_target_type_df

['Absolute target' 'Baseline scenario (business as usual) target'
 'Base year intensity target']


Unnamed: 0,type_of_target,target_emission_id
280,Absolute target,281
281,Absolute target,282
282,Absolute target,283
283,Absolute target,284
284,Absolute target,285
...,...,...
677,Absolute target,678
678,Absolute target,679
679,Absolute target,680
680,Absolute target,681


In [198]:
# add primary keys
# Adding the target_emission_id primary key
final_target_type_df.reset_index(drop=True, inplace=True)
final_target_type_df['type_of_target_id'] = final_target_type_df.index + 1

final_target_type_df = final_target_type_df[['type_of_target_id', 'type_of_target', 'target_emission_id']]

final_target_type_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_target_type_df['type_of_target_id'] = final_target_type_df.index + 1


Unnamed: 0,type_of_target_id,type_of_target,target_emission_id
0,1,Absolute target,281
1,2,Absolute target,282
2,3,Absolute target,283
3,4,Absolute target,284
4,5,Absolute target,285
...,...,...,...
368,369,Absolute target,678
369,370,Absolute target,679
370,371,Absolute target,680
371,372,Absolute target,681


In [199]:
# drop the type of target column from final_target_emissions_df

final_target_emissions_df = final_target_emissions_df.drop(columns=['type_of_target'])

final_target_emissions_df

Unnamed: 0,target_emission_id,pct_reduction_target,target_year,Target_boundary,Estimated_business_as_usual,emission_id,sector
0,1,2.0,,,,264,Total
1,2,25.0,2020.0,,,265,Total
2,3,7.0,2020.0,,,236,Total
3,4,35.0,2020.0,,,161,Total
4,5,40.0,2030.0,Overall community emissions,,172,Total
...,...,...,...,...,...,...,...
677,678,10.0,2020,,,162,Total
678,679,40.0,2030,,,162,Total
679,680,65.0,2040,,,162,Total
680,681,80.0,2050,,,162,Total


In [200]:
# Below to replace all \n with space character, the commented section below can be used to check

final_target_emissions_df['Target_boundary'] = final_target_emissions_df['Target_boundary'].str.replace('\n', ' ', regex=False)


# Check if the specified column contains '\n'
contains_newline = final_target_emissions_df['Target_boundary'].str.contains('\n', regex=False, na=False)

# To see if any value contains '\n'
any_contains_newline = contains_newline.any()

print(f"Does the column contain values with '\\n'? {any_contains_newline}")

# If you want to see the rows where this condition is true
if any_contains_newline:
    print("Rows where the column contains '\\n':")
    print(final_target_emissions_df[contains_newline])


Does the column contain values with '\n'? False


In [201]:
final_target_emissions_df.to_csv('DBassign1/target_emission_table.csv', sep=';', index=False)
final_target_type_df.to_csv('DBassign1/type_of_target_table.csv', sep=';', index=False)