In [28]:
import pandas

import pygsheets
import numpy
import re

# import data directly from google sheets

In [2]:
gc = pygsheets.authorize(service_account_env_var='GDRIVE_API_CREDENTIALS')
#spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek')
spreadsheet = gc.open_by_key('1VnhD3K8bUn-CwTGUt-XkF42jcWAaZwasKIryEO7V1j4') # May 2024 release

gas_pipes = spreadsheet.worksheet('title','Gas pipelines').get_as_df(start='A3')
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df(start='A3')

pipes_df_orig = oil_pipes.copy() #pandas.concat([gas_pipes, oil_pipes], ignore_index=True)

#get country ratios sheet
country_ratios_df = spreadsheet.worksheet('title', 'Country ratios by pipeline').get_as_df()
region_df_orig = spreadsheet.worksheet('title', 'Country dictionary').get_as_df(start='A2')

In [3]:
fuel_type = 'Oil'

In [4]:
gas_fuel_options = ['Gas']
ngl_fuel_options = ['NGL', 
                    'NGL, oil products', 
                    'Oil, NGL', 
                    'Oil, NGL, naphtha']
oil_fuel_options = ['Oil', 
                    'Oil, NGL', 
                    'Oil, NGL, naphtha']

In [5]:
owners_df_orig = spreadsheet.worksheet('title', 'Pipeline operators/owners (1/3)').get_as_df(start='A2')
owners_df_orig = owners_df_orig.loc[owners_df_orig.ProjectID!='']
owners_df_orig = owners_df_orig.loc[owners_df_orig.Wiki!='']
owners_df = owners_df_orig.replace('',numpy.nan)

owner_parent_links_df = spreadsheet.worksheet('title', 'Owner–parent relationships (2/3)').get_as_df(start='A2')
# only keep the owners with a checked relationship
owner_parent_links_df = owner_parent_links_df.loc[owner_parent_links_df['Parent–Owner Relationship Checked?']=='yes']
owner_parent_links_df.replace('',numpy.nan,inplace=True)

parents_df = spreadsheet.worksheet('title', 'Parent metadata (3/3)').get_as_df(start='A2')
parents_df = parents_df.loc[parents_df.Parent!='']

owners_df.set_index('ProjectID', inplace=True)
owner_parent_links_df.set_index('Owner', inplace=True)
parents_df.set_index('Parent', inplace=True)

# ****************************************

## create list of owner and parent column names
owner_pct_col_names = []
owner_col_names = []

parent_pct_col_names = []
parent_col_names = []

for num in range(1,11+1):
    owner_pct_col = f'Owner{num}%'
    owner_pct_col_names.append(owner_pct_col)
    
    owner_col = f'Owner{num}'
    owner_col_names.append(owner_col)
    
    parent_pct_col = f'Parent{num}%'
    parent_pct_col_names.append(parent_pct_col)
    
    parent_col = f'Parent{num}'
    parent_col_names.append(parent_col)

# ****************************************
## fill in missing parent info by borrowing owner info
owners_FULL_set = owners_df[owner_col_names].stack().dropna().unique().tolist() # from owners_df
owners_researched_set = list(set(owner_parent_links_df.index.to_list()))#+['Unknown'] # only existing owners, plus 'Unknown'
owners_diff = list(set(owners_FULL_set)-set(owners_researched_set))
owners_diff.append('Unknown')

# update owner_parent_links_df with these extra owners
owner_parent_links_df = pandas.concat([owner_parent_links_df, pandas.DataFrame(index=owners_diff, columns=owner_parent_links_df.columns)])
owner_parent_links_df['Parent1'].loc[owners_diff] = owners_diff
owner_parent_links_df['Parent1%'].loc[owners_diff] = '100.00%'

# ****************************************
# update parents_df with these as well
# note countries will be unknkown...
parents_set = list(set(parents_df.index.to_list()))
parents_diff = list(set(owners_diff)-set(parents_set))
parents_diff.append('Unknown')
parents_df = pandas.concat([parents_df, pandas.DataFrame(numpy.nan, index=parents_diff, columns=parents_df.columns)])
parents_df.loc[parents_diff,'ParentHQCountry'] = 'Unknown'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  owner_parent_links_df['Parent1'].loc[owners_diff] = owners_diff
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update t

In [19]:
status_list = ['proposed', 
               'construction', 
               'shelved', 
               'cancelled', 
               'operating', 
               'idle', 
               'mothballed', 
               'retired']
country_list = sorted(set(region_df_orig['Country'].tolist()))
region_list = sorted(set(region_df_orig['Region'].tolist()))
if '--' in region_list:
    region_list.remove('--')
subregion_list = sorted(set(region_df_orig['SubRegion'].tolist()))
if '--' in subregion_list:
    subregion_list.remove('--')

## replace "--" with NaN, removing empty rows

the dataset is structured to have -- wherever there's a lookup value that doesn't exist; replacing it with NaN (numpy.nan) allows pandas to treat it as a null value, which makes calculations much easier

In [20]:
# replace -- entries with NaN
pipes_df_orig = pipes_df_orig.replace('--', numpy.nan)
pipes_df_orig = pipes_df_orig[pipes_df_orig['PipelineName']!='']

missing_wiki_projectids = pipes_df_orig.loc[pipes_df_orig.Wiki==''].ProjectID.tolist()
pipes_df_orig = pipes_df_orig[pipes_df_orig['RouteAccuracy']!='']

country_ratios_df.replace('--', numpy.nan, inplace=True)
country_ratios_df = country_ratios_df.loc[~country_ratios_df.ProjectID.isin(missing_wiki_projectids)]

# km by country, km by region calculations

In [21]:
dict_subregion_region = pandas.Series(region_df_orig.Region.values, index=region_df_orig.SubRegion).to_dict()
#dict_subregion_region

In [22]:
region_df_orig_cleaned = region_df_orig.loc[(region_df_orig.Region!='--')&
                                            (region_df_orig.SubRegion!='--')]
multiindex_region_subregion = region_df_orig_cleaned.groupby(['Region','SubRegion'])['Country'].count().index
multiindex_region_subregion_country = region_df_orig_cleaned.groupby(['Region','SubRegion','Country'])['Country'].count().index

In [23]:
country_ratios_fuel_df = country_ratios_df[country_ratios_df.Fuel.isin(oil_fuel_options)]

km_by_country_df = pandas.DataFrame(columns=status_list, index=country_list)
km_by_subregion_df = pandas.DataFrame(columns=status_list, index=multiindex_region_subregion)
km_by_region_df = pandas.DataFrame(columns=status_list, index=region_list)

print('===country-level calculations===')
for status in status_list:
    print(status)
    country_ratios_fuel_df_status = country_ratios_fuel_df[country_ratios_fuel_df['Status']==status]
    km_by_country_df[status] = country_ratios_fuel_df_status.groupby('Country')['LengthMergedKmByCountry'].sum()
    km_by_subregion_df[status] = country_ratios_fuel_df_status.groupby(['Region','SubRegion'])['LengthMergedKmByCountry'].sum()
    km_by_region_df[status] = country_ratios_fuel_df_status.groupby('Region')['LengthMergedKmByCountry'].sum()

# # fill NaN with 0.0
km_by_subregion_df = km_by_subregion_df.fillna(0)
km_by_country_df = km_by_country_df.fillna(0)
km_by_region_df = km_by_region_df.fillna(0)

#km_by_region_df.sort_index(level='Region', inplace=True)
#km_by_region_df = km_by_region_df.loc[~(km_by_region_df==0).all(axis=1)]

# total
# km_by_region_df.loc['Total',:] = km_by_region_df.sum(axis=0).values
# km_by_country_df.loc['Total',:] = km_by_country_df.sum(axis=0).values

km_by_subregion_df['proposed+construction'] = km_by_subregion_df[['proposed','construction']].sum(axis=1)
km_by_subregion_df = km_by_subregion_df[['proposed', 'construction', 'proposed+construction', 'shelved', 'cancelled', 'operating', 'idle', 'mothballed', 'retired']]

km_by_country_df['proposed+construction'] = km_by_country_df[['proposed','construction']].sum(axis=1)
km_by_country_df.sort_values('proposed+construction', ascending=False, inplace=True)
km_by_country_df = km_by_country_df.loc[~(km_by_country_df==0).all(axis=1)]
km_by_country_df.loc[:,'Region'] = region_df_orig.set_index('Country').loc[km_by_country_df.index.tolist()].Region
km_by_country_df.loc[:,'Subregion'] = region_df_orig.set_index('Country').loc[km_by_country_df.index.tolist()].SubRegion
km_by_country_df = km_by_country_df[['Region','Subregion','proposed', 'construction', 'proposed+construction', 'shelved', 'cancelled', 'operating', 'idle', 'mothballed', 'retired']]
km_by_country_df = km_by_country_df.loc[(km_by_country_df.Region!='--')&
                                        (km_by_country_df.Subregion!='--')]

km_by_region_df['proposed+construction'] = km_by_region_df[['proposed','construction']].sum(axis=1)
km_by_region_df = km_by_region_df[['proposed', 'construction', 'proposed+construction', 'shelved', 'cancelled', 'operating', 'idle', 'mothballed', 'retired']]

km_by_subregion_df.index.set_names(['Region','Subregion'], inplace=True)
km_by_subregion_df.loc['Total',:] = km_by_subregion_df.sum(axis=0).values
km_by_subregion_df.replace(0,'', inplace=False)

===country-level calculations===
proposed
construction
shelved
cancelled
operating
idle
mothballed
retired


Unnamed: 0_level_0,Unnamed: 1_level_0,proposed,construction,proposed+construction,shelved,cancelled,operating,idle,mothballed,retired
Region,Subregion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Africa,Northern Africa,68.5,100.0,168.5,488.0,620.04,16395.8,,,
Africa,Sub-Saharan Africa,6482.26,1950.0,8432.26,498.13,2100.0,7457.94,,,
Americas,Latin America and the Caribbean,873.0,525.0,1398.0,,3937.83,26755.13,,,
Americas,Northern America,1255.34,214.38,1469.72,6481.75,34111.89,111353.42,,,6453.81
Asia,Central Asia,,,,,2048.97,9207.62,,765.56,
Asia,Eastern Asia,4467.67,476.28,4943.95,,7144.88,28999.58,46.35,,7601.15
Asia,South-eastern Asia,228.02,,228.02,,306.0,1754.52,,,
Asia,Southern Asia,69.47,6184.0,6253.47,,6780.04,24864.24,,,
Asia,Western Asia,6898.35,175.0,7073.35,1544.61,2108.88,21890.4,837.0,3208.28,1785.54
Europe,Eastern Europe,1510.34,1237.84,2748.18,,4893.7,55185.79,2376.0,1621.29,1401.9


In [24]:
#km_by_country_df.to_excel('km-by-country-region-subregion.xlsx')
km_by_country_df

Unnamed: 0,Region,Subregion,proposed,construction,proposed+construction,shelved,cancelled,operating,idle,mothballed,retired
China,Asia,Eastern Asia,3880.37,476.28,4356.65,0.00,7144.88,28951.54,46.35,0.00,7601.15
Iraq,Asia,Western Asia,3755.66,75.00,3830.66,0.00,898.80,5241.73,0.00,935.15,0.00
Iran,Asia,Southern Asia,57.00,2920.00,2977.00,0.00,1536.40,14474.28,0.00,0.00,0.00
India,Asia,Southern Asia,0.00,2824.00,2824.00,0.00,1338.00,9254.17,0.00,0.00,0.00
Syria,Asia,Western Asia,2140.41,0.00,2140.41,0.00,0.00,169.51,0.00,854.17,217.11
...,...,...,...,...,...,...,...,...,...,...,...
Ethiopia,Africa,Sub-Saharan Africa,0.00,0.00,0.00,498.13,0.00,0.00,0.00,0.00,0.00
France,Europe,Western Europe,0.00,0.00,0.00,0.00,0.00,4696.81,257.61,0.00,0.00
Gabon,Africa,Sub-Saharan Africa,0.00,0.00,0.00,0.00,0.00,475.00,0.00,0.00,0.00
Georgia,Asia,Western Asia,0.00,0.00,0.00,0.00,0.00,247.54,386.72,0.00,0.00


# save km by country for dashboard

In [25]:
missing_countries = list(set(region_df_orig.Country.tolist())-set(km_by_country_df.index.tolist()))
# add empty missing rows so geometries will still be plotted in Flourish
for cntry in missing_countries:
    km_by_country_df.loc[cntry] = 0

# add 3 letter code for Flourish map
km_by_country_df['CountryISO3166-1alpha-3'] = region_df_orig.set_index('Country').loc[km_by_country_df.index,'CountryISO3166-1alpha-3']
km_by_country_df['Region'] = region_df_orig.set_index('Country').loc[km_by_country_df.index,'Region']
km_by_country_df['Subregion'] = region_df_orig.set_index('Country').loc[km_by_country_df.index,'SubRegion']
km_by_country_df.sort_values('construction', ascending=False).drop('Antarctica').replace(0,'').to_excel('km-by-country-region-subregion-sorted-by-construction.xlsx')
#km_by_country_df.sort_values('construction', ascending=False).drop('Antarctica').to_excel('km-by-country-region-subregion-sorted-by-construction.xlsx')

In [26]:
km_by_country_df

Unnamed: 0,Region,Subregion,proposed,construction,proposed+construction,shelved,cancelled,operating,idle,mothballed,retired,CountryISO3166-1alpha-3
China,Asia,Eastern Asia,3880.37,476.28,4356.65,0.0,7144.88,28951.54,46.35,0.00,7601.15,CHN
Iraq,Asia,Western Asia,3755.66,75.00,3830.66,0.0,898.80,5241.73,0.00,935.15,0.00,IRQ
Iran,Asia,Southern Asia,57.00,2920.00,2977.00,0.0,1536.40,14474.28,0.00,0.00,0.00,IRN
India,Asia,Southern Asia,0.00,2824.00,2824.00,0.0,1338.00,9254.17,0.00,0.00,0.00,IND
Syria,Asia,Western Asia,2140.41,0.00,2140.41,0.0,0.00,169.51,0.00,854.17,217.11,SYR
...,...,...,...,...,...,...,...,...,...,...,...,...
Belize,Americas,Latin America and the Caribbean,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,BLZ
Tuvalu,Oceania,Polynesia,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,TUV
Guernsey,Europe,Northern Europe,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,GGY
Liechtenstein,Europe,Western Europe,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,LIE


# parent analysis

In [29]:
owner_parent_calculations_df = pandas.DataFrame()
# needs country, km in each country columns as well

for idx,row in country_ratios_df.loc[(country_ratios_df.Fuel.isin(oil_fuel_options))&
                                (country_ratios_df.Status.isin(['proposed','construction']))].iterrows():
    parent_string = pipes_df_orig.loc[pipes_df_orig.ProjectID==row.ProjectID].Parent.values[0]
    #print(parent_string)
    if parent_string == '--':
        parent_string = 'Unknown [unknown %]'
    parent_list = re.sub(' \[.*?\]', '', parent_string).split('; ') # all entries must have an "Owner [%]" syntax, so [unknown %] is included
    percent_list = [float(i.rstrip('%'))/100. for i in re.findall('\\d+(?:\\.\\d+)?%', parent_string)]
    #print(percent_list)

    if parent_list.__len__()!=percent_list.__len__():
        if percent_list==[]:
            percent_list = [1/parent_list.__len__() for i in parent_list]
        else:
            nmissing = parent_list.__len__()-percent_list.__len__()
            # distribute nans evenly
            total = numpy.nansum(percent_list)
            leftover = 1-total
            percent_list += [leftover/nmissing]*nmissing

    # for when database isn't completely filled in for owners:
    for p_idx,parent in enumerate(parent_list):
        if parent not in parents_df.index.to_list():
            parents_df.loc[parent,'ParentHQCountry']='Unknown'
        owner_parent_calculations_df = pandas.concat([owner_parent_calculations_df, 
                                                      pandas.DataFrame([{'Parent':parent, 'ProjectID':row.ProjectID, 
                                                                         'FractionOwnership':percent_list[p_idx],
                                                                         'ParentHQCountry':parents_df.loc[parents_df.index==parent]['ParentHQCountry'].values[0],
                                                                         'PipelineCountry':row.Country,
                                                                         'Status':row.Status,
                                                                         'LengthMergedKmByCountry':row.LengthMergedKmByCountry,
                                                                         'LengthMergedKmByPipeline':row.LengthMergedKmByPipeline,
                                                                         'CapacityBOEd':row.CapacityBOEd}])])

owner_parent_calculations_df['KmOwnershipByCountry'] = owner_parent_calculations_df.FractionOwnership*owner_parent_calculations_df.LengthMergedKmByCountry

## print out leading parent companies

In [30]:
parent_country_km_df = \
owner_parent_calculations_df.loc[#(owner_parent_calculations_df.PipelineCountry.isin(region_df_touse.Country.tolist()))&
                                 (owner_parent_calculations_df.Status.isin(['proposed','construction']))]

parent_country_km_df_table = pandas.DataFrame(index=parent_country_km_df.Parent.unique().tolist(),
                                             columns=['Pipeline Country Locations','Owner Headquarters Country',
                                                      'proposed','construction','In Development (proposed + construction)','ProjectIDs'])

parent_country_km_df_table['proposed'] = parent_country_km_df.loc[parent_country_km_df.Status=='proposed'].groupby('Parent').sum('KmOwnershipByCountry')[['KmOwnershipByCountry']]
parent_country_km_df_table['construction'] = parent_country_km_df.loc[parent_country_km_df.Status=='construction'].groupby('Parent').sum('KmOwnershipByCountry')[['KmOwnershipByCountry']]

parent_country_km_df_table = parent_country_km_df_table.loc[parent_country_km_df_table.index!='']

In [31]:
# top n parents
n=20

In [44]:
country_agg_df = pandas.DataFrame(owner_parent_calculations_df.groupby(['Parent']).PipelineCountry.agg(lambda x: ', '.join(x.unique())))
projectid_agg_df = pandas.DataFrame(owner_parent_calculations_df.groupby(['Parent']).ProjectID.agg(lambda x: ', '.join(x.unique())))
parent_country_km_df_table['Pipeline Country Locations'] = country_agg_df.PipelineCountry
parent_country_km_df_table['Number of Projects'] = owner_parent_calculations_df.groupby(['Parent']).ProjectID.count()
parent_country_km_df_table['ProjectIDs'] = projectid_agg_df.ProjectID
parent_country_km_df_table['Owner Headquarters Country'] = parent_country_km_df.drop_duplicates('Parent').set_index('Parent').ParentHQCountry
parent_country_km_df_table['In Development (proposed + construction)'] = parent_country_km_df_table[['proposed','construction']].sum(axis=1, min_count=0)
#parent_country_km_df_table.loc[~parent_country_km_df_table.index.isin(['Not in database','Unknown'])].replace(numpy.nan, 0).sort_values('In Development (proposed + construction)', ascending=False)[:n].to_excel('parent-country-km-table.xlsx')
parent_country_km_df_table.loc[~parent_country_km_df_table.index.isin(['Not in database','Unknown'])].replace(numpy.nan, 0).sort_values('In Development (proposed + construction)', ascending=False)[:n]


Unnamed: 0,Pipeline Country Locations,Owner Headquarters Country,proposed,construction,In Development (proposed + construction),ProjectIDs,Number of Projects
unknown,"China, Angola, Zambia, Syria, Iraq, Iran, Unit...",unknown,4064.21,525.98,4590.19,"P5176, P5177, P5178, P5239, P5277, P5278, P527...",16
Iraq Ministry of Oil,"Jordan, Iraq, Syria",Iraq,4053.27,75.0,4128.27,"P0544, P3874, P3875, P5240, P5284, P5285, P5291",11
Iran Ministry of Petroleum,Iran,Iran,57.0,2463.0,2520.0,"P2221, P2222, P2226, P2229, P3848, P5287, P533...",19
China National Petroleum Corporation,"Benin, Nigeria, Niger",China,0.0,1950.0,1950.0,P1328,3
"National Petroleum and Natural Gas Pipeline Network Group Co., Ltd.",China,China,1724.7886,0.0,1724.7886,"P2037, P3750, P3770, P6095, P6096, P6270, P6298",7
Numaligarh Refinery Limited,India,India,0.0,1630.0,1630.0,P3843,1
TotalEnergies,"South Sudan, Kenya, Tanzania, Uganda",France,1377.345,0.0,1377.345,"P0531, P0538, P0541",5
Indian Oil Corporation,India,India,0.0,1194.0,1194.0,P3889,1
Government of Zambia,"Zambia, Tanzania",Zambia,1140.57,0.0,1140.57,P2481,2
"Hongrun Petrochemical Co., Ltd.","Japan, China, South Korea",China,986.67,0.0,986.67,P6185,3


# fluvial owner plot for dashboard

In [45]:
top_n_parents_df = parent_country_km_df_table.drop('unknown').sort_values('In Development (proposed + construction)', ascending=False)[:n]

In [46]:
top_n_parents_df['In Development (proposed + construction)'].sum()/parent_country_km_df_table['In Development (proposed + construction)'].sum()

0.7062533525385829

In [47]:
top_n_parents_list = top_n_parents_df.index.tolist()

In [48]:
owner_parent_calculations_df.loc[(owner_parent_calculations_df.Parent.isin(top_n_parents_list))&
                                (owner_parent_calculations_df.Status.isin(['proposed','construction']))][['Parent',
                                                                                                          'ParentHQCountry',
                                                                                                          'PipelineCountry',
                                                                                                          'Status',
                                                                                                          'LengthMergedKmByCountry']].replace({'proposed':'Proposed','construction':'Construction'}).to_excel(f'parent-fluvial-diagram-top-{n}.xlsx')

# over X km parents list

In [None]:
km_threshold = 500
temp_df = owner_parent_calculations_df.groupby('Parent')[['KmOwnershipByCountry']].sum()
threshold_km_parents_df = temp_df.loc[temp_df.KmOwnershipByCountry>km_threshold]
threshold_km_parents_list = threshold_km_parents_df.drop('unknown').index.tolist()
threshold_km_parents_list.__len__()

In [None]:
threshold_km_parents_df

In [None]:
owner_parent_calculations_df.loc[(owner_parent_calculations_df.Parent.isin(threshold_km_parents_list))&
                                (owner_parent_calculations_df.Status.isin(['proposed','construction']))][['Parent',
                                                                                                          'ParentHQCountry',
                                                                                                          'PipelineCountry',
                                                                                                          'Status',
                                                                                                          'LengthMergedKmByCountry']].replace({'proposed':'Proposed','construction':'Construction'}).to_excel(f'parent-building-over-{str(km_threshold)}km.xlsx')

In [None]:
threshold_km_parents_df.drop('unknown').sum()/parent_country_km_df_table['In Development (proposed + construction)'].sum()

# world connections globe

In [None]:
owner_parent_connections_df = owner_parent_calculations_df.copy().reset_index(drop=True)#.to_excel('owner-parent-calculations-df.xlsx')
owner_parent_connections_df = owner_parent_connections_df.loc[owner_parent_connections_df.ParentHQCountry!='unknown']
#owner_parent_connections_df['HQ_3letter'] = numpy.nan
owner_parent_connections_df['HQ_3letter'] = region_df_orig.set_index('Country').loc[
           owner_parent_connections_df.ParentHQCountry.tolist(), 'CountryISO3166-1alpha-3'].values
owner_parent_connections_df['PipelineCountry_3letter'] = region_df_orig.set_index('Country').loc[
           owner_parent_connections_df.PipelineCountry.tolist(), 'CountryISO3166-1alpha-3'].values

In [None]:
owner_parent_connections_df.to_excel('owner-parent-connections-dataframe.xlsx')

# cost estimates (pipeline cost per km)

## pick out high and low quantiles

In [None]:
temp_df = pipes_df_orig.loc[(~pipes_df_orig.CostUSDPerKm.isnull())&
                            (pipes_df_orig.Fuel.isin(oil_fuel_options))]
qlo_val = 0.025
qhi_val = 0.975

q_lo=temp_df['CostUSDPerKm'].quantile(qlo_val)
q_hi=temp_df['CostUSDPerKm'].quantile(qhi_val)
print(temp_df['CostUSDPerKm'].quantile(qlo_val))
print(temp_df['CostUSDPerKm'].quantile(qhi_val))

temp_df = temp_df.loc[temp_df['CostUSDPerKm'].between(q_lo, q_hi, inclusive='neither')]

In [None]:
# pull out only pipelines that have a KNOWN length AND a cost
country_ratios_with_length_and_cost_df = country_ratios_df.loc[(country_ratios_df.Fuel.isin(oil_fuel_options)) & 
                                                               (country_ratios_df['CostUSDPerKm'].notna()) & 
                                                               (country_ratios_df['LengthKnownKmByCountry'].notna()) #&
                                                               #(country_ratios_df['LengthKnownKm']!=0) &
                                                               #(country_ratios_df['CostUSDPerKm']<10e6)
                                                              ]

country_ratios_with_length_and_cost_df = country_ratios_with_length_and_cost_df.loc[
    country_ratios_with_length_and_cost_df['CostUSDPerKm'].between(q_lo, q_hi, inclusive='neither')]
#country_ratios_with_length_and_cost_df = country_ratios_with_length_and_cost_df[~country_ratios_with_length_and_cost_df.ProjectID.isin(outliers_projectids)]

In [None]:
country_ratios_df.loc[(country_ratios_df.Fuel.isin(oil_fuel_options)) & 
                    (country_ratios_df['CostUSDPerKm'].notna()) & 
                    (country_ratios_df['LengthKnownKmByCountry'].notna())].shape

In [None]:
country_ratios_df.loc[(country_ratios_df.Fuel.isin(oil_fuel_options))].shape

### global mean value

In [None]:
global_mean = country_ratios_with_length_and_cost_df['CostUSDPerKm'].drop_duplicates().mean()
country_ratios_with_length_and_cost_df['CostUSDPerKm'].drop_duplicates().mean()

### calculate regional costs

In [None]:
region_list

In [None]:
pipes_costs_region_df = pandas.DataFrame(0, index=region_list, columns=['CostUSDPerKm','DataPoints'])#,'NumberOfLengths'])

for region in region_list:
    country_ratios_region_df = country_ratios_with_length_and_cost_df.loc[country_ratios_with_length_and_cost_df['Region']==region,:]
    pipes_costs_region_df.loc[region,'CostUSDPerKm'] = country_ratios_region_df['CostUSDPerKm'].mean()
    pipes_costs_region_df.loc[region,'DataPoints'] = list(set(country_ratios_region_df['ProjectID'])).__len__()
    

In [None]:
pipes_costs_subregion_df = pandas.DataFrame(0, index=subregion_list, columns=['CostUSDPerKm','DataPoints'])#,'NumberOfLengths'])

for subregion in subregion_list:
    country_ratios_subregion_df = country_ratios_with_length_and_cost_df.loc[country_ratios_with_length_and_cost_df['SubRegion']==subregion,:]
    n_datapoints = list(set(country_ratios_subregion_df['ProjectID'])).__len__()
    if n_datapoints < 3:
        pipes_costs_subregion_df.loc[subregion,'DataPoints'] = list(set(country_ratios_subregion_df['ProjectID'])).__len__()
        pipes_costs_subregion_df.loc[subregion,'CostUSDPerKm'] = pipes_costs_region_df.loc[dict_subregion_region[subregion],'CostUSDPerKm']
    else:
        pipes_costs_subregion_df.loc[subregion,'DataPoints'] = list(set(country_ratios_subregion_df['ProjectID'])).__len__()
        pipes_costs_subregion_df.loc[subregion,'CostUSDPerKm'] = country_ratios_subregion_df['CostUSDPerKm'].mean()

pipes_costs_subregion_df.sort_values('CostUSDPerKm', ascending=False)

In [None]:
pipes_costs_region_df

In [None]:
country_ratios_region_df['CostUSDPerKm'].mean()

In [None]:
pandas.options.display.float_format = '{:,.3f}'.format
temp_df = pipes_costs_region_df.copy()
temp_df['CostUSDPerKm'] = temp_df['CostUSDPerKm']/1e6
temp_df.sort_values('CostUSDPerKm', ascending=False)#.loc[region]['CostUSDPerKm']

In [None]:
pandas.options.display.float_format = '{:,.3f}'.format
temp_df = pipes_costs_subregion_df.copy()
temp_df['CostUSDPerKm'] = temp_df['CostUSDPerKm']/1e6
temp_df.sort_values('CostUSDPerKm', ascending=False)#.loc[region]['CostUSDPerKm']

# tables etc.

## table for stranded asset calculations

## country-level capex estimates

In [None]:
country_ratios_df_specific_fuel = country_ratios_df.loc[country_ratios_df.Fuel.isin(oil_fuel_options)]
country_ratios_df_specific_fuel.reset_index(drop=True, inplace=True)
country_ratios_df_specific_fuel.loc[:,'CostUSDEstimate'] = numpy.nan

for idx,row in country_ratios_df_specific_fuel.iterrows():
    # calculate cost
    cntry = row.Country
    #region = row.Region
    subregion = row.SubRegion
    km_by_cntry = row.LengthMergedKmByCountry
    #country_ratios_df.loc[idx,'CostUSDEstimate'] = pipes_costs_region_df.loc[region, 'CostUSDPerKm'] * km_by_cntry
    country_ratios_df_specific_fuel.loc[idx,'CostUSDEstimate'] = pipes_costs_subregion_df.loc[subregion, 'CostUSDPerKm'] * km_by_cntry

# replace any known costs now
country_ratios_df_specific_fuel.loc[(~country_ratios_df_specific_fuel.LengthKnownKmByCountry.isna())&
                    (~country_ratios_df_specific_fuel.CostUSDPerKm.isna()),'CostUSDEstimate'] = \
country_ratios_df_specific_fuel.loc[(~country_ratios_df_specific_fuel.LengthKnownKmByCountry.isna())&
                    (~country_ratios_df_specific_fuel.CostUSDPerKm.isna()), 'LengthMergedKmByCountry'] * \
country_ratios_df_specific_fuel.loc[(~country_ratios_df_specific_fuel.LengthKnownKmByCountry.isna())&
                    (~country_ratios_df_specific_fuel.CostUSDPerKm.isna()), 'CostUSDPerKm']

In [None]:
country_ratios_df_specific_fuel.SubRegion.unique()

In [None]:
capex_by_country_df = pandas.DataFrame(columns=status_list, index=country_list)
capex_by_region_df = pandas.DataFrame(columns=status_list, index=region_list)
capex_by_subregion_df = pandas.DataFrame(columns=status_list, index=multiindex_region_subregion)

print('===country-level calculations===')
for status in status_list:
    print(status)
    country_ratios_df_specific_fuel_status = country_ratios_df_specific_fuel.loc[country_ratios_df_specific_fuel.Status==status]
    country_ratios_df_specific_fuel_status = country_ratios_df_specific_fuel_status.loc[~country_ratios_df_specific_fuel_status.SubRegion.isnull()]
    capex_by_country_df[status] = country_ratios_df_specific_fuel_status.groupby('Country')['CostUSDEstimate'].sum()/1e9
    capex_by_region_df[status] = country_ratios_df_specific_fuel_status.groupby('Region')['CostUSDEstimate'].sum()/1e9
    capex_by_subregion_df[status] = country_ratios_df_specific_fuel_status.groupby(['Region','SubRegion'])['CostUSDEstimate'].sum()/1e9

# # fill NaN with 0.0
capex_by_region_df = capex_by_region_df.fillna(0)
capex_by_country_df = capex_by_country_df.fillna(0)
capex_by_subregion_df = capex_by_subregion_df.fillna(0)

capex_by_region_df['proposed+construction'] = capex_by_region_df[['proposed','construction']].sum(axis=1)
capex_by_region_df = capex_by_region_df[['proposed', 'construction', 'proposed+construction', 'shelved', 'cancelled', 'operating', 'idle', 'mothballed', 'retired']]
# capex_by_region_df.loc['Total',:] = capex_by_region_df.sum(axis=0).values

capex_by_country_df['proposed+construction'] = capex_by_country_df[['proposed','construction']].sum(axis=1)
capex_by_country_df = capex_by_country_df[['proposed', 'construction', 'proposed+construction', 'shelved', 'cancelled', 'operating', 'idle', 'mothballed', 'retired']]
#capex_by_country_df.sort_values('construction', ascending=False, inplace=True)
# capex_by_country_df.loc['Total',:] = capex_by_country_df.sum(axis=0).values

capex_by_subregion_df['proposed+construction'] = capex_by_subregion_df[['proposed','construction']].sum(axis=1)
capex_by_subregion_df = capex_by_subregion_df[['proposed', 'construction', 'proposed+construction', 'shelved', 'cancelled', 'operating', 'idle', 'mothballed', 'retired']]
# capex_by_subregion_df.loc['Total',:] = capex_by_subregion_df.sum(axis=0).values

# save capex by country for dashboard

In [None]:
missing_countries = list(set(region_df_orig.Country.tolist())-set(capex_by_country_df.index.tolist()))
# add empty missing rows so geometries will still be plotted in Flourish
for cntry in missing_countries:
    capex_by_country_df.loc[cntry] = 0

# add 3 letter code for Flourish map
capex_by_country_df['CountryISO3166-1alpha-3'] = region_df_orig.set_index('Country').loc[capex_by_country_df.index,'CountryISO3166-1alpha-3']
# add regions, subregions
capex_by_country_df['Region'] = region_df_orig.set_index('Country').loc[capex_by_country_df.index,'Region']
capex_by_country_df['Subregion'] = region_df_orig.set_index('Country').loc[capex_by_country_df.index,'SubRegion']

capex_by_country_df.sort_values('proposed+construction', ascending=False).to_excel('capex-by-country-sorted-by-in-dev.xlsx')

In [None]:
capex_by_country_df

In [None]:
capex_by_region_df.replace(0,'')

In [None]:
capex_by_subregion_df.index = capex_by_subregion_df.index.set_names(['Region','Subregion'])
capex_by_subregion_df.replace(0,'')

## sort into categories for capex map

In [None]:
capex_by_country_df['proposed+construction'].plot.hist(bins=100)

In [None]:
capex_by_country_df['proposed+construction'].describe()

In [None]:
#bins = [0,0.05,0.1,0.2,0.3,0.4,0.5,1,2,3,4,5,10,15,20,25,30]
bins = [0,1,5,10,20,30]
# create names for these bins, add an empty one up front
category_names = ['']+[f'US${str(bins[i])}–{str(bins[i+1])} billion' for i in range(bins.__len__()-1)]

# sort values into bins
bin_indices = numpy.digitize(capex_by_country_df['proposed+construction'].values, bins) # starts at 1

capex_by_country_df['proposed+construction bins'] = numpy.array(category_names)[bin_indices]

capex_by_country_df.sort_values('proposed+construction', ascending=False).to_excel('capex-by-country-sorted-by-in-dev-binned.xlsx')

# how much per year

In [None]:
pipes_started = pipes_df_orig.loc[(pipes_df_orig.Status=='operating') & 
                                    (pipes_df_orig.Fuel.isin(oil_fuel_options))]
pipes_started_sum = pipes_started.groupby('StartYearEarliest')['LengthMergedKm'].sum()


pipes_proposed = pipes_df_orig.loc[(pipes_df_orig.Status=='proposed') & 
                                    (pipes_df_orig.Fuel.isin(oil_fuel_options))]
pipes_proposed_sum = pipes_proposed.groupby('ProposalYear')['LengthMergedKm'].sum()


pipes_construction = pipes_df_orig.loc[(pipes_df_orig.Status=='construction') & 
                                    (pipes_df_orig.Fuel.isin(oil_fuel_options))]
pipes_construction_sum = pipes_construction.groupby('ConstructionYear')['LengthMergedKm'].sum()

In [None]:
if fuel_type == 'Gas':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2025)), columns=['Gas pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['Gas pipeline km'] = pipes_started_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

if fuel_type == 'Oil':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2025)))
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['Oil pipeline km operating'] = pipes_started_sum
    km_by_start_year['Oil pipeline km construction'] = pipes_construction_sum
    km_by_start_year['Oil pipeline km proposed'] = pipes_proposed_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

if fuel_type == 'NGL':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2025)), columns=['NGL pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['NGL pipeline km'] = pipes_started_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

km_by_start_year.loc['Total',:] = km_by_start_year.sum(axis=0)

km_by_start_year.to_excel('km-by-start-year.xlsx')
#km_by_start_year

# numbers for dashboard

In [None]:
# fraction of pipelines under construction compared to all in development
# this number is different from below because it's skipping some pipelines
km_by_country_total = km_by_country_df.sum(axis=0)
km_by_country_total['construction']/km_by_country_total['proposed+construction']

In [None]:
km_by_country_total.construction.sum()

In [None]:
country_ratios_fuel_df.columns

## total operating in world

In [None]:
country_ratios_fuel_df.loc[country_ratios_fuel_df.Status.isin(['operating'])].LengthMergedKmByCountry.sum()

## came online this past year?

In [None]:
country_ratios_fuel_df.loc[(country_ratios_fuel_df.Status.isin(['operating'])) &
                            (country_ratios_fuel_df.StartYearEarliest==2023)].LengthMergedKmByCountry.sum()

## in dev in past 5 years

In [None]:
country_ratios_fuel_df.loc[(country_ratios_fuel_df.Status.isin(['proposed','construction'])) &
                            (
                                country_ratios_fuel_df.ProposalYear.isin(list(range(2019,2024))) |
                                country_ratios_fuel_df.ConstructionYear.isin(list(range(2019,2024)))
                            )].LengthMergedKmByCountry.sum()

## shelved or cancelled in past 5 years

In [None]:
# shelved or cancelled in past 5 years (2019–2023)
country_ratios_fuel_df.loc[(country_ratios_fuel_df.Status.isin(['shelved','cancelled'])) &
                            (
                                country_ratios_fuel_df.ShelvedYear.isin(list(range(2019,2024))) |
                                country_ratios_fuel_df.CancelledYear.isin(list(range(2019,2024)))
                            )].LengthMergedKmByCountry.sum()