# Notes

code for producing GOIT pipelines summary stats, and for calculating landing page stats

this is saved as an Excel file, which Baird copies/pastes into the existing summary tables information on the drive here:
https://docs.google.com/spreadsheets/d/1OYH6D7c-D0FsL5GzBGijtkmvQCTkBUclj-UVoOieUFo/edit

In [1]:
import pandas
import numpy
import pygsheets
import datetime
import re
import pytz

In [2]:
# define the excel file to save tables in
current_time = datetime.datetime.now(pytz.timezone('US/Eastern')).strftime("%Y-%m-%d_T%H%M%S")

In [3]:
#fuel_type = 'Gas'
fuel_type = 'Oil'
#fuel_type = 'NGL'

## import data

In [4]:
gc = pygsheets.authorize(service_account_env_var='GDRIVE_API_CREDENTIALS')
#spreadsheet = gc.open_by_key('1WaBMIdfRWqSqXUw7_cKXo3RipyhPdnNN8flqEYfMZIA') # file to use for gas pipelines Dec 2023
spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek') # CURRENT sheet

gas_pipes = spreadsheet.worksheet('title', 'Gas pipelines').get_as_df(start='A3')
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df(start='A3')

gas_pipes = gas_pipes.drop('WKTFormat', axis=1) # delete WKTFormat column
oil_pipes = oil_pipes.drop('WKTFormat', axis=1)
pipes_df_orig = oil_pipes.copy() #pandas.concat([oil_pipes, gas_pipes], ignore_index=True)

#get other relevant sheets
country_ratios_df = spreadsheet.worksheet('title', 'Country ratios by pipeline').get_as_df()
owners_df_orig = spreadsheet.worksheet('title', 'Pipeline operators/owners (1/3)').get_as_df(start='A2')

country_ratios_df = country_ratios_df.loc[country_ratios_df.Wiki!='']

# remove empty cells for pipes, owners
pipes_df_orig = pipes_df_orig.loc[pipes_df_orig['PipelineName']!='']
pipes_df_orig = pipes_df_orig.loc[pipes_df_orig['Wiki']!='']
pipes_df_orig = pipes_df_orig.loc[pipes_df_orig.Fuel==fuel_type]

owners_df_orig = owners_df_orig.loc[owners_df_orig['ProjectID']!='']
owners_df_orig = owners_df_orig.loc[owners_df_orig['Wiki']!='']
owners_df_orig = owners_df_orig.loc[owners_df_orig.Status!='N/A']

owners_df_orig.set_index('ProjectID', inplace=True)

parent_metadata_df = spreadsheet.worksheet('title', 'Parent metadata (3/3)').get_as_df(start='A3')
parent_metadata_df.set_index('Parent', inplace=True)

In [5]:
country_ratios_df.replace('--', numpy.nan, inplace=True)

owners_df_orig.replace('',numpy.nan,inplace=True)
owners_df_orig.replace('--',numpy.nan,inplace=True)

pipes_df_orig.replace('--',numpy.nan,inplace=True)

  country_ratios_df.replace('--', numpy.nan, inplace=True)
  owners_df_orig.replace('--',numpy.nan,inplace=True)
  pipes_df_orig.replace('--',numpy.nan,inplace=True)


In [6]:
region_df_orig = spreadsheet.worksheet('title', 'Country dictionary').get_as_df(start='A2')

#region_name = 'Global'; region_df_touse = region_df_orig.copy()
#region_name = 'AsiaGasTracker'; region_df_touse = region_df_orig.loc[region_df_orig.AsiaGasTracker=='Yes']
#region_name = 'EuroGasTracker'; region_df_touse = region_df_orig.loc[region_df_orig.EuroGasTracker=='Yes']
region_name = 'AfricaGasTracker'; region_df_touse = region_df_orig.loc[region_df_orig.AfricaGasTracker=='Yes']
#region_df_agt.copy()

#region_df_touse = region_df_orig.copy()

In [7]:
region_df_touse_cleaned = region_df_touse.loc[(region_df_touse.Region!='--')&
                                            (region_df_touse.SubRegion!='--')]
multiindex_region_subregion = region_df_touse_cleaned.groupby(['Region','SubRegion'])['Country'].count().index
multiindex_region_subregion

MultiIndex([('Africa',    'Northern Africa'),
            ('Africa', 'Sub-Saharan Africa')],
           names=['Region', 'SubRegion'])

## file names with regional specifics

In [8]:
if fuel_type=='Gas':
    excel_writer = pandas.ExcelWriter(region_name+'-summary-sheets-gas-pipelines-'+str(datetime.date.today())+'.xlsx')
if fuel_type=='NGL':
    excel_writer = pandas.ExcelWriter(region_name+'-summary-sheets-NGL-pipelines-'+str(datetime.date.today())+'.xlsx')
if fuel_type=='Oil':
    excel_writer = pandas.ExcelWriter(region_name+'-summary-sheets-Oil-pipelines-'+str(datetime.date.today())+'.xlsx')

### create country-specific dataframes for region, country_ratios_df, owners_df

In [9]:
country_ratios_df_touse = country_ratios_df.loc[country_ratios_df['Country'].str.contains(
                                            '|'.join(region_df_touse['Country'].tolist()))]

# owners_df_touse = owners_df_orig.loc[owners_df_orig['Countries'].str.contains(
#                                             '|'.join(region_df_touse['Country'].tolist()))]

pipes_df_touse = pipes_df_orig.loc[pipes_df_orig['Countries'].str.contains(
                                            '|'.join(region_df_touse['Country'].tolist()))]

In [10]:
country_ratios_df

Unnamed: 0,PipelineName,SegmentName,ProjectID,Country,LengthEstimateKmByCountry,LengthPerCountryFraction,Region,SubRegion,RegionOld,PipelineBubbleRegion,...,Parent,H2Status,H2Type,CancelledYear,ProposalYear,ConstructionYear,ShelvedYear,StartYearEarliest,StartCountry,EndCountry
0,Alberta Clipper Oil Pipeline,,P0001,Canada,1066.328784,0.680000,Americas,Northern America,North America,North America,...,Enbridge [100.00%],,,,,,,2010,Canada,United States
1,Alberta Clipper Oil Pipeline,,P0001,United States,512.042577,0.320000,Americas,Northern America,North America,North America,...,Enbridge [100.00%],,,,,,,2010,Canada,United States
2,Athabasca Oil Pipeline,,P0002,Canada,522.239984,1.000000,Americas,Northern America,North America,North America,...,Enbridge [88.43%]; 23 Indigenous communities [...,,,,,,,1999,Canada,Canada
3,Bakken Expansion Pipeline,,P0004,Canada,150.276903,0.595268,Americas,Northern America,North America,North America,...,Enbridge [100.00%],,,,,,,2013,United States,Canada
4,Bakken Expansion Pipeline,,P0004,United States,102.1756,0.400000,Americas,Northern America,North America,North America,...,Enbridge [100.00%],,,,,,,2013,United States,Canada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6585,Nigeria–Libya Gas Pipeline,,P7102,Niger,1241.350811,0.389414,Africa,Sub-Saharan Africa,Sub-Saharan Africa,Africa,...,,,,,2024,,,,Nigeria,Libya
6586,Pomeranian Hydrogen Cluster,SYSTEM/NETWORK INFO,P7103,Poland,561.549529,0.999882,Europe,Eastern Europe,Europe,Europe,...,,,new AND converted segments,,,,,2029,Poland,Poland
6587,Pomeranian Hydrogen Cluster,SYSTEM/NETWORK INFO,P7103,Germany,0.066229,0.000118,Europe,Western Europe,Europe,Europe,...,,,new AND converted segments,,,,,2029,Poland,Poland
6592,Malaysia Singapore Gas Pipeline,,P7105,Malaysia,35,0.500000,Asia,South-eastern Asia,SE Asia,Asia Pacific,...,,,,,,,,1991,Malaysia,Singapore


In [11]:
pipes_df_touse.head()

Unnamed: 0,PipelineNetworkGrouping,PipelineName,SegmentName,Wiki,ProjectID,Researcher,LastUpdated,Fuel,Fuel [ref],PipelineType,...,RouteAccuracy,RouteNotes,Route [ref],OtherEnglishNames,OtherLanguagePrimaryPipelineName,OtherLanguageAlternativePipelineNames,OtherLanguageSegmentName,AlternateRouteProjectIDs,CostUSDPerKm,CostEuroPerKm
158,,Haoudh El Hamra-Arzew Oil Pipeline,II,https://www.gem.wiki/Haoudh_El_Hamra-Arzew_Oil...,P0524,ZK,2023-03-20,Oil,,,...,medium,https://sonatrach.com/wp-content/uploads/2022/...,,OZ2,,,,,,
159,,Haoudh El Hamra-Skikda Oil Pipeline,,https://www.gem.wiki/Haoudh_El_Hamra-Skikda_Oi...,P0525,ZK,2023-03-20,Oil,,,...,medium,,,OK1,,,,,,
160,,Ronier–Djermaya Oil Pipeline,,https://www.gem.wiki/Ronier%E2%80%93Djermaya_O...,P0527,BL,2022-02-16,Oil,,,...,medium,https://agritrop.cirad.fr/567557/2/document_56...,,,,,,,,
161,,Chad–Cameroon Oil Pipeline,,https://www.gem.wiki/Chad%E2%80%93Cameroon_Oil...,P0528,BL,2023-04-12,Oil,,,...,medium,,,,,,,,3925233.645,3601131.784
162,,Sumed Oil Pipeline,Pipeline 1,https://www.gem.wiki/Sumed_Oil_Pipeline,P0530,,2023-04-26,Oil,,,...,medium,,,Suez-Mediterranean Pipeline,خط أنابيب سوميد ، خط أنابيب البحر المتوسط-السويس,,,,,


### sum LengthMergedKmByCountry and MergedKmByRegion

In [12]:
status_list = ['proposed', 
               'construction', 
               'shelved', 
               'cancelled', 
               'operating', 
               'idle', 
               'mothballed', 
               'retired']
country_list = sorted(list(set(country_ratios_df_touse['Country'])))
region_list = sorted(list(set(country_ratios_df_touse['Region'])))

In [13]:
excel_status_list = ['proposed', 
                     'construction', 
                     'in development (proposed + construction)', 
                     'shelved', 
                     'cancelled', 
                     'operating', 
                     'idle', 
                     'mothballed', 
                     'retired']
excel_status_list_with_countries = ['Country']+excel_status_list

In [14]:
country_ratios_df_subset_status

NameError: name 'country_ratios_df_subset_status' is not defined

In [15]:
country_ratios_df_subset = country_ratios_df_touse.loc[country_ratios_df_touse['Fuel']==fuel_type]

km_by_country = pandas.DataFrame(columns=status_list, index=country_list)
km_by_region = pandas.DataFrame(columns=status_list, index=multiindex_region_subregion)

print('===country-level calculations===')
for status in status_list:
    print(status)
    country_ratios_df_subset_status = country_ratios_df_subset[country_ratios_df_subset['Status']==status]
    km_by_country[status] = country_ratios_df_subset_status.groupby('Country')['LengthMergedKmByCountry'].sum()

print('===regional calculations===')
for status in status_list:
    print(status)
    country_ratios_df_subset_status = country_ratios_df_subset[country_ratios_df_subset['Status']==status]
    km_by_region[status] = country_ratios_df_subset_status.groupby(['Region','SubRegion'])['LengthMergedKmByCountry'].sum()

# fille NaN with 0.0
km_by_region = km_by_region.fillna(0)
km_by_country = km_by_country.fillna(0)

km_by_region['in development (proposed + construction)'] = km_by_region[['proposed','construction']].sum(axis=1)
km_by_country['in development (proposed + construction)'] = km_by_country[['proposed','construction']].sum(axis=1)

km_by_country = km_by_country[excel_status_list]
km_by_region = km_by_region[excel_status_list]

km_by_region.index.names = ['Region','Subregion']
km_by_country.index.name = 'Country'

km_by_region.loc['Total',:] = km_by_region.sum(axis=0).values
km_by_country.loc['Total',:] = km_by_country.sum(axis=0).values

# drop all-zero rows
km_by_country = km_by_country.loc[~(km_by_country==0).all(axis=1)]

km_by_country.replace(0,'',inplace=True)
km_by_region.replace(0,'',inplace=True)

km_by_region.to_excel(excel_writer, 'Kilometers by region')
km_by_country.to_excel(excel_writer, 'Kilometers by country')

===country-level calculations===
proposed
construction
shelved
cancelled
operating
idle
mothballed
retired
===regional calculations===
proposed
construction
shelved
cancelled
operating
idle
mothballed
retired


  km_by_region.to_excel(excel_writer, 'Kilometers by region')
  km_by_country.to_excel(excel_writer, 'Kilometers by country')


In [16]:
km_by_region

Unnamed: 0_level_0,Unnamed: 1_level_0,proposed,construction,in development (proposed + construction),shelved,cancelled,operating,idle,mothballed,retired
Region,Subregion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Africa,Northern Africa,68.5,100.0,168.5,488.0,620.04,16288.8,,,
Africa,Sub-Saharan Africa,6481.26,1950.0,8431.26,498.13,2100.0,7457.94,,,
Total,,6549.76,2050.0,8599.76,986.13,2720.04,23746.74,,,


## pipeline km by parent company (owner) and project status

### first check that there are no missing projectids

### pipeline km by start year, type

In [17]:
pipes_started = pipes_df_touse.copy()
#pipes_started['StartYearLatest'].replace(numpy.nan,'',inplace=True)

if fuel_type == 'Gas':
    pipes_started = pipes_started[(pipes_started['Status'].isin(['Operating'])) &
                              (pipes_started['Fuel']=='Gas')]
if fuel_type == 'Oil':
    pipes_started = pipes_started[(pipes_started['Status'].isin(['Operating'])) &
                              (pipes_started['Fuel']=='Oil')]
if fuel_type == 'NGL':
    pipes_started = pipes_started[(pipes_started['Status'].isin(['Operating'])) &
                              (pipes_started['Fuel']=='NGL')]

pipes_started_sum = pipes_started.groupby('StartYearEarliest')['LengthMergedKm'].sum()

In [18]:
if fuel_type == 'Gas':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2024)), columns=['Gas pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['Gas pipeline km'] = pipes_started_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

if fuel_type == 'Oil':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2024)), columns=['Oil pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['Oil pipeline km'] = pipes_started_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

if fuel_type == 'NGL':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2024)), columns=['NGL pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['NGL pipeline km'] = pipes_started_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

km_by_start_year.loc['Total',:] = km_by_start_year.sum(axis=0)

km_by_start_year.to_excel(excel_writer, 'Kilometers by start year')
#km_by_start_year

  km_by_start_year.to_excel(excel_writer, 'Kilometers by start year')


## save excel file

In [19]:
excel_writer.close()

## calculating stats for landing page

In [23]:
# number of projects tracked in total
print(pipes_df_touse.loc[pipes_df_touse.Fuel==fuel_type].shape[0], fuel_type+' pipeline projects tracked')
print(pipes_df_touse.loc[pipes_df_touse.Fuel==fuel_type]['LengthMergedKm'].sum()/1e6, 'M km tracked')

109 Oil pipeline projects tracked
0.03631758 M km tracked


In [24]:
# number of projects tracked in total
print(pipes_df_touse.loc[(pipes_df_touse.Fuel==fuel_type)&
                        (pipes_df_touse.Status.isin(['proposed','construction']))].shape[0], fuel_type+' pipeline projects tracked')
print(pipes_df_touse.loc[(pipes_df_touse.Fuel==fuel_type)&
                        (pipes_df_touse.Status.isin(['proposed','construction']))]['LengthMergedKm'].sum()/1e3, 'K km tracked')

8 Oil pipeline projects tracked
8.59976 K km tracked
