# Notes

Code for producing the GOIT pipeline summary statistics and for calculating the landing-page statistics.

The summary tables are saved as an Excel file, which Baird then copies and pastes into the existing summary-tables spreadsheet on the drive here:
https://docs.google.com/spreadsheets/d/1OYH6D7c-D0FsL5GzBGijtkmvQCTkBUclj-UVoOieUFo/edit

In [20]:
import pandas
pandas.set_option("display.max_rows", 50, "display.max_columns", 50)

import numpy
import pygsheets
import datetime
import re

In [21]:
# Fuel whose summary workbook this run produces; enable exactly one line.
#fuel_type = 'Gas'
fuel_type = 'Oil'
#fuel_type = 'NGL'

# One dated workbook per fuel: GOIT-Summary-Sheets-<fuel>-<YYYY-MM-DD>.xlsx.
# No writer is created for any other fuel_type value.
if fuel_type in ('Gas', 'NGL', 'Oil'):
    excel_writer = pandas.ExcelWriter(
        f'GOIT-Summary-Sheets-{fuel_type}-{datetime.date.today()}.xlsx',
        engine='xlsxwriter',
    )

## import data

In [22]:
# Authorise against Google Sheets with the client secret stored in a
# machine-specific directory, then open the source spreadsheet by key.
credentials_directory = '/Users/baird/Dropbox/_google-api/'
gc = pygsheets.authorize(client_secret=credentials_directory+'client_secret.json')
#spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek')
spreadsheet = gc.open_by_key('1IQ_g9PCr6pexDvEBoi5jboNfamJtPQ63ezxyq0qRsu0') # file to use for June 2022

# Both pipeline sheets are read from cell A2 onwards; the WKTFormat column
# is dropped from each before they are stacked (oil/NGL first, then gas).
gas_pipes = spreadsheet.worksheet('title', 'Gas pipelines').get_as_df(start='A2')
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df(start='A2')

gas_pipes = gas_pipes.drop(columns='WKTFormat')
oil_pipes = oil_pipes.drop(columns='WKTFormat')
pipes_df_orig = pandas.concat([oil_pipes, gas_pipes], ignore_index=True)

# other sheets used further down
country_ratios_df = spreadsheet.worksheet('title', 'Country ratios by pipeline').get_as_df()
owners_df_orig = spreadsheet.worksheet('title', 'Pipeline operators/owners (1/3)').get_as_df(start='A2')

# keep only pipeline rows that have both a name and a wiki entry
pipes_have_name = pipes_df_orig['PipelineName']!=''
pipes_have_wiki = pipes_df_orig['Wiki']!=''
pipes_df_orig = pipes_df_orig[pipes_have_name & pipes_have_wiki]

# keep only owner rows that have a project id, a wiki entry and a status other than 'N/A'
owners_have_id = owners_df_orig['ProjectID']!=''
owners_have_wiki = owners_df_orig['Wiki']!=''
owners_have_status = owners_df_orig.Status!='N/A'
owners_df_orig = owners_df_orig[owners_have_id & owners_have_wiki & owners_have_status]

owners_df_orig = owners_df_orig.set_index('ProjectID')

In [23]:
# Turn the sheets' placeholder cells into NaN: '--' in all three tables,
# and additionally the empty string in the owners table.
country_ratios_df = country_ratios_df.replace('--', numpy.nan)
owners_df_orig = owners_df_orig.replace(['', '--'], numpy.nan)
pipes_df_orig = pipes_df_orig.replace('--', numpy.nan)

In [24]:
# Load the 'Region dictionary' sheet as a DataFrame.
region_sheet = spreadsheet.worksheet('title', 'Region dictionary')
region_df_orig = region_sheet.get_as_df()

In [25]:
# Regional slices of the region dictionary.
in_europe = region_df_orig['Region']=='Europe'
is_uk_or_israel = region_df_orig['Country'].isin(['United Kingdom','Israel'])

region_df_eu = region_df_orig[region_df_orig['EuropeanUnion']=='Yes']
region_df_egt = region_df_orig[region_df_orig['EuroGasTracker']=='Yes']
region_df_europe = region_df_orig[in_europe]
# NOTE(review): despite the name, this selects Region == 'Europe' plus the
# United Kingdom and Israel -- confirm the name/selection is intended.
region_df_eu_uk = region_df_orig[in_europe | is_uk_or_israel]
#region_df_global = region_df_orig.copy()

In [26]:
# Region table used for this run: an independent copy of the full dictionary.
region_df_touse = region_df_orig.copy(deep=True)

### create country-specific dataframes for region, country_ratios_df, owners_df

In [27]:
# Working copies of the three source tables. No regional restriction is
# applied; a filter of the form
#   .loc[df[<country column>].str.contains('|'.join(region_df_touse['Country'].tolist()))]
# (with 'Country' for the ratios table and 'Countries' for owners/pipes)
# is currently disabled.
country_ratios_df_touse = country_ratios_df.copy(deep=True)
owners_df_touse = owners_df_orig.copy(deep=True)
pipes_df_touse = pipes_df_orig.copy(deep=True)

### sum MergedKmByCountry by country and by region

In [28]:
# Project statuses, in the order the per-status calculations iterate over them.
status_list = [
    'Proposed',
    'Construction',
    'Shelved',
    'Cancelled',
    'Operating',
    'Idle',
    'Mothballed',
    'Retired',
]

# Distinct countries / regions present in the country-ratio table, sorted.
country_list = sorted(set(country_ratios_df_touse['Country']))
region_list = sorted(set(country_ratios_df_touse['Region']))

In [29]:
# Column order used for the exported sheets: the individual statuses plus a
# combined "In Development" column placed right after its two components.
excel_status_list = [
    'Proposed',
    'Construction',
    'In Development (Proposed + Construction)',
    'Shelved',
    'Cancelled',
    'Operating',
    'Idle',
    'Mothballed',
    'Retired',
]

In [30]:
# Rows of the country-ratio table for the fuel selected in this run.
country_ratios_df_subset = country_ratios_df_touse[country_ratios_df_touse['Fuel']==fuel_type].copy()


def _km_by_status_table(group_column, group_values):
    """Sum MergedKmByCountry per status for each value of ``group_column``.

    Parameters:
        group_column: column of ``country_ratios_df_subset`` to group by
            ('Country' or 'Region'); also used as the index name.
        group_values: index labels of the result (one row per label).

    Returns:
        DataFrame with the columns of ``excel_status_list`` and a final
        'Total' row holding the column sums. A label with no rows for a
        given status gets 0 in that column.
    """
    table = pandas.DataFrame(columns=status_list, index=group_values)
    for status in status_list:
        print(status)
        status_rows = country_ratios_df_subset[country_ratios_df_subset['Status']==status]
        # assignment aligns on the index, leaving NaN where a label has no rows
        table[status] = status_rows.groupby(group_column)['MergedKmByCountry'].sum()

    # fill NaN with 0.0
    table = table.fillna(0)
    table['In Development (Proposed + Construction)'] = table[['Proposed','Construction']].sum(axis=1)

    # put the columns in the order used for the Excel output
    table = table[excel_status_list]
    table.index.name = group_column

    totals_row = table.sum(axis=0)
    totals_row.name = 'Total'
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame
    # instead, carrying the index name over so it is not lost.
    return pandas.concat([table, totals_row.to_frame().T.rename_axis(group_column)])


print('===country-level calculations===')
km_by_country = _km_by_status_table('Country', country_list)

print('===regional calculations===')
km_by_region = _km_by_status_table('Region', region_list)

# sheet_name is passed by keyword: positional use is deprecated in recent pandas
km_by_region.to_excel(excel_writer, sheet_name='Kilometers by region')
km_by_country.to_excel(excel_writer, sheet_name='Kilometers by country')

===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired
===regional calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired


## pipeline km by parent company (owner) and project status

In [31]:
# " [..]" annotation that follows an owner name, e.g. " [50%]"
_BRACKET_PATTERN = re.compile(r' \[.*?\]')
# a percentage such as "50%" or "12.5%"
_PERCENT_PATTERN = re.compile(r'\d+(?:\.\d+)?%')


def _split_parent_ownership(parent_string):
    """Split a '; '-separated Parent string into (name, fraction) pairs.

    Each entry is expected to look like ``Owner [NN%]``. The percentage is
    read from the same entry as the owner name, so a stated share always
    stays attached to its own owner. Entries without a percentage share
    whatever is left of 100% equally (if no entry states a percentage,
    every owner gets 1/len).

    Assumes the bracketed annotations themselves do not contain '; '.
    """
    names = []
    fractions = []
    for entry in parent_string.split('; '):
        names.append(_BRACKET_PATTERN.sub('', entry))
        match = _PERCENT_PATTERN.search(entry)
        fractions.append(float(match.group().rstrip('%'))/100. if match else None)

    n_missing = fractions.count(None)
    if n_missing:
        stated_total = sum(f for f in fractions if f is not None)
        # distribute the unstated remainder evenly
        fill_value = (1-stated_total)/n_missing
        fractions = [fill_value if f is None else f for f in fractions]
    return list(zip(names, fractions))


# One record per (country-ratio row, parent company); collected in a list and
# turned into a DataFrame once, instead of concatenating inside the loop.
# needs country, km in each country columns as well
ownership_records = []
for _, row in country_ratios_df_subset.iterrows():
    for parent, fraction in _split_parent_ownership(row.Parent):
        ownership_records.append({'Parent':parent, 'ProjectID':row.ProjectID,
                                  'FractionOwnership':fraction,
                                  'Country':row.Country,
                                  'Status':row.Status,
                                  'MergedKmByCountry':row.MergedKmByCountry})

# explicit columns keep the frame usable even when there are no records
owner_parent_calculations_df = pandas.DataFrame(
    ownership_records,
    columns=['Parent', 'ProjectID', 'FractionOwnership', 'Country', 'Status', 'MergedKmByCountry'],
)

# km attributed to each parent = ownership share x km of the pipeline in that country
owner_parent_calculations_df['KmOwnership'] = (
    owner_parent_calculations_df.FractionOwnership*owner_parent_calculations_df.MergedKmByCountry
)

In [32]:
unique_owner_list = owner_parent_calculations_df.Parent.sort_values().unique().tolist()

##################################################
# create km count by owner, status
##################################################
owners_km_by_status_df = pandas.DataFrame(0.0, index=unique_owner_list, columns=status_list)

for status in status_list:
    # Each ownership row already carries the Status of the country-ratio row
    # it was built from, so filter on it directly (no ProjectID round trip
    # that could pull in rows of the same project with another status).
    status_rows = owner_parent_calculations_df.loc[owner_parent_calculations_df.Status==status]
    # Assigning a Series aligns on the owner index: owners with no rows for
    # this status become NaN (written out as '--' below).
    owners_km_by_status_df[status] = status_rows.groupby('Parent', dropna=False)['KmOwnership'].sum(min_count=0)

owners_km_by_status_df.index.name = 'Parent Company'
# min_count=1 keeps NaN when an owner has neither proposed nor construction km
owners_km_by_status_df['In Development (Proposed + Construction)'] = owners_km_by_status_df[['Proposed','Construction']].sum(axis=1, min_count=1)

# rearrange the order of the columns for output
owners_km_by_status_df = owners_km_by_status_df[excel_status_list]

totals_row = owners_km_by_status_df.sum(axis=0)
totals_row.name = 'Total'
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame
# instead, carrying the index name over so it is not lost.
owners_km_by_status_df = pandas.concat(
    [owners_km_by_status_df, totals_row.to_frame().T.rename_axis(owners_km_by_status_df.index.name)]
)

owners_km_by_status_df = owners_km_by_status_df.replace(numpy.nan, '--')
owners_km_by_status_df.to_excel(excel_writer, sheet_name='Kilometers by owner')

### pipeline km by start year, type

In [33]:
# Operating pipelines of the selected fuel; for any fuel_type other than
# 'Gas', 'Oil' or 'NGL' the table is left unfiltered.
pipes_started = pipes_df_touse.copy()

if fuel_type in ('Gas', 'Oil', 'NGL'):
    is_operating = pipes_started['Status'].isin(['Operating'])
    is_selected_fuel = pipes_started['Fuel']==fuel_type
    pipes_started = pipes_started[is_operating & is_selected_fuel]

# total merged length (km) per earliest start year
pipes_started_sum = pipes_started.groupby('StartYearEarliest')['LengthMergedKm'].sum()

In [34]:
# Column label follows the selected fuel, e.g. 'Oil pipeline km'.
km_column = fuel_type + ' pipeline km'

# One row per start year 1980-2021; the assignment aligns pipes_started_sum
# on that index, and years without operating pipelines are set to 0.
# NOTE(review): start years outside 1980-2021 are left out of the sheet and
# of its Total row -- confirm that range is still the intended one.
km_by_start_year = pandas.DataFrame(index=list(range(1980,2022)), columns=[km_column])
km_by_start_year.index.name = 'Start year'
km_by_start_year[km_column] = pipes_started_sum
km_by_start_year = km_by_start_year.replace(numpy.nan, 0)

totals_row = km_by_start_year.sum(axis=0)
totals_row.name = 'Total'
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame
# instead, carrying the index name over so it is not lost.
km_by_start_year = pandas.concat(
    [km_by_start_year, totals_row.to_frame().T.rename_axis(km_by_start_year.index.name)]
)

# sheet_name is passed by keyword: positional use is deprecated in recent pandas
km_by_start_year.to_excel(excel_writer, sheet_name='Kilometers by start year')

## save excel file

In [35]:
# Write the workbook to disk. ExcelWriter.save() was removed in pandas 2.0;
# close() saves the file and releases the handle.
excel_writer.close()

## calculating stats for landing page

In [36]:
# number of projects tracked in total
# oil: number of projects tracked and their total merged length
_oil_rows = pipes_df_orig[pipes_df_orig['Fuel']=='Oil']
print(len(_oil_rows), 'oil pipeline projects tracked')
print(_oil_rows['LengthMergedKm'].sum(), 'km tracked')

910 oil pipeline projects tracked
402400.29 km tracked


In [37]:
# number of projects tracked in total
# NGL: number of projects tracked and their total merged length
_ngl_rows = pipes_df_orig[pipes_df_orig['Fuel']=='NGL']
print(len(_ngl_rows), 'NGL pipeline projects tracked')
print(_ngl_rows['LengthMergedKm'].sum(), 'km tracked')

61 NGL pipeline projects tracked
29052.520000000008 km tracked


In [38]:
# Per-fuel pipeline tables with empty cells turned into NaN. replace() is
# used without inplace=True: modifying a .loc selection in place triggers
# pandas' chained-assignment (SettingWithCopy) warning.
pipes_df_oil = pipes_df_orig.loc[pipes_df_orig['Fuel']=='Oil'].replace('', numpy.nan)

pipes_df_ngl = pipes_df_orig.loc[pipes_df_orig['Fuel']=='NGL'].replace('', numpy.nan)