# Notes

Code for producing GOIT pipelines summary stats and for calculating landing-page stats.

The output is saved as an Excel file, which Baird copies and pastes into the existing summary tables on the drive here:
https://docs.google.com/spreadsheets/d/1OYH6D7c-D0FsL5GzBGijtkmvQCTkBUclj-UVoOieUFo/edit

In [3]:
import pandas
import numpy
import pygsheets
import datetime
import re
import pytz

In [4]:
# Timestamp string (YYYY-MM-DD_THHMMSS, US/Eastern) for the current run.
# NOTE(review): presumably intended for naming the Excel output file; no use
# of it is visible in this section -- confirm.
current_time = datetime.datetime.now(
    tz=pytz.timezone('US/Eastern'),
).strftime("%Y-%m-%d_T%H%M%S")

## import data

In [6]:
# Fuel used to filter the pipelines table (matched against its 'Fuel' column).
# Exactly one assignment should be active; the alternatives are kept below.
fuel_type = 'Gas'
# fuel_type = 'Oil'
# fuel_type = 'NGL'

In [7]:
# Authorise against Google Sheets with the service-account credentials held in
# the GDRIVE_API_CREDENTIALS environment variable.
gc = pygsheets.authorize(service_account_env_var='GDRIVE_API_CREDENTIALS')

# Workbook keys used previously, kept for reference:
#   '1WaBMIdfRWqSqXUw7_cKXo3RipyhPdnNN8flqEYfMZIA'  -- file to use for gas pipelines Dec 2023
#   '1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek'  -- CURRENT sheet
spreadsheet = gc.open_by_key('1-BGgR3XYqrS52UBtvfWnOuAwyw9QCT2gNMNDPjbZus4')

# Pipeline tabs are read starting at cell A3.
gas_pipes = spreadsheet.worksheet('title', 'Gas pipelines').get_as_df(start='A3')
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df(start='A3')

# The WKTFormat column is not needed for the statistics, so drop it from both.
gas_pipes = gas_pipes.drop(columns='WKTFormat')
oil_pipes = oil_pipes.drop(columns='WKTFormat')

# Only the gas tab feeds pipes_df_orig; oil_pipes is loaded but not merged in
# (previously: pandas.concat([oil_pipes, gas_pipes], ignore_index=True)).
pipes_df_orig = gas_pipes.copy()

# Other worksheets used further down.
country_ratios_df = spreadsheet.worksheet('title', 'Country ratios by pipeline').get_as_df()
owners_df_orig = spreadsheet.worksheet(
    'title', 'Pipeline operators/owners (1/3)'
).get_as_df(start='A2')

# Keep only country-ratio rows that have a Wiki entry.
country_ratios_df = country_ratios_df.loc[country_ratios_df['Wiki'] != '']

# Pipelines: require a name and a Wiki entry, and keep only the selected fuel.
pipes_df_orig = pipes_df_orig.loc[
    (pipes_df_orig['PipelineName'] != '')
    & (pipes_df_orig['Wiki'] != '')
    & (pipes_df_orig['Fuel'] == fuel_type)
]

# Owners: require a ProjectID and a Wiki entry, and skip rows with Status 'N/A'.
owners_df_orig = owners_df_orig.loc[
    (owners_df_orig['ProjectID'] != '')
    & (owners_df_orig['Wiki'] != '')
    & (owners_df_orig['Status'] != 'N/A')
]
owners_df_orig.set_index(keys='ProjectID', inplace=True)

# Parent metadata, indexed by the 'Parent' column.
parent_metadata_df = spreadsheet.worksheet('title', 'Parent metadata (3/3)').get_as_df(start='A2')
parent_metadata_df.set_index(keys='Parent', inplace=True)

In [8]:
# Convert the '--' placeholder to NaN in all three tables (modified in place).
country_ratios_df.replace(to_replace='--', value=numpy.nan, inplace=True)

# The owners table additionally has its empty-string cells converted to NaN.
owners_df_orig.replace(to_replace='', value=numpy.nan, inplace=True)
owners_df_orig.replace(to_replace='--', value=numpy.nan, inplace=True)

pipes_df_orig.replace(to_replace='--', value=numpy.nan, inplace=True)

  country_ratios_df.replace('--', numpy.nan, inplace=True)
  owners_df_orig.replace('--',numpy.nan,inplace=True)
  pipes_df_orig.replace('--',numpy.nan,inplace=True)


In [11]:
# Load the 'Country dictionary' worksheet, reading from cell A2.
region_df_orig = spreadsheet.worksheet('title', 'Country dictionary').get_as_df(start='A2')

# Regional scope for this run. 'Global' uses the whole dictionary; the
# alternatives below restrict it to rows flagged 'Yes' for a regional tracker.
region_name = 'Global'
region_df_touse = region_df_orig.copy()
# region_name = 'AsiaGasTracker'; region_df_touse = region_df_orig.loc[region_df_orig.AsiaGasTracker=='Yes']
# region_name = 'EuroGasTracker'; region_df_touse = region_df_orig.loc[region_df_orig.EuroGasTracker=='Yes']
# region_name = 'AfricaGasTracker'; region_df_touse = region_df_orig.loc[region_df_orig.AfricaGasTracker=='Yes']

In [224]:
# Region dictionary rows whose Region and SubRegion are both real values
# (i.e. not the '--' placeholder).
region_df_touse_cleaned = region_df_touse.loc[(region_df_touse.Region!='--')&
                                            (region_df_touse.SubRegion!='--')]

# Build the gas-only country-ratios table here, before its first use. It was
# previously only defined in a later cell, so a fresh top-to-bottom run failed
# with a NameError on the groupby calls below. The definition matches the later
# one (gas rows only, empty strings turned into NaN), so re-running that cell
# is harmless.
country_ratios_df_gas = country_ratios_df.loc[country_ratios_df.Fuel=='Gas'].replace('', numpy.nan)

# Distinct (Region, SubRegion) and (Region, Country) combinations present in
# the gas data; groupby drops rows whose key is NaN.
multiindex_region_subregion = country_ratios_df_gas.groupby(['Region','SubRegion'])['Country'].count().index
multiindex_region_country = country_ratios_df_gas.groupby(['Region','Country'])['Country'].count().index

# Display the (Region, SubRegion) index for inspection.
multiindex_region_subregion

MultiIndex([(  'Africa',                 'Northern Africa'),
            (  'Africa',              'Sub-Saharan Africa'),
            ('Americas', 'Latin America and the Caribbean'),
            ('Americas',                'Northern America'),
            (    'Asia',                    'Central Asia'),
            (    'Asia',                    'Eastern Asia'),
            (    'Asia',              'South-eastern Asia'),
            (    'Asia',                   'Southern Asia'),
            (    'Asia',                    'Western Asia'),
            (  'Europe',                  'Eastern Europe'),
            (  'Europe',                 'Northern Europe'),
            (  'Europe',                 'Southern Europe'),
            (  'Europe',                  'Western Europe'),
            ( 'Oceania',       'Australia and New Zealand'),
            ( 'Oceania',                       'Melanesia')],
           names=['Region', 'SubRegion'])

In [225]:
# Years covered by the yearly statistics: 2019 through 2023 inclusive.
year_list = [*range(2019, 2024)]

In [226]:
# (Region, Country) pairs, read from the two levels of multiindex_region_country.
multiindex_region_country_tuples = list(zip(
    multiindex_region_country.get_level_values(0),
    multiindex_region_country.get_level_values(1),
))

# Cross every (Region, Country) pair with every year in year_list, giving
# (Region, Country, Year) tuples with the years varying fastest.
all_region_country_year_tuples = [
    (region, country, yr)
    for region, country in multiindex_region_country_tuples
    for yr in year_list
]

In [227]:
# Gas-only rows of the country-ratios table, with empty strings turned into NaN.
country_ratios_df_gas = (
    country_ratios_df.loc[country_ratios_df['Fuel'] == 'Gas']
    .replace('', numpy.nan)
)

  country_ratios_df_gas = country_ratios_df_gas.replace('',numpy.nan)


In [228]:
# Proposed pipelines: total LengthMergedKmByCountry per (Region, Country, Year),
# where Year is taken from the ProposalYear column.
df1 = (
    country_ratios_df_gas[country_ratios_df_gas['Status'] == 'proposed']
    .rename(columns={'ProposalYear': 'Year'})
    .groupby(['Region', 'Country', 'Year'])[['LengthMergedKmByCountry']]
    .sum()
    .rename(columns={'LengthMergedKmByCountry': 'Proposed km'})
)

In [229]:
# Pipelines under construction: total LengthMergedKmByCountry per
# (Region, Country, Year), where Year is taken from the ConstructionYear column.
df2 = (
    country_ratios_df_gas[country_ratios_df_gas['Status'] == 'construction']
    .rename(columns={'ConstructionYear': 'Year'})
    .groupby(['Region', 'Country', 'Year'])[['LengthMergedKmByCountry']]
    .sum()
    .rename(columns={'LengthMergedKmByCountry': 'Construction km'})
)

In [230]:
# Operating pipelines: total LengthMergedKmByCountry per (Region, Country, Year),
# where Year is taken from the StartYearEarliest column.
df3 = (
    country_ratios_df_gas[country_ratios_df_gas['Status'] == 'operating']
    .rename(columns={'StartYearEarliest': 'Year'})
    .groupby(['Region', 'Country', 'Year'])[['LengthMergedKmByCountry']]
    .sum()
    .rename(columns={'LengthMergedKmByCountry': 'Operating km'})
)

In [231]:
# Shelved pipelines: total LengthMergedKmByCountry per (Region, Country, Year),
# where Year is taken from the ShelvedYear column.
df4 = (
    country_ratios_df_gas[country_ratios_df_gas['Status'] == 'shelved']
    .rename(columns={'ShelvedYear': 'Year'})
    .groupby(['Region', 'Country', 'Year'])[['LengthMergedKmByCountry']]
    .sum()
    .rename(columns={'LengthMergedKmByCountry': 'Shelved km'})
)

In [232]:
# Cancelled pipelines: total LengthMergedKmByCountry per (Region, Country, Year),
# where Year is taken from the CancelledYear column.
df5 = (
    country_ratios_df_gas[country_ratios_df_gas['Status'] == 'cancelled']
    .rename(columns={'CancelledYear': 'Year'})
    .groupby(['Region', 'Country', 'Year'])[['LengthMergedKmByCountry']]
    .sum()
    .rename(columns={'LengthMergedKmByCountry': 'Cancelled km'})
)

In [233]:
# Combine the five per-status tables side by side on their shared
# (Region, Country, Year) index.
# DataFrame.join defaults to how='left', which would keep only the keys present
# in df1 (proposed) and silently drop construction/operating/shelved/cancelled
# km for any country-year that has no proposed km. Outer joins keep every key
# that appears in any of the five tables; missing cells become NaN.
all_dfs = (
    df1
    .join(df2, how='outer')
    .join(df3, how='outer')
    .join(df4, how='outer')
    .join(df5, how='outer')
)

In [234]:
# Full (Region, Country, Year) index used to reindex the combined table so that
# every country has one row per year in year_list.
# The tuples are built as (Region, Country, Year), so the level names must be
# given in that same order; naming them ['Country', 'Region', 'Year'] labelled
# the region values as "Country" and the country values as "Region" in the
# exported columns.
new_multiindex = pandas.MultiIndex.from_tuples(
    all_region_country_year_tuples,
    names=['Region', 'Country', 'Year'],
)




In [235]:
# Display the index for a visual check (bare expression; shown as cell output).
new_multiindex

MultiIndex([( 'Africa',          'Algeria', 2019),
            ( 'Africa',          'Algeria', 2020),
            ( 'Africa',          'Algeria', 2021),
            ( 'Africa',          'Algeria', 2022),
            ( 'Africa',          'Algeria', 2023),
            ( 'Africa',           'Angola', 2019),
            ( 'Africa',           'Angola', 2020),
            ( 'Africa',           'Angola', 2021),
            ( 'Africa',           'Angola', 2022),
            ( 'Africa',           'Angola', 2023),
            ...
            ('Oceania',      'New Zealand', 2019),
            ('Oceania',      'New Zealand', 2020),
            ('Oceania',      'New Zealand', 2021),
            ('Oceania',      'New Zealand', 2022),
            ('Oceania',      'New Zealand', 2023),
            ('Oceania', 'Papua New Guinea', 2019),
            ('Oceania', 'Papua New Guinea', 2020),
            ('Oceania', 'Papua New Guinea', 2021),
            ('Oceania', 'Papua New Guinea', 2022),
            ('O

In [236]:
# Align the combined table to the full index, turn the index levels back into
# ordinary columns, blank out missing values, and write the result to Excel
# without the row index.
(
    all_dfs
    .reindex(new_multiindex)
    .reset_index()
    .replace(to_replace=numpy.nan, value='')
    .to_excel(excel_writer='GGIT-yearly-stats-gas-pipelines.xlsx', index=False)
)