In [1]:
import pandas
import numpy
import pygsheets
import datetime
import re
import pytz

In [2]:
# define the excel file to save tables in
current_time = datetime.datetime.now(pytz.timezone('US/Eastern')).strftime("%Y-%m-%d_T%H%M%S")
excel_writer = pandas.ExcelWriter('AsiaGasTracker-Pipelines-SummarySheets-'+current_time+'.xlsx')

import Pipelines_Current dataset

In [3]:
credentials_directory = '/Users/baird/Dropbox/_google-api/'
gc = pygsheets.authorize(service_account_env_var='GDRIVE_API_CREDENTIALS')
#spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek')
#spreadsheet = gc.open_by_key('1jP1GJbl-BjLbwb9BTblt0EQQr2hyMGNgdVL4I9_wQWQ') # January 2022 pipelines
#spreadsheet = gc.open_by_key('1bfPrp0w8Ruorq08Qe4hD8M3xVJ5e00phZ6ApFivO98k') # December 2022 pipelines
spreadsheet = gc.open_by_key('1LuTFPdSx5QmzNyc-mjjfM1GelPZ586yC-UXNAhnybJ4') # May 2023 version of December 2022 pipelines

gas_pipes = spreadsheet.worksheet('title', 'Gas pipelines').get_as_df(start='A2')
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df(start='A2')

pipes_df_orig = pandas.concat([oil_pipes, gas_pipes], ignore_index=True)

#get other relevant sheets
country_ratios_df = spreadsheet.worksheet('title', 'Country ratios by pipeline').get_as_df()
owners_df_orig = spreadsheet.worksheet('title', 'Pipeline operators/owners (1/3)').get_as_df(start='A2')

# remove empty cells for pipes, owners
pipes_df_orig = pipes_df_orig[pipes_df_orig['PipelineName']!='']
owners_df_orig = owners_df_orig[owners_df_orig['Wiki']!='']
owners_df_orig.set_index('ProjectID', inplace=True)

In [4]:
country_ratios_df.replace('--', numpy.nan, inplace=True)

owners_df_orig.replace('',numpy.nan,inplace=True)
owners_df_orig.replace('--',numpy.nan,inplace=True)

pipes_df_orig.replace('--',numpy.nan,inplace=True)

In [5]:
region_df_orig = spreadsheet.worksheet('title', 'Country dictionary').get_as_df(start='A2')
region_df_africa = region_df_orig.loc[region_df_orig['AsiaGasTracker']=='Yes']
region_df_touse = region_df_africa.copy()

### create region-specific dataframes for region, country_ratios_df, owners_df

In [6]:
country_ratios_df_touse = country_ratios_df.loc[country_ratios_df['Country'].isin(
                                            region_df_touse['Country'].tolist())]

owners_df_touse = owners_df_orig.loc[
    ~owners_df_orig['Countries'].apply(
        lambda x: set(x.split(', ')).isdisjoint(set(region_df_touse['Country'].tolist()))
    )]

pipes_df_touse = pipes_df_orig.loc[
    ~pipes_df_orig['Countries'].apply(
        lambda x: set(x.split(', ')).isdisjoint(set(region_df_touse['Country'].tolist()))
    )]

In [7]:
fuel_type = 'Gas'
#fuel_type = 'Oil'
#fuel_type = 'NGL'

pipes_df_touse = pipes_df_touse.loc[pipes_df_touse.Fuel==fuel_type]

### sum MergedKmByCountry and MergedKmByRegion

In [8]:
status_list = ['Proposed', 
               'Construction', 
               'Shelved', 
               'Cancelled', 
               'Operating', 
               'Idle', 
               'Mothballed', 
               'Retired']
country_list = sorted(set(region_df_touse.Country.to_list())) #sorted(list(set(country_ratios_df_touse['Country'])))
region_list = sorted(set(region_df_touse.Region.tolist())) #sorted(list(set(country_ratios_df_touse['Region'])))

In [9]:
excel_status_list = ['Proposed', 
                     'Construction', 
                     'In Development (Proposed + Construction)', 
                     'Shelved', 
                     'Cancelled', 
                     'Operating', 
                     'Idle', 
                     'Mothballed', 
                     'Retired']
excel_status_list_with_countries = ['Country']+excel_status_list

In [10]:
country_ratios_df_subset = country_ratios_df_touse.copy()[country_ratios_df_touse['Fuel']==fuel_type]

km_by_country = pandas.DataFrame(columns=status_list, index=country_list)
km_by_region = pandas.DataFrame(columns=status_list, index=region_list)

print('===country-level calculations===')
for status in status_list:
    print(status)
    country_ratios_df_subset_status = country_ratios_df_subset[country_ratios_df_subset['Status']==status]
    km_by_country[status] = country_ratios_df_subset_status.groupby('Country')['MergedKmByCountry'].sum()

print('===regional calculations===')
for status in status_list:
    print(status)
    country_ratios_df_subset_status = country_ratios_df_subset[country_ratios_df_subset['Status']==status]
    km_by_region[status] = country_ratios_df_subset_status.groupby('Region')['MergedKmByCountry'].sum()

# fille NaN with 0.0
km_by_region = km_by_region.fillna(0)
km_by_country = km_by_country.fillna(0)

km_by_region['In Development (Proposed + Construction)'] = km_by_region[['Proposed','Construction']].sum(axis=1)
km_by_country['In Development (Proposed + Construction)'] = km_by_country[['Proposed','Construction']].sum(axis=1)

km_by_country = km_by_country[excel_status_list]
km_by_region = km_by_region[excel_status_list]

km_by_region.index.name = 'Region'
km_by_country.index.name = 'Country'

totals_row = km_by_region.sum(axis=0)
totals_row.name = 'Total'
km_by_region = km_by_region.append(totals_row)

totals_row = km_by_country.sum(axis=0)
totals_row.name = 'Total'
km_by_country = km_by_country.append(totals_row)

km_by_region.replace(0,'',inplace=True)
km_by_country.replace(0,'',inplace=True)

km_by_region.to_excel(excel_writer, 'Kilometers by region')
km_by_country.to_excel(excel_writer, 'Kilometers by country')

===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired
===regional calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired


  km_by_region = km_by_region.append(totals_row)
  km_by_country = km_by_country.append(totals_row)


In [11]:
owner_parent_calculations_df = pandas.DataFrame()

for idx,row in pipes_df_touse.iterrows():
    parent_string = row.Parent
    #print(parent_string)
    parent_list = re.sub(' \[.*?\]', '', parent_string).split('; ') # all entries must have a Owner [%] syntax
    percent_list = [float(i.rstrip('%'))/100. for i in re.findall('\\d+(?:\\.\\d+)?%', parent_string)]

    if parent_list.__len__()!=percent_list.__len__():
        if percent_list==[]:
            percent_list = [1/parent_list.__len__() for i in parent_list]
        else:
            nmissing = parent_list.__len__()-percent_list.__len__()
            # distribute nans evenly
            total = numpy.nansum(percent_list)
            leftover = 1-total
            percent_list += [leftover/nmissing]*nmissing

    # how many countries is this pipeline in?
    country_ratios_df_temp = country_ratios_df.loc[country_ratios_df.ProjectID==row.ProjectID]
    #print(country_ratios_df_temp)
    country_ratios_df_temp_region = country_ratios_df.loc[(country_ratios_df.ProjectID==row.ProjectID)&
                                                          (country_ratios_df.Country.isin(region_df_touse.Country.tolist()))]
    # fraction of this pipeline that is within the regional countries we care about
    fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
    
    for p_idx,parent in enumerate(parent_list):
        for country in country_ratios_df_temp_region.Country.tolist():
            merged_km_by_country = country_ratios_df.loc[(country_ratios_df.ProjectID==row.ProjectID)&
                                                         (country_ratios_df.Country==country),'MergedKmByCountry'].values[0]
            owner_parent_calculations_df = pandas.concat([owner_parent_calculations_df, 
                                                          pandas.DataFrame([{'Parent':parent, 'ProjectID':row.ProjectID, 
                                                                             'FractionOwnership':percent_list[p_idx],
                                                                             'LengthMergedKm':row.LengthMergedKm,
                                                                             'FractionInRegion':fraction_in_region,
                                                                             'MergedKmByCountry':merged_km_by_country,
                                                                             'Country':country,
                                                                             'Status':row.Status,
                                                                             'KmOwnershipInCountry':percent_list[p_idx]*merged_km_by_country}])])

owner_parent_calculations_df['KmOwnership'] = owner_parent_calculations_df['FractionOwnership']*owner_parent_calculations_df['LengthMergedKm']
owner_parent_calculations_df['KmOwnershipInRegion'] = owner_parent_calculations_df['FractionOwnership']*owner_parent_calculations_df['LengthMergedKm']*owner_parent_calculations_df['FractionInRegion']

  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fraction_in_region = country_ratios_df_temp_region.MergedKmByCountry.sum()/country_ratios_df_temp.MergedKmByCountry.sum()
  fracti

In [12]:
owners_km_by_status_df = owner_parent_calculations_df.groupby(['Parent','Country','Status'])[['KmOwnershipInCountry']].sum()

owners_km_by_status_df = owner_parent_calculations_df.groupby(['Parent','Country','Status']).agg(
    {'KmOwnershipInCountry':['sum']}).unstack().droplevel(axis=1, level=[0,1])

owners_km_by_status_df = owners_km_by_status_df.reindex(columns=status_list)
owners_km_by_status_df = owners_km_by_status_df.reset_index().set_index('Parent')
owners_km_by_status_df.columns.name = None

owners_km_by_status_df['In Development (Proposed + Construction)'] = \
    owners_km_by_status_df[['Construction','Proposed']].sum(min_count=1, axis=1)

# rearrange the order of the columns for output
owners_km_by_status_df = owners_km_by_status_df[excel_status_list_with_countries]

totals_row = owners_km_by_status_df.sum(axis=0, min_count=0)
totals_row.name = 'Total'
owners_km_by_status_df = owners_km_by_status_df.append(totals_row)
owners_km_by_status_df.loc['Total','Country'] = ''

owners_km_by_status_df = owners_km_by_status_df.replace(numpy.nan, '')
owners_km_by_status_df = owners_km_by_status_df.replace(0, '')

# missing_columns = set(status_list)-set(owners_km_by_status_df.columns)
# for col in missing_columns:
#     owners_km_by_status_df[col] = 0

owners_km_by_status_df.to_excel(excel_writer, sheet_name='Pipeline km by company and country')

  owners_km_by_status_df = owners_km_by_status_df.append(totals_row)


### pipeline km by start year, type

In [13]:
pipes_started_eu = pipes_df_touse.copy()
#pipes_started_eu['StartYearLatest'].replace(numpy.nan,'',inplace=True)

pipes_started_eu = pipes_started_eu[(pipes_started_eu['Status'].isin(['Operating'])) &
                              (pipes_started_eu['Fuel']=='Gas')]
pipes_started_eu_sum = pipes_started_eu.groupby('StartYearEarliest')['LengthMergedKm'].sum()

In [14]:
km_by_start_year = pandas.DataFrame(index=list(range(1980,2023)), columns=['Gas pipeline km'])
km_by_start_year.index.name = 'Start year'

km_by_start_year['Gas pipeline km'] = pipes_started_eu_sum
km_by_start_year.replace(numpy.nan,0,inplace=True)

totals_row = km_by_start_year.sum(axis=0)
totals_row.name = 'Total'
km_by_start_year = km_by_start_year.append(totals_row)

km_by_start_year.replace(0,'',inplace=True)

km_by_start_year.to_excel(excel_writer, 'Kilometers by start year')

  km_by_start_year = km_by_start_year.append(totals_row)


## save excel file

In [15]:
excel_writer.save()

  excel_writer.save()
