# notes

code for producing summary tables and landing page stats for the GGIT LNG terminals update

this is saved as an Excel file, which Baird copies/pastes into the existing summary tables information on the drive here:
https://docs.google.com/spreadsheets/d/1NbEpGt2K5nY0XTSB_vlOyw9Ug8ZmvvOaRPuO9TgISIw/edit

In [1]:
import pandas
pandas.set_option("display.max_rows", 50, "display.max_columns", 500)

import numpy
import pygsheets
import re
import datetime

In [2]:
# Disable pandas' default Excel header styling so the sheets are written
# with plain header cells.
pandas.io.formats.excel.ExcelFormatter.header_style = None

# One workbook per run, with today's date in the file name; every summary
# table below is written to its own sheet of this writer.
excel_writer = pandas.ExcelWriter(
    f'GGIT-Terminals-SummarySheets-{datetime.date.today()}.xlsx',
    engine='xlsxwriter',
)

import Terminals_Current dataset

In [3]:
# Authorise against the Google Sheets API with the locally stored client secret.
credentials_directory = '/Users/baird/Dropbox/_google-api/'
gc = pygsheets.authorize(client_secret=f'{credentials_directory}client_secret.json')
spreadsheet = gc.open_by_key('1MrghwBeCz8Tzgua7CWGg_KoXKVZsV7r0kHMYHYqnNTg') # July 2022 version

# The worksheet is looked up by its title; the table is read starting at cell A2.
terms_df_orig = spreadsheet.worksheet('title', 'Terminals').get_as_df(start='A2')

# Keep only rows that have a Wiki entry and whose fuel is LNG.
terms_df_orig = terms_df_orig.loc[
    (terms_df_orig['Wiki'] != '') & (terms_df_orig['Fuel'] == 'LNG')
]

In [4]:
#owner_parent_df_orig = spreadsheet.worksheet('title', 'Owner/parent formatted').get_as_df()
#owner_parent_df_orig = pandas.read_pickle('../owner-parent-scripts/GEM-terminals-owner-parent-strings-2022-08-10.pickle')
#owner_parent_df_orig = owner_parent_df_orig.loc[owner_parent_df_orig.index.isin(terms_df_orig.ComboID)]
#
#parents_df = spreadsheet.worksheet('title', 'Parent metadata (3/3)').get_as_df(start='A2')
#parents_df = parents_df.loc[parents_df.Parent!='']

In [5]:
# Lookup table from the 'Region dictionary' tab (its 'Country' and 'Region'
# columns are used below to build the row labels of the summary tables).
region_df_orig = (
    spreadsheet
    .worksheet('title', 'Region dictionary')
    .get_as_df()
)

In [6]:
# The sheet uses '--' as a "no value" placeholder; turn it into NaN so that
# pandas aggregations skip it.
terms_df_orig.replace(to_replace='--', value=numpy.nan, inplace=True)
#owners_df_orig.replace('--', numpy.nan, inplace=True)
#owners_df_orig.replace('', numpy.nan, inplace=True)

In [7]:
# Alphabetical list of every distinct value in the 'Region' column.
region_list = sorted(set(region_df_orig['Region']))

# Alphabetical list of countries, restricted to those whose region is filled in.
countries_with_region = region_df_orig.loc[region_df_orig['Region'] != '', 'Country']
country_list = sorted(set(countries_with_region))

### use "terms_df_touse" to subset regions if necessary

In [8]:
# Independent working copy of the terminals table; apply any regional
# subsetting to this frame so that terms_df_orig stays untouched.
terms_df_touse = terms_df_orig.copy(deep=True)

In [9]:
# Project statuses, in the order they are iterated and used as table columns.
status_list = [
    'Proposed',
    'Construction',
    'Shelved',
    'Cancelled',
    'Operating',
    'Idle',
    'Mothballed',
    'Retired',
]

In [10]:
# Column order used for the Excel output: the project statuses plus a derived
# "In Development" column placed right after the two statuses it sums.
excel_status_list = [
    'Proposed',
    'Construction',
    'In Development (Proposed + Construction)',
    'Shelved',
    'Cancelled',
    'Operating',
    'Idle',
    'Mothballed',
    'Retired',
]

### no. export trains by country/region, project status

In [11]:
# Count LNG export rows (trains) per country and per region, broken down by
# project status, and write both tables to the Excel workbook.

# filter first, then copy only the rows that are kept
terms_df_subset = terms_df_touse.loc[(terms_df_touse['Import/Export']=='Export') &
                                     (terms_df_touse['Fuel']=='LNG')].copy()

ntrains_by_country = pandas.DataFrame(0, columns=status_list, index=country_list)
ntrains_by_region = pandas.DataFrame(0, columns=status_list, index=region_list)

print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    # column assignment aligns on the index: countries without any row in
    # this status become NaN (filled with 0 below)
    ntrains_by_country[status] = terms_df_subset_status.groupby('Country').size()

# NOTE: this header is printed for the region loop as well; the text is kept
# unchanged so the printed output stays the same
print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    ntrains_by_region[status] = terms_df_subset_status.groupby('Region').size()

# fill NaN with 0.0
ntrains_by_region = ntrains_by_region.fillna(0)
ntrains_by_country = ntrains_by_country.fillna(0)

ntrains_by_region['In Development (Proposed + Construction)'] = ntrains_by_region[['Proposed','Construction']].sum(axis=1)
ntrains_by_country['In Development (Proposed + Construction)'] = ntrains_by_country[['Proposed','Construction']].sum(axis=1)

# rearrange the order of the columns for output
ntrains_by_country = ntrains_by_country[excel_status_list]
ntrains_by_region = ntrains_by_region[excel_status_list]

ntrains_by_region.index.name = 'Region'
ntrains_by_country.index.name = 'Country'

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = ntrains_by_region.sum(axis=0)
totals_row.name = 'Total'
ntrains_by_region.loc['Total'] = totals_row

totals_row = ntrains_by_country.sum(axis=0)
totals_row.name = 'Total'
ntrains_by_country.loc['Total'] = totals_row

ntrains_by_region.to_excel(excel_writer, sheet_name='LNG export trains by region')
ntrains_by_country.to_excel(excel_writer, sheet_name='LNG export trains by country')

===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired
===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired


### no. import trains by country/region, project status

In [12]:
# Count LNG import rows per country and per region, broken down by project
# status, and write both tables to the Excel workbook.

# filter first, then copy only the rows that are kept
terms_df_subset = terms_df_touse.loc[(terms_df_touse['Import/Export']=='Import') &
                                     (terms_df_touse['Fuel']=='LNG')].copy()

ntrains_by_country = pandas.DataFrame(0, columns=status_list, index=country_list)
ntrains_by_region = pandas.DataFrame(0, columns=status_list, index=region_list)

print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    # column assignment aligns on the index: countries without any row in
    # this status become NaN (filled with 0 below)
    ntrains_by_country[status] = terms_df_subset_status.groupby('Country').size()

# NOTE: this header is printed for the region loop as well; the text is kept
# unchanged so the printed output stays the same
print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    ntrains_by_region[status] = terms_df_subset_status.groupby('Region').size()

# fill NaN with 0.0
ntrains_by_region = ntrains_by_region.fillna(0)
ntrains_by_country = ntrains_by_country.fillna(0)

ntrains_by_region['In Development (Proposed + Construction)'] = ntrains_by_region[['Proposed','Construction']].sum(axis=1)
ntrains_by_country['In Development (Proposed + Construction)'] = ntrains_by_country[['Proposed','Construction']].sum(axis=1)

# rearrange the order of the columns for output
ntrains_by_country = ntrains_by_country[excel_status_list]
ntrains_by_region = ntrains_by_region[excel_status_list]

ntrains_by_region.index.name = 'Region'
ntrains_by_country.index.name = 'Country'

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = ntrains_by_region.sum(axis=0)
totals_row.name = 'Total'
ntrains_by_region.loc['Total'] = totals_row

totals_row = ntrains_by_country.sum(axis=0)
totals_row.name = 'Total'
ntrains_by_country.loc['Total'] = totals_row

# write the counts as whole-number strings ('3' rather than 3.0);
# astype replaces the deprecated element-wise applymap
ntrains_by_country = ntrains_by_country.astype(int).astype(str)
ntrains_by_region = ntrains_by_region.astype(int).astype(str)

ntrains_by_region.to_excel(excel_writer, sheet_name='LNG import trains by region')
ntrains_by_country.to_excel(excel_writer, sheet_name='LNG import trains by country')

===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired
===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired


### capacity (mtpa) of import trains by country/region, project status

In [13]:
# Sum the 'CapacityInMtpa' column of LNG import rows per country and per
# region, broken down by project status, and write both tables to the workbook.

# filter first, then copy only the rows that are kept
terms_df_subset = terms_df_touse.loc[(terms_df_touse['Import/Export']=='Import') &
                                     (terms_df_touse['Fuel']=='LNG')].copy()

cap_by_country = pandas.DataFrame(0, columns=status_list, index=country_list)
cap_by_region = pandas.DataFrame(0, columns=status_list, index=region_list)

print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    # column assignment aligns on the index: countries without any row in
    # this status become NaN (filled with 0 below)
    cap_by_country[status] = terms_df_subset_status.groupby('Country')['CapacityInMtpa'].sum()

# NOTE: this header is printed for the region loop as well; the text is kept
# unchanged so the printed output stays the same
print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    cap_by_region[status] = terms_df_subset_status.groupby('Region')['CapacityInMtpa'].sum()

# fill NaN with 0.0
cap_by_region = cap_by_region.fillna(0)
cap_by_country = cap_by_country.fillna(0)

cap_by_region['In Development (Proposed + Construction)'] = cap_by_region[['Proposed','Construction']].sum(axis=1)
cap_by_country['In Development (Proposed + Construction)'] = cap_by_country[['Proposed','Construction']].sum(axis=1)

# rearrange the order of the columns for output
cap_by_country = cap_by_country[excel_status_list]
cap_by_region = cap_by_region[excel_status_list]

cap_by_region.index.name = 'Region'
cap_by_country.index.name = 'Country'

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = cap_by_region.sum(axis=0)
totals_row.name = 'Total'
cap_by_region.loc['Total'] = totals_row

totals_row = cap_by_country.sum(axis=0)
totals_row.name = 'Total'
cap_by_country.loc['Total'] = totals_row

cap_by_region.to_excel(excel_writer, sheet_name='LNG import capacity by region')
cap_by_country.to_excel(excel_writer, sheet_name='LNG import capacity by country')

===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired
===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired


### capacity (mtpa) of export trains by country/region, project status

In [14]:
# Sum the 'CapacityInMtpa' column of LNG export rows per country and per
# region, broken down by project status, and write both tables to the workbook.

# filter first, then copy only the rows that are kept
terms_df_subset = terms_df_touse.loc[(terms_df_touse['Import/Export']=='Export') &
                                     (terms_df_touse['Fuel']=='LNG')].copy()

cap_by_country = pandas.DataFrame(0, columns=status_list, index=country_list)
cap_by_region = pandas.DataFrame(0, columns=status_list, index=region_list)

print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    # column assignment aligns on the index: countries without any row in
    # this status become NaN (filled with 0 below)
    cap_by_country[status] = terms_df_subset_status.groupby('Country')['CapacityInMtpa'].sum()

# NOTE: this header is printed for the region loop as well; the text is kept
# unchanged so the printed output stays the same
print('===country-level calculations===')
for status in status_list:
    print(status)
    terms_df_subset_status = terms_df_subset.loc[terms_df_subset['Status']==status]
    cap_by_region[status] = terms_df_subset_status.groupby('Region')['CapacityInMtpa'].sum()

# fill NaN with 0.0
cap_by_region = cap_by_region.fillna(0)
cap_by_country = cap_by_country.fillna(0)

cap_by_region['In Development (Proposed + Construction)'] = cap_by_region[['Proposed','Construction']].sum(axis=1)
cap_by_country['In Development (Proposed + Construction)'] = cap_by_country[['Proposed','Construction']].sum(axis=1)

# rearrange the order of the columns for output
cap_by_country = cap_by_country[excel_status_list]
cap_by_region = cap_by_region[excel_status_list]

cap_by_region.index.name = 'Region'
cap_by_country.index.name = 'Country'

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = cap_by_region.sum(axis=0)
totals_row.name = 'Total'
cap_by_region.loc['Total'] = totals_row

totals_row = cap_by_country.sum(axis=0)
totals_row.name = 'Total'
cap_by_country.loc['Total'] = totals_row

cap_by_region.to_excel(excel_writer, sheet_name='LNG export capacity by region')
cap_by_country.to_excel(excel_writer, sheet_name='LNG export capacity by country')

===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired
===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired


## parent analysis
### relies on parent-owner script already being run/completed

In [15]:
# Expand every terminal's 'Parent' string into one row per parent company,
# holding that parent's ownership fraction and the terminal's capacity.
#
# A Parent string is a '; '-separated list of names, each optionally followed
# by ' [<something>]' (e.g. an ownership percentage such as ' [47.5%]').
# Ownership fractions are derived as follows:
#   * no percentage anywhere in the string -> ownership is split evenly
#   * fewer percentages than parents       -> the share left over after the
#     stated percentages is split evenly across the missing entries
#
# Fix: the previous version appended the rows of terminals without any
# percentage twice (once in the "no percentages" branch and once more in the
# common loop), which double-counted them in every by-company table.
bracket_pattern = re.compile(r' \[.*?\]')
percent_pattern = re.compile(r'\d+(?:\.\d+)?%')

# Rows are collected in a plain list and turned into a DataFrame once, rather
# than re-concatenating a growing DataFrame for every parent. The resulting
# frame therefore has a unique 0..n-1 index instead of an all-zero index.
owner_parent_records = []

for idx,row in terms_df_orig.iterrows():
    parent_string = row.Parent
    parent_list = bracket_pattern.sub('', parent_string).split('; ')
    percent_list = [float(i.rstrip('%'))/100. for i in percent_pattern.findall(parent_string)]

    nmissing = len(parent_list)-len(percent_list)
    if nmissing>0:
        if not percent_list:
            # no ownership information at all: equal shares
            percent_list = [1/len(parent_list)]*len(parent_list)
        else:
            # distribute the unallocated share evenly over the missing entries
            leftover = 1-sum(percent_list)
            percent_list += [leftover/nmissing]*nmissing

    # zip stops at the shorter list, so surplus percentages (more percentages
    # than parents) are ignored, as before
    for parent,fraction in zip(parent_list, percent_list):
        owner_parent_records.append({'Parent':parent, 'ComboID':row.ComboID,
                                     'FractionOwnership':fraction,
                                     'CapacityInMtpa':row.CapacityInMtpa})

# explicit columns keep the frame usable even when there are no rows
owner_parent_calculations_df = pandas.DataFrame(owner_parent_records,
                                                columns=['Parent','ComboID','FractionOwnership','CapacityInMtpa'])

# capacity attributed to each parent = ownership fraction x terminal capacity
owner_parent_calculations_df['CapacityInMtpaOwnership'] = owner_parent_calculations_df['FractionOwnership']*owner_parent_calculations_df['CapacityInMtpa']

In [16]:
# Spot check: display the ownership rows generated for terminal T031600.
owner_parent_calculations_df[owner_parent_calculations_df['ComboID'].eq('T031600')]

Unnamed: 0,Parent,ComboID,FractionOwnership,CapacityInMtpa,CapacityInMtpaOwnership
0,Petronet LNG,T031600,0.475,2.7,1.2825
0,Mitsubishi,T031600,0.2625,2.7,0.70875
0,Sojitz Corporation,T031600,0.2625,2.7,0.70875


## parent analysis

### ntrains export by parent company, project status

In [17]:
unique_owner_list = owner_parent_calculations_df.Parent.sort_values().unique().tolist()

##################################################
# create train count by owner, status
# (each export row contributes its ownership fraction to the parent, so the
#  counts are fractional)
##################################################
owners_ntrains_by_status_df = pandas.DataFrame(0.0, index=unique_owner_list, columns=status_list)

for status in status_list:

    terms_df_temporary = terms_df_orig.loc[(terms_df_orig.Status==status)&
                                           (terms_df_orig['Import/Export']=='Export')]
    op_temporary = owner_parent_calculations_df.loc[owner_parent_calculations_df.ComboID.isin(terms_df_temporary.ComboID)]
    # Assign the grouped sums as a Series: column assignment then aligns on
    # the parent name, and parents with nothing in this status become NaN
    # (written as '--' below). The previous code assigned a one-column
    # DataFrame whose column label did not match the target column.
    op_sum = op_temporary.groupby('Parent', dropna=False)['FractionOwnership'].sum(min_count=0)
    owners_ntrains_by_status_df[status] = op_sum

owners_ntrains_by_status_df.index.name = 'Parent Company'
# min_count=1 keeps NaN when a parent has neither proposed nor construction entries
owners_ntrains_by_status_df['In Development (Proposed + Construction)'] = owners_ntrains_by_status_df[['Proposed','Construction']].sum(axis=1, min_count=1)

# rearrange the order of the columns for output
owners_ntrains_by_status_df = owners_ntrains_by_status_df[excel_status_list]

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = owners_ntrains_by_status_df.sum(axis=0)
totals_row.name = 'Total'
owners_ntrains_by_status_df.loc['Total'] = totals_row

owners_ntrains_by_status_df.replace(numpy.nan, '--', inplace=True)
owners_ntrains_by_status_df.to_excel(excel_writer, sheet_name='LNG export trains by company')

### ntrains import by parent company, project status

In [18]:
unique_owner_list = owner_parent_calculations_df.Parent.sort_values().unique().tolist()

##################################################
# create train count by owner, status
# (each import row contributes its ownership fraction to the parent, so the
#  counts are fractional)
##################################################
owners_ntrains_by_status_df = pandas.DataFrame(0.0, index=unique_owner_list, columns=status_list)

for status in status_list:

    terms_df_temporary = terms_df_orig.loc[(terms_df_orig.Status==status)&
                                           (terms_df_orig['Import/Export']=='Import')]
    op_temporary = owner_parent_calculations_df.loc[owner_parent_calculations_df.ComboID.isin(terms_df_temporary.ComboID)]
    # Assign the grouped sums as a Series: column assignment then aligns on
    # the parent name, and parents with nothing in this status become NaN
    # (written as '--' below). The previous code assigned a one-column
    # DataFrame whose column label did not match the target column.
    op_sum = op_temporary.groupby('Parent', dropna=False)['FractionOwnership'].sum(min_count=0)
    owners_ntrains_by_status_df[status] = op_sum

owners_ntrains_by_status_df.index.name = 'Parent Company'
# min_count=1 keeps NaN when a parent has neither proposed nor construction entries
owners_ntrains_by_status_df['In Development (Proposed + Construction)'] = owners_ntrains_by_status_df[['Proposed','Construction']].sum(axis=1, min_count=1)

# rearrange the order of the columns for output
owners_ntrains_by_status_df = owners_ntrains_by_status_df[excel_status_list]

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = owners_ntrains_by_status_df.sum(axis=0)
totals_row.name = 'Total'
owners_ntrains_by_status_df.loc['Total'] = totals_row

owners_ntrains_by_status_df.replace(numpy.nan, '--', inplace=True)
owners_ntrains_by_status_df.to_excel(excel_writer, sheet_name='LNG import trains by company')

## capacity import by parent company, status

In [19]:
unique_owner_list = owner_parent_calculations_df.Parent.sort_values().unique().tolist()

##################################################
# create ownership-weighted import capacity by owner, status
##################################################
owners_capacity_by_status_df = pandas.DataFrame(0.0, index=unique_owner_list, columns=status_list)

for status in status_list:

    terms_df_temporary = terms_df_orig.loc[(terms_df_orig.Status==status)&
                                           (terms_df_orig['Import/Export']=='Import')]
    op_temporary = owner_parent_calculations_df.loc[owner_parent_calculations_df.ComboID.isin(terms_df_temporary.ComboID)]
    # Assign the grouped sums as a Series: column assignment then aligns on
    # the parent name, and parents with nothing in this status become NaN
    # (written as '--' below).
    op_sum = op_temporary.groupby('Parent', dropna=False)['CapacityInMtpaOwnership'].sum(min_count=0)
    owners_capacity_by_status_df[status] = op_sum

owners_capacity_by_status_df.index.name = 'Parent Company'
# min_count=1 keeps NaN when a parent has neither proposed nor construction entries
owners_capacity_by_status_df['In Development (Proposed + Construction)'] = owners_capacity_by_status_df[['Proposed','Construction']].sum(axis=1, min_count=1)

# rearrange the order of the columns for output
owners_capacity_by_status_df = owners_capacity_by_status_df[excel_status_list]

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = owners_capacity_by_status_df.sum(axis=0)
totals_row.name = 'Total'
owners_capacity_by_status_df.loc['Total'] = totals_row

owners_capacity_by_status_df.replace(numpy.nan, '--', inplace=True)

# Fix: write the finished capacity table. The previous code wrote a snapshot
# bound to owners_ntrains_by_status_df before the Total row and the '--'
# replacement, so the sheet lacked both (and the train-count variable was
# overwritten).
owners_capacity_by_status_df.to_excel(excel_writer, sheet_name='LNG import capacity by company')

In [20]:
# capacity export by parent company, status
unique_owner_list = owner_parent_calculations_df.Parent.sort_values().unique().tolist()

##################################################
# create ownership-weighted export capacity by owner, status
##################################################
owners_capacity_by_status_df = pandas.DataFrame(0.0, index=unique_owner_list, columns=status_list)

for status in status_list:

    terms_df_temporary = terms_df_orig.loc[(terms_df_orig.Status==status)&
                                           (terms_df_orig['Import/Export']=='Export')]
    op_temporary = owner_parent_calculations_df.loc[owner_parent_calculations_df.ComboID.isin(terms_df_temporary.ComboID)]
    # Assign the grouped sums as a Series: column assignment then aligns on
    # the parent name, and parents with nothing in this status become NaN
    # (written as '--' below).
    op_sum = op_temporary.groupby('Parent', dropna=False)['CapacityInMtpaOwnership'].sum(min_count=0)
    owners_capacity_by_status_df[status] = op_sum

owners_capacity_by_status_df.index.name = 'Parent Company'
# min_count=1 keeps NaN when a parent has neither proposed nor construction entries
owners_capacity_by_status_df['In Development (Proposed + Construction)'] = owners_capacity_by_status_df[['Proposed','Construction']].sum(axis=1, min_count=1)

# rearrange the order of the columns for output
owners_capacity_by_status_df = owners_capacity_by_status_df[excel_status_list]

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = owners_capacity_by_status_df.sum(axis=0)
totals_row.name = 'Total'
owners_capacity_by_status_df.loc['Total'] = totals_row

owners_capacity_by_status_df.replace(numpy.nan, '--', inplace=True)

# Fix: write the finished capacity table. The previous code wrote a snapshot
# bound to owners_ntrains_by_status_df before the Total row and the '--'
# replacement, so the sheet lacked both (and the train-count variable was
# overwritten).
owners_capacity_by_status_df.to_excel(excel_writer, sheet_name='LNG export capacity by company')

### no. of operating terminals by start year, Import/Export type (1980–2021)

In [21]:
# Number of operating LNG import / export rows per start year
# ('StartYearEarliest'), for the years 1980-2021.
year_indices = list(range(1980,2022))

terms_by_start_year_df = pandas.DataFrame(0.0, index=year_indices, columns=['Import terminals', 'Export trains'])

terms_started = terms_df_touse.loc[(terms_df_touse['Status'].isin(['Operating'])) &
                                   (terms_df_touse['Fuel']=='LNG') &
                                   (terms_df_touse['Import/Export']=='Import')].copy()
# column assignment aligns on the year index: start years outside 1980-2021
# are dropped and years without any start become NaN (set to 0 below)
terms_by_start_year_df['Import terminals'] = terms_started.groupby('StartYearEarliest').size()

terms_started = terms_df_touse.loc[(terms_df_touse['Status'].isin(['Operating'])) &
                                   (terms_df_touse['Fuel']=='LNG') &
                                   (terms_df_touse['Import/Export']=='Export')].copy()
terms_by_start_year_df['Export trains'] = terms_started.groupby('StartYearEarliest').size()

terms_by_start_year_df.index.name = 'Start year'
terms_by_start_year_df.replace(numpy.nan,0,inplace=True)

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = terms_by_start_year_df.sum(axis=0)
totals_row.name = 'Total'
terms_by_start_year_df.loc['Total'] = totals_row

# Fix: write the start-year table itself; the previous code wrote
# owners_ntrains_by_status_df (a by-company table) to this sheet.
terms_by_start_year_df.to_excel(excel_writer, sheet_name='Terminals by start year')

### capacity of operating terminals by start year, Import/Export type (1980–2021)

In [22]:
# Summed 'CapacityInMtpa' of operating LNG import / export rows per start year
# ('StartYearEarliest'), for the years 1980-2021.
year_indices = list(range(1980,2022))

capacity_by_start_year_df = pandas.DataFrame(0.0, index=year_indices, columns=['Import terminals', 'Export trains'])

terms_started = terms_df_touse.loc[(terms_df_touse['Status'].isin(['Operating'])) &
                                   (terms_df_touse['Fuel']=='LNG') &
                                   (terms_df_touse['Import/Export']=='Import')].copy()
# column assignment aligns on the year index: start years outside 1980-2021
# are dropped and years without any start become NaN (set to 0 below)
capacity_by_start_year_df['Import terminals'] = terms_started.groupby('StartYearEarliest')['CapacityInMtpa'].sum()

terms_started = terms_df_touse.loc[(terms_df_touse['Status'].isin(['Operating'])) &
                                   (terms_df_touse['Fuel']=='LNG') &
                                   (terms_df_touse['Import/Export']=='Export')].copy()
capacity_by_start_year_df['Export trains'] = terms_started.groupby('StartYearEarliest')['CapacityInMtpa'].sum()

capacity_by_start_year_df.index.name = 'Start year'
capacity_by_start_year_df.replace(numpy.nan,0,inplace=True)

# Add a 'Total' row. DataFrame.append was removed in pandas 2.0; setting a new
# label through .loc works in every pandas version and keeps the index name.
totals_row = capacity_by_start_year_df.sum(axis=0)
totals_row.name = 'Total'
capacity_by_start_year_df.loc['Total'] = totals_row

# Fix: write the capacity-by-start-year table itself; the previous code wrote
# owners_ntrains_by_status_df (a by-company table) to this sheet.
capacity_by_start_year_df.to_excel(excel_writer, sheet_name='Terminal capacity by start year')

## save excel file

In [23]:
# Write the workbook to disk and release the file handle.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0;
# close() saves the file in every pandas version.
excel_writer.close()

# landing page numbers

In [24]:
# (rows, columns) of the terminals table after the Wiki / LNG filters
terms_df_orig.shape

(1172, 76)

In [25]:
# Sum of the 'CapacityInMtpa' column over every row of the filtered terminals
# table (all statuses, import and export together).
terms_df_orig.CapacityInMtpa.sum()

4393.73