# owner-parent string formatting code

NOTE: owner/parent research is incomplete; presume all owners are parents if we haven't updated them yet

In [1]:
import pandas
import pygsheets
import datetime
import numpy
import xarray

In [43]:
credentials_directory = '/Users/baird/Dropbox/_google-api/'
gc = pygsheets.authorize(client_secret=credentials_directory+'client_secret.json')
spreadsheet = gc.open_by_key('1tcS6Wd-Wp-LTDpLzFgJY_RSNDnbyubW3J_9HKIAys4A')
#spreadsheet = gc.open_by_key('129b9YeQn7uAIsMgyB5uleT2QZXYxjD_UetnqwVyoRaI')

#spreadsheet[1] "Gas Pipelines" tab is the second index
terms_df_orig = spreadsheet.worksheet('title', 'Terminals').get_as_df(start='A2')
terms_df_orig.replace('--',numpy.nan,inplace=True)

In [44]:
owners_df_orig = spreadsheet.worksheet('title', 'Terminal operators/owners (1/3)').get_as_df(start='A2')
owners_df_orig = owners_df_orig.loc[owners_df_orig.ComboID!='']
owners_df_orig = owners_df_orig.loc[owners_df_orig.Wiki!='']
owners_df = owners_df_orig.replace('',numpy.nan)

owner_parent_links_df = spreadsheet.worksheet('title', 'Owner–parent relationships (2/3)').get_as_df()
# only keep the owners with a checked relationship
owner_parent_links_df = owner_parent_links_df.loc[owner_parent_links_df['Parent–Owner Relationship Checked?']=='yes']
owner_parent_links_df.replace('',numpy.nan,inplace=True)

parents_df = spreadsheet.worksheet('title', 'Parent metadata (3/3)').get_as_df(start='A2')
parents_df = parents_df.loc[parents_df.Parent!='']


owners_df.set_index('ComboID', inplace=True)
owner_parent_links_df.set_index('Owner', inplace=True)
parents_df.set_index('Parent', inplace=True)

# ****************************************

## create list of owner and parent column names
owner_pct_col_names = []
owner_col_names = []

parent_pct_col_names = []
parent_col_names = []

for num in range(1,10+1):
    owner_pct_col = f'Owner{num}%'
    owner_pct_col_names.append(owner_pct_col)
    
    owner_col = f'Owner{num}'
    owner_col_names.append(owner_col)
    
    parent_pct_col = f'Parent{num}%'
    parent_pct_col_names.append(parent_pct_col)
    
    parent_col = f'Parent{num}'
    parent_col_names.append(parent_col)

# ****************************************
# FOR CHINA
qcc_owners_df = owners_df.loc[(owners_df.Country=='China')&(~owners_df['QCCOwner(业主单位)'].isnull())]
owners_df.loc[qcc_owners_df.index,'Owner1'] = qcc_owners_df['QCCOwner(业主单位)']
owners_df.loc[qcc_owners_df.index,'Owner1%'] = '100.00%'

qcc_df_temporary = pandas.DataFrame({'Parent1':qcc_owners_df['QCCOwner(业主单位)'].unique(), 'Parent1%':'100.00%', 
                                    'Parent-Owner Relationship Checked?':'yes'}, index=qcc_owners_df['QCCOwner(业主单位)'].unique())
owner_parent_links_df = pandas.concat([owner_parent_links_df, qcc_df_temporary])

qcc_df_temporary = pandas.DataFrame({'ParentHQCountry':'China'}, index=qcc_owners_df['QCCOwner(业主单位)'].unique())
parents_df = pandas.concat([parents_df, qcc_df_temporary])

# ****************************************
## fill in missing parent info by borrowing owner info
owners_FULL_set = owners_df[owner_col_names].stack().dropna().unique().tolist() # from owners_df
owners_researched_set = list(set(owner_parent_links_df.index.to_list()))#+['Unknown'] # only existing owners, plus 'Unknown'
owners_diff = list(set(owners_FULL_set)-set(owners_researched_set))
owners_diff.append('Unknown')

# update owner_parent_links_df with these extra owners
owner_parent_links_df = pandas.concat([owner_parent_links_df, pandas.DataFrame(index=owners_diff, columns=owner_parent_links_df.columns)])
owner_parent_links_df['Parent1'].loc[owners_diff] = owners_diff
owner_parent_links_df['Parent1%'].loc[owners_diff] = '100.00%'

# ****************************************
# update parents_df with these as well
# note countries will be unknkown...
parents_set = list(set(parents_df.index.to_list()))
parents_diff = list(set(owners_diff)-set(parents_set))
parents_diff.append('Unknown')
parents_df = pandas.concat([parents_df, pandas.DataFrame(numpy.nan, index=parents_diff, columns=parents_df.columns)])

In [45]:
comboid_set = list(set(owners_df.index.to_list()))

## make a giant xarray Dataset to house the parent-owner-combo_id information

In [6]:
parents_da = xarray.DataArray(data=parents_df.index.unique().tolist())#owner_parent_links_df
owners_da = xarray.DataArray(data=owner_parent_links_df.index.unique().tolist())
comboid_da = xarray.DataArray(data=comboid_set)

In [7]:
owner_fraction = numpy.empty((parents_da.size, owners_da.size, comboid_da.size), float)
parent_fraction = numpy.empty((parents_da.size, owners_da.size, comboid_da.size), float)
parent_country = numpy.empty((parents_da.size, owners_da.size, comboid_da.size), str)
capacity_in_mtpa = numpy.empty((parents_da.size, owners_da.size, comboid_da.size), float)
owner_parent_tf = numpy.empty((parents_da.size, owners_da.size, comboid_da.size), bool)

owner_fraction[:] = numpy.nan
parent_fraction[:] = numpy.nan
parent_country[:] = numpy.nan
capacity_in_mtpa[:] = numpy.nan

owners_not_accounted_for = []

poc_ds = xarray.Dataset(data_vars = dict(
                                OwnerFraction=(['Parent','Owner','ComboID'], owner_fraction),
                                ParentFraction=(['Parent','Owner','ComboID'], parent_fraction),
                                ParentCountry=(['Parent','Owner','ComboID'], parent_country),
                                CapacityInMtpa=(['Parent','Owner','ComboID'], capacity_in_mtpa),
                                OwnerParentTF=(['Parent','Owner','ComboID'], owner_parent_tf)),
               coords = dict(
                   Parent=(['Parent'], parents_da.values),
                   Owner=(['Owner'], owners_da.values),
                   ComboID=(['ComboID'], comboid_da.values)))

## fill in the Dataset

In [8]:
# iterate through owners_df
for combo_id,row in owners_df.iterrows():
    owner_list = row[owner_col_names].dropna().tolist()
    owner_pct_vals = row[owner_pct_col_names].str.strip('%').astype('float').array/100.

    # SKIP ANYTHING THAT ISN'T IN THE PARENTS METADATA TAB:
    #if set(owner_list).issubset(owners_set)==False:
    #    owners_not_accounted_for+=owner_list
    #    continue # loops to the next 

    # if no owner or parent info, skip but record other stuff
    if owner_list==[]:
        owner='Unknown'
        parent='Unknown'
        poc_ds['ParentFraction'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = numpy.nan
        poc_ds['OwnerParentTF'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = True
        poc_ds['CapacityInMtpa'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = terms_df_orig.loc[terms_df_orig.ComboID==combo_id]['CapacityInMtpa'].values[0]
        poc_ds['ParentCountry'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = terms_df_orig.loc[terms_df_orig.ComboID==combo_id]['Country'].values[0]
    
    else:
        for o_idx,owner in enumerate(owner_list):
            parent_list = owner_parent_links_df.loc[owner][parent_col_names].squeeze().dropna().tolist()
            parent_pct_vals = owner_parent_links_df.loc[owner][parent_pct_col_names].str.strip('%').astype('float').array/100.
            #if owner not in owners_set:
            #    owners_not_accounted_for.append(owner)
            #    continue
            for p_idx,parent in enumerate(parent_list):
                poc_ds['CapacityInMtpa'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = terms_df_orig.loc[terms_df_orig.ComboID==combo_id]['CapacityInMtpa'].values[0]
                poc_ds['OwnerFraction'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = owner_pct_vals[o_idx]
                poc_ds['ParentFraction'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = parent_pct_vals[p_idx]
                poc_ds['ParentCountry'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = parents_df.loc[parent]['ParentHQCountry']
                poc_ds['OwnerParentTF'].loc[dict(Parent=parent, Owner=owner, ComboID=combo_id)] = True

## placeholder dataframe to eventually fill in strings

In [76]:
owner_parent_strings_df = pandas.DataFrame(index=comboid_da.values, columns=['OwnerList','ParentList',
                                                                    'OwnerString','ParentString'])
#                                                                    'OwnerPercentsArrayWithNans','ParentPercentsArrayWithNans',
#                                                                    'OwnerPercentsArray','ParentPercentsArray',
#                                                                    'ParentOwnrshpArray'])

## manipulate huge dataset and fill in dataframe strings

In [124]:
for combo_id in comboid_da.values[:20]: #['T024006']:#
    parent_indices, owner_indices = numpy.where(poc_ds['OwnerParentTF'].loc[dict(ComboID=combo_id)].values == True)
    parent_indices = list(parent_indices)
    owner_indices = list(owner_indices)
    parent_indices_unique = list(set(parent_indices))
    owner_indices_unique = list(set(owner_indices))
    
    owner_list_repeats = list(owners_da[owner_indices].values)
    parent_list_repeats = list(parents_da[parent_indices].values)
    
    owner_fractions_list_repeats = poc_ds['OwnerFraction'].loc[dict(ComboID=combo_id)].values[parent_indices, owner_indices]
    parent_fractions_list_repeats = poc_ds['ParentFraction'].loc[dict(ComboID=combo_id)].values[parent_indices, owner_indices]
    
    parent_fractions_list_repeats = parent_fractions_list_repeats * owner_fractions_list_repeats
    if set(owner_list_repeats).__len__()==1:
        owner_fractions_list_repeats /= owner_fractions_list_repeats.__len__()
    
    owner_frac_df = pandas.DataFrame({'Owners':owner_list_repeats,'OwnerFractions':owner_fractions_list_repeats})
    parent_frac_df = pandas.DataFrame({'Parents':parent_list_repeats,'ParentFractions':parent_fractions_list_repeats})
    
    # in some cases owner and parent will no longer be same size
    owner_frac_df = pandas.DataFrame(owner_frac_df.groupby(by=['Owners'], dropna=False)['OwnerFractions'].sum(min_count=1))
    parent_frac_df = pandas.DataFrame(parent_frac_df.groupby(by=['Parents'], dropna=False)['ParentFractions'].sum(min_count=1))
    #if owner_frac_df.index.size != parent_frac_df.index.size:

    owner_frac_df.sort_values('OwnerFractions', ascending=False, inplace=True)
    parent_frac_df.sort_values('ParentFractions', ascending=False, inplace=True)

    parent_formatted_string = ('; ').join(list(parent_frac_df.index + (parent_frac_df['ParentFractions']*100).map(' [{:,.2f}%]'.format)))
    owner_formatted_string = ('; ').join(list(owner_frac_df.index + (owner_frac_df['OwnerFractions']*100).map(' [{:,.2f}%]'.format)))

    parent_formatted_string = parent_formatted_string.replace('nan%', 'unknown %')
    owner_formatted_string = owner_formatted_string.replace('nan%', 'unknown %')
    
    owner_parent_strings_df.loc[combo_id,'OwnerString'] = owner_formatted_string
    owner_parent_strings_df.loc[combo_id,'ParentString'] = parent_formatted_string
    owner_parent_strings_df.loc[combo_id,'OwnerList'] = list(owner_frac_df.index)
    owner_parent_strings_df.loc[combo_id,'ParentList'] = list(parent_frac_df.index)
    
    owner_parent_strings_df.replace('','--',inplace=True)
    #print(owner_frac_df.OwnerFractions.sum())
    if True==True:
        print(owner_frac_df)
        print('owner', owner_fractions_list_repeats)
        print(owner_formatted_string)
        print('parent', parent_fractions_list_repeats)
        print(parent_formatted_string)
        print(combo_id)
        print()

                      OwnerFractions
Owners                              
Chevron Australia             0.4733
ExxonMobil Australia          0.2500
Shell Australia               0.2500
Osaka Gas Australia           0.0125
Tokyo Gas Australia           0.0100
JERA Australia                0.0042
owner [0.4733 0.25   0.0042 0.0125 0.25   0.01  ]
Chevron Australia [47.33%]; ExxonMobil Australia [25.00%]; Shell Australia [25.00%]; Osaka Gas Australia [1.25%]; Tokyo Gas Australia [1.00%]; JERA Australia [0.42%]
parent [0.4733 0.25   0.0042 0.0125 0.25   0.01  ]
Chevron [47.33%]; ExxonMobil [25.00%]; Shell [25.00%]; Osaka Gas [1.25%]; Tokyo Gas [1.00%]; JERA [0.42%]
T033404

                 OwnerFractions
Owners                         
Leif Höegh & Co             1.0
owner [1.]
Leif Höegh & Co [100.00%]
parent [1.]
Leif Höegh & Co [100.00%]
T065400

                          OwnerFractions
Owners                                  
Sabine Pass Liquefaction             1.0
owner [1.]
Sabine P

In [61]:
owner_parent_strings_df.OwnerString[20]

'CNTIC VPower [unknown %]'

# write out data as Excel file

In [11]:
now_string = datetime.datetime.now().strftime('%Y-%m-%d')
owner_parent_strings_df[['OwnerString','ParentString','OwnerList','ParentList']].to_excel('GEM-terminals-owner-parent-strings-'+now_string+'.xlsx')
#owner_parent_strings_df.to_excel('GEM-terminals-owner-parent-strings-'+now_string+'.xlsx')