# owner-parent string formatting code

NOTE: owner/parent research is incomplete; any owner whose parents have not been researched yet is presumed to be its own parent (with a 100% share).

In [1]:
import pandas
import pygsheets
import datetime
import numpy
import xarray

In [32]:
# Authorize with the service-account credentials held in the
# GDRIVE_API_CREDENTIALS environment variable and open the workbook by key.
gc = pygsheets.authorize(service_account_env_var='GDRIVE_API_CREDENTIALS')
# alternate workbook key (currently disabled): 129b9YeQn7uAIsMgyB5uleT2QZXYxjD_UetnqwVyoRaI
spreadsheet = gc.open_by_key('1tcS6Wd-Wp-LTDpLzFgJY_RSNDnbyubW3J_9HKIAys4A')

# Read the 'Terminals' tab (reading starts at cell A2) and treat the '--'
# placeholder as a missing value.
terminals_sheet = spreadsheet.worksheet('title', 'Terminals')
terms_df_orig = terminals_sheet.get_as_df(start='A2').replace('--', numpy.nan)



In [33]:
# --- tab 1/3: terminal operators/owners, indexed by ComboID ---
owners_sheet = spreadsheet.worksheet('title', 'Terminal operators/owners (1/3)')
owners_df_orig = owners_sheet.get_as_df(start='A2')
# keep only rows that have both a ComboID and a Wiki entry
has_combo_and_wiki = (owners_df_orig.ComboID != '') & (owners_df_orig.Wiki != '')
owners_df_orig = owners_df_orig.loc[has_combo_and_wiki]
# blank cells become NaN in the working copy; owners_df_orig keeps the blanks
owners_df = owners_df_orig.replace('', numpy.nan).set_index('ComboID')

# --- tab 2/3: owner -> parent relationships, indexed by Owner ---
links_sheet = spreadsheet.worksheet('title', 'Owner–parent relationships (2/3)')
owner_parent_links_df = links_sheet.get_as_df()
# only keep the owners with a checked relationship
relationship_checked = owner_parent_links_df['Parent–Owner Relationship Checked?'] == 'yes'
owner_parent_links_df = (
    owner_parent_links_df.loc[relationship_checked]
    .replace('', numpy.nan)
    .set_index('Owner')
)

# --- tab 3/3: parent metadata, indexed by Parent ---
parents_sheet = spreadsheet.worksheet('title', 'Parent metadata (3/3)')
parents_df = parents_sheet.get_as_df(start='A2')
parents_df = parents_df.loc[parents_df.Parent != ''].set_index('Parent')

# ****************************************

## column-name lists for the ten owner slots and ten parent slots:
## 'Owner1'..'Owner10' / 'Parent1'..'Parent10', plus the matching '...%' columns
slot_numbers = range(1, 10 + 1)

owner_col_names = [f'Owner{slot}' for slot in slot_numbers]
owner_pct_col_names = [f'Owner{slot}%' for slot in slot_numbers]

parent_col_names = [f'Parent{slot}' for slot in slot_numbers]
parent_pct_col_names = [f'Parent{slot}%' for slot in slot_numbers]

# ****************************************
# FOR CHINA
# For rows whose Country is 'China' and that carry a QCC owner, the QCC owner
# replaces Owner1 with a 100% share.  Each such owner is then registered as its
# own 100% parent, with ParentHQCountry 'China'.
qcc_owner_col = 'QCCOwner(业主单位)'
qcc_owners_df = owners_df.loc[(owners_df.Country == 'China') & owners_df[qcc_owner_col].notna()]
owners_df.loc[qcc_owners_df.index, 'Owner1'] = qcc_owners_df[qcc_owner_col]
owners_df.loc[qcc_owners_df.index, 'Owner1%'] = '100.00%'

qcc_owner_names = qcc_owners_df[qcc_owner_col].unique()

# Only add owners that do not already have a relationship row: appending a
# second row with the same label would create a duplicate index entry, and the
# later owner_parent_links_df.loc[owner] lookups expect exactly one row.
existing_link_owners = set(owner_parent_links_df.index)
new_link_owners = [name for name in qcc_owner_names if name not in existing_link_owners]
if new_link_owners:
    # the "checked" column name uses the same en dash as the sheet column the
    # relationships tab is filtered on, so no stray second column is created
    qcc_df_temporary = pandas.DataFrame({'Parent1': new_link_owners,
                                         'Parent1%': '100.00%',
                                         'Parent–Owner Relationship Checked?': 'yes'},
                                        index=new_link_owners)
    owner_parent_links_df = pandas.concat([owner_parent_links_df, qcc_df_temporary])

# Same for the parent metadata: keep an existing row rather than duplicating it.
existing_parents = set(parents_df.index)
new_parents = [name for name in qcc_owner_names if name not in existing_parents]
if new_parents:
    qcc_df_temporary = pandas.DataFrame({'ParentHQCountry': 'China'}, index=new_parents)
    parents_df = pandas.concat([parents_df, qcc_df_temporary])

# ****************************************
## fill in missing parent info by borrowing owner info
# every owner named in any of the Owner1..Owner10 columns of owners_df
owners_FULL_set = owners_df[owner_col_names].stack().dropna().unique().tolist()
# owners that already have a row in owner_parent_links_df
owners_researched_set = list(set(owner_parent_links_df.index.to_list()))

# owners without a relationship row, plus the 'Unknown' placeholder owner;
# 'Unknown' is only added when it has no row yet, so no label is duplicated
owners_missing = set(owners_FULL_set) - set(owners_researched_set)
if 'Unknown' not in set(owners_researched_set):
    owners_missing.add('Unknown')
# sorted so the appended rows come out in the same order on every run
owners_diff = sorted(owners_missing, key=str)

# update owner_parent_links_df with these extra owners: each one becomes its
# own parent with a 100% share (all other columns are left missing).
# The rows are built up front rather than filled in afterwards with
# df['Parent1'].loc[...] = ..., because that chained assignment does not write
# through to the DataFrame when pandas Copy-on-Write is active.
if owners_diff:
    borrowed_links_df = pandas.DataFrame({'Parent1': owners_diff, 'Parent1%': '100.00%'},
                                         index=owners_diff)
    owner_parent_links_df = pandas.concat([owner_parent_links_df, borrowed_links_df])

# ****************************************
# update parents_df with these as well
# note countries will be unknown...
parents_set = list(set(parents_df.index.to_list()))
# 'Unknown' always needs a parents_df row, whether or not it was added above
parents_diff = sorted((set(owners_diff) | {'Unknown'}) - set(parents_set), key=str)
if parents_diff:
    parents_df = pandas.concat([parents_df,
                                pandas.DataFrame(numpy.nan, index=parents_diff, columns=parents_df.columns)])
# every missing value in the parent metadata (all columns) reads 'Unknown'
parents_df = parents_df.replace(numpy.nan, 'Unknown')

In [34]:
# de-duplicated ComboIDs taken from the owners_df index (set order is arbitrary)
unique_combo_ids = set(owners_df.index.to_list())
comboid_list = list(unique_combo_ids)

# make dictionary to house parent info

In [35]:
# po_dict maps each ComboID to:
#   'owner_list'         -> list of owner names
#   'owner_pct_vals'     -> list of owner shares as fractions (NaN if unknown)
#   'owner_parent_links' -> {owner: {'owner_pct_val', 'parent_list',
#                                    'parent_pct_vals', 'parent_hq_country_list'}}
po_dict = {}


def _pct_strings_to_fractions(pct_series):
    """Convert a Series of 'NN.NN%' strings to a float array of fractions.

    Missing entries stay NaN.
    """
    return pct_series.str.strip('%').astype('float').to_numpy() / 100.


def _lookup_parent_hq_country(parent):
    """Return the ParentHQCountry stored in parents_df for *parent*.

    Returns 'Not yet recorded' when the parent has no row in parents_df.
    """
    try:
        country = parents_df.loc[parent, 'ParentHQCountry']
    except KeyError:
        return 'Not yet recorded'
    if isinstance(country, pandas.Series):
        # duplicate index labels: use the first matching row
        country = country.iloc[0]
    return country


# iterate through owners_df
# store in the big po_dict
for combo_id, row in owners_df.iterrows():
    owner_cells = row[owner_col_names]
    owner_list_drop_nans = owner_cells.dropna().tolist()

    if not owner_list_drop_nans:
        # if there are no owners/parents, make them Unknown/Unknown with
        # unknown (NaN) shares
        po_dict[combo_id] = {
            'owner_list': ['Unknown'],
            'owner_pct_vals': [numpy.nan],
            'owner_parent_links': {
                'Unknown': {
                    'owner_pct_val': numpy.nan,
                    'parent_list': ['Unknown'],
                    'parent_pct_vals': [numpy.nan],
                    'parent_hq_country_list': ['Unknown'],
                },
            },
        }
        continue

    # Take the percentage from the same slot as each owner (OwnerN <-> OwnerN%),
    # so a blank slot in the middle of the row cannot shift the pairing.
    owner_slot_filled = owner_cells.notna().to_numpy()
    owner_pct_vals = list(_pct_strings_to_fractions(row[owner_pct_col_names])[owner_slot_filled])

    links_by_owner = {}
    for owner, owner_pct_val in zip(owner_list_drop_nans, owner_pct_vals):
        links_row = owner_parent_links_df.loc[owner]
        if isinstance(links_row, pandas.DataFrame):
            # duplicate index labels make .loc return several rows; use the first
            links_row = links_row.iloc[0]

        parent_cells = links_row[parent_col_names]
        parent_list_drop_nans = parent_cells.dropna().tolist()
        # same slot-wise pairing for parents (ParentN <-> ParentN%)
        parent_slot_filled = parent_cells.notna().to_numpy()
        parent_pct_vals = list(
            _pct_strings_to_fractions(links_row[parent_pct_col_names])[parent_slot_filled]
        )

        links_by_owner[owner] = {
            'owner_pct_val': owner_pct_val,
            'parent_list': parent_list_drop_nans,
            'parent_pct_vals': parent_pct_vals,
            # one country per parent, so this list always has the same length
            # as parent_list even when some parents are missing from parents_df
            'parent_hq_country_list': [_lookup_parent_hq_country(parent)
                                       for parent in parent_list_drop_nans],
        }

    po_dict[combo_id] = {
        'owner_list': owner_list_drop_nans,
        'owner_pct_vals': owner_pct_vals,
        'owner_parent_links': links_by_owner,
    }
            

In [36]:
# one output row per ComboID stored in po_dict; the three string columns start
# out empty and are filled in per ComboID further down
pid_list = [*po_dict]
output_columns = ['OwnerString', 'ParentString', 'ParentHQCountry']
owner_parent_strings_df = pandas.DataFrame(index=pid_list, columns=output_columns)

In [37]:
def _format_share_entries(fraction_by_name):
    """Render a name -> fraction Series as 'Name [12.34%]; Other [unknown %]'.

    Fractions are shown as percentages with two decimals; a NaN fraction is
    shown as 'unknown %'.  An empty Series gives an empty string.
    """
    entries = []
    for name, fraction in fraction_by_name.items():
        if pandas.isna(fraction):
            entries.append(f'{name} [unknown %]')
        else:
            entries.append(f'{name} [{fraction * 100:,.2f}%]')
    return '; '.join(entries)


def _hq_country_or_placeholder(parent):
    """Return the ParentHQCountry of *parent* from parents_df as a string.

    Returns 'Not yet recorded' when the parent has no row in parents_df.
    """
    try:
        country = parents_df.loc[parent, 'ParentHQCountry']
    except KeyError:
        return 'Not yet recorded'
    if isinstance(country, pandas.Series):
        # duplicate index labels: use the first matching row
        country = country.iloc[0]
    return str(country)


for combo_id in comboid_list:
    project = po_dict[combo_id]

    # get list of owners and their shares (position-aligned lists)
    owner_list = project['owner_list']
    owner_pct_vals = project['owner_pct_vals']

    all_parents_list = []
    all_parents_normalized_pct_vals = []

    # Pair each owner with its own share by position, so an owner that is
    # listed twice contributes each of its shares rather than the last one twice.
    for owner, owner_pct_val in zip(owner_list, owner_pct_vals):
        owner_links = project['owner_parent_links'][owner]
        # a parent's share of the project = its share of the owner * the owner's share
        parent_pct_vals = numpy.array(owner_links['parent_pct_vals'], dtype=float)
        all_parents_list += owner_links['parent_list']
        all_parents_normalized_pct_vals += list(parent_pct_vals * owner_pct_val)

    # sum any of the same owners/parents (min_count=1 keeps all-NaN groups NaN),
    # then list the largest shares first
    owner_fractions = (
        pandas.DataFrame({'Owners': owner_list, 'OwnerFractions': owner_pct_vals})
        .groupby(by=['Owners'], dropna=False)['OwnerFractions']
        .sum(min_count=1)
        .sort_values(ascending=False)
    )
    parent_fractions = (
        pandas.DataFrame({'Parents': all_parents_list,
                          'ParentFractions': all_parents_normalized_pct_vals})
        .groupby(by=['Parents'], dropna=False)['ParentFractions']
        .sum(min_count=1)
        .sort_values(ascending=False)
    )

    # HQ countries are listed in the same order as the parents in ParentString
    parent_hq_country_list = [_hq_country_or_placeholder(parent)
                              for parent in parent_fractions.index.tolist()]

    owner_parent_strings_df.loc[combo_id, 'OwnerString'] = _format_share_entries(owner_fractions)
    owner_parent_strings_df.loc[combo_id, 'ParentString'] = _format_share_entries(parent_fractions)
    owner_parent_strings_df.loc[combo_id, 'ParentHQCountry'] = '; '.join(parent_hq_country_list)

# empty strings become the '--' placeholder; done once after the loop instead
# of rescanning the whole frame on every iteration
owner_parent_strings_df.replace('', '--', inplace=True)

In [38]:
# write the three formatted columns to an Excel workbook whose file name
# carries today's date (YYYY-MM-DD)
now_string = datetime.datetime.now().strftime('%Y-%m-%d')
export_columns = ['OwnerString', 'ParentString', 'ParentHQCountry']
export_path = f'GEM-terminals-owner-parent-strings-{now_string}.xlsx'
owner_parent_strings_df[export_columns].to_excel(export_path)