# owner-parent string formatting code

NOTE: owner/parent research is incomplete; presume all owners are parents if we haven't updated them yet

This script will only be successful if all 'parents' identified in sheet (2/3) are included in the parent metadata on sheet (3/3)

In [1]:
import pandas
import pygsheets
import datetime
import numpy
import xarray
import sparse

In [2]:
#fuel_type = 'Gas'
#fuel_type = 'Oil'
fuel_type = 'Oil-and-Gas'

credentials_directory = '/Users/baird/Dropbox/_google-api/'
gc = pygsheets.authorize(client_secret=credentials_directory+'client_secret.json')

spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek')
gas_pipes = spreadsheet.worksheet('title', 'Gas pipelines').get_as_df(start='A2')
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df(start='A2')

pipes_dict_df = spreadsheet.worksheet('title', 'Data dictionary').get_as_df()

if fuel_type == 'Gas':
    pipes_df_orig = gas_pipes.copy() #pandas.concat([oil_pipes, gas_pipes], ignore_index=True)
if fuel_type == 'Oil':
    pipes_df_orig = oil_pipes.copy()
if fuel_type == 'Oil-and-Gas':
    pipes_df_orig = pandas.concat([oil_pipes, gas_pipes], ignore_index=True)

pipes_df_orig = pipes_df_orig.loc[pipes_df_orig.Wiki!='']
pipes_df_orig.replace('--', numpy.nan, inplace=True)

In [3]:
owners_df_orig = spreadsheet.worksheet('title', 'Pipeline operators/owners (1/3)').get_as_df(start='A2')
owners_df_orig = owners_df_orig.loc[owners_df_orig.ProjectID!='']
owners_df_orig = owners_df_orig.loc[owners_df_orig.Wiki!='']
owners_df = owners_df_orig.replace('',numpy.nan)

owner_parent_links_df = spreadsheet.worksheet('title', 'Owner–parent relationships (2/3)').get_as_df(start='A2')
# only keep the owners with a checked relationship
owner_parent_links_df = owner_parent_links_df.loc[owner_parent_links_df['Parent–Owner Relationship Checked?']=='yes']
owner_parent_links_df.replace('',numpy.nan,inplace=True)

parents_df = spreadsheet.worksheet('title', 'Parent metadata (3/3)').get_as_df(start='A2')
parents_df = parents_df.loc[parents_df.Parent!='']

owners_df.set_index('ProjectID', inplace=True)
owner_parent_links_df.set_index('Owner', inplace=True)
parents_df.set_index('Parent', inplace=True)

# ****************************************

## create list of owner and parent column names
owner_pct_col_names = []
owner_col_names = []

parent_pct_col_names = []
parent_col_names = []

for num in range(1,11+1):
    owner_pct_col = f'Owner{num}%'
    owner_pct_col_names.append(owner_pct_col)
    
    owner_col = f'Owner{num}'
    owner_col_names.append(owner_col)
    
    parent_pct_col = f'Parent{num}%'
    parent_pct_col_names.append(parent_pct_col)
    
    parent_col = f'Parent{num}'
    parent_col_names.append(parent_col)

# ****************************************
## fill in missing parent info by borrowing owner info
## for example, if we don't have parent info, presume owner is parent for now...
owners_FULL_set = owners_df[owner_col_names].stack().dropna().unique().tolist() # from owners_df
owners_researched_set = list(set(owner_parent_links_df.index.to_list()))#+['Unknown'] # only existing owners, plus 'Unknown'
owners_diff = list(set(owners_FULL_set)-set(owners_researched_set))
owners_diff.append('Unknown')

# update owner_parent_links_df with these extra owners
owner_parent_links_df = pandas.concat([owner_parent_links_df, pandas.DataFrame(index=owners_diff, columns=owner_parent_links_df.columns)])
owner_parent_links_df['Parent1'].loc[owners_diff] = owners_diff
owner_parent_links_df['Parent1%'].loc[owners_diff] = '100.00%'

# ****************************************
# update parents_df with these as well
# note countries will be unknkown...
parents_set = list(set(parents_df.index.to_list()))
parents_diff = list(set(owners_diff)-set(parents_set))
parents_diff.append('Unknown')
parents_df = pandas.concat([parents_df, pandas.DataFrame(numpy.nan, index=parents_diff, columns=parents_df.columns)])

In [4]:
projectid_set = list(set(owners_df.index.to_list()))

## make a giant xarray Dataset to house the parent-owner-project_id information

In [5]:
parents_da = xarray.DataArray(data=parents_df.index.unique().tolist())#owner_parent_links_df
owners_da = xarray.DataArray(data=owner_parent_links_df.index.unique().tolist())
projectid_da = xarray.DataArray(data=projectid_set)

this causes the kernel to die sometimes, and is too large of a list:

In [6]:
#owner_fraction = numpy.empty((parents_da.size, owners_da.size, projectid_da.size), float)*numpy.nan
#owner_fraction = numpy.full((parents_da.size, owners_da.size, projectid_da.size), 0)
owner_fraction = sparse.COO((parents_da.size, owners_da.size, projectid_da.size), fill_value=0)

print('done')
#print(owner_fraction.nbytes)
#parent_fraction = numpy.empty((parents_da.size, owners_da.size, projectid_da.size), float)
parent_fraction = numpy.full((parents_da.size, owners_da.size, projectid_da.size), numpy.nan)
print('done')
#parent_country = numpy.empty((parents_da.size, owners_da.size, projectid_da.size), str)
#length_in_km = numpy.empty((parents_da.size, owners_da.size, projectid_da.size), float)
owner_parent_tf = numpy.empty((parents_da.size, owners_da.size, projectid_da.size), bool)
print('done')

#owner_fraction[:] = numpy.nan
#print('done')
#parent_fraction[:] = numpy.nan
#print('done')

owners_not_accounted_for = []

done
done
done


In [7]:
poc_ds = xarray.Dataset(data_vars = dict(
                                OwnerFraction=(['Parent','Owner','ProjectID'], owner_fraction),
                                ParentFraction=(['Parent','Owner','ProjectID'], parent_fraction),
                                #ParentCountry=(['Parent','Owner','ProjectID'], parent_country),
                                #LengthInKm=(['Parent','Owner','ProjectID'], length_in_km),
                                OwnerParentTF=(['Parent','Owner','ProjectID'], owner_parent_tf)),
               coords = dict(
                   Parent=(['Parent'], parents_da.values),
                   Owner=(['Owner'], owners_da.values),
                   ProjectID=(['ProjectID'], projectid_da.values)))

## fill in the Dataset

In [45]:
# iterate through owners_df
# store in the big poc_ds dataset the fractions and True/False
count = 0
for project_id,row in owners_df.iterrows():
    #if count>20:
    #    break
    owner_list = row[owner_col_names].dropna().tolist()
    #print(owner_list)
    owner_pct_vals = row[owner_pct_col_names].str.strip('%').astype('float').array/100.
    #print(owner_pct_vals)
    # if no owner or parent info, record 'Unknown' as owner/parent, nans as percent owhership
    if owner_list==[]:
        owner='Unknown'
        parent='Unknown'
        owner_list = [owner]
        parent_list = [parent]
        owner_pct_vals = [numpy.nan]
        parent_pct_vals = [numpy.nan]

    for o_idx,owner in enumerate(owner_list):
        if owner!='Unknown':
            parent_list = owner_parent_links_df.loc[owner][parent_col_names].squeeze().dropna().tolist()
            parent_pct_vals = owner_parent_links_df.loc[owner][parent_pct_col_names].str.strip('%').astype('float').array/100.
        for p_idx,parent in enumerate(parent_list):
            poc_ds['OwnerFraction'].loc[dict(Parent=parent, Owner=owner, ProjectID=project_id)] = owner_pct_vals[o_idx]
            poc_ds['ParentFraction'].loc[dict(Parent=parent, Owner=owner, ProjectID=project_id)] = parent_pct_vals[p_idx]
            poc_ds['OwnerParentTF'].loc[dict(Parent=parent, Owner=owner, ProjectID=project_id)] = True
#    count+=1

  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] = value
  array[key] 

## placeholder dataframe to eventually fill in strings

In [46]:
owner_parent_strings_df = pandas.DataFrame(index=projectid_da.values, columns=[#'OwnerList','ParentList',
                                                                    'OwnerString','ParentString'])
#                                                                    'OwnerPercentsArrayWithNans','ParentPercentsArrayWithNans',
#                                                                    'OwnerPercentsArray','ParentPercentsArray',
#                                                                    'ParentOwnrshpArray'])

## manipulate huge dataset and fill in dataframe strings

In [47]:
poc_ds

In [48]:
dict(ProjectID=project_id)

{'ProjectID': 'P5441'}

In [49]:
tf_da.shape

(1021, 1139, 4409)

In [50]:
numpy.where(tf_da.loc[dict(ProjectID=project_id)].values == True)

(array([508]), array([336]))

In [36]:
count = 1

tf_da = poc_ds['OwnerParentTF']

for project_id in projectid_da.values:
    print(count)
    if count>10:
        break
    
    # get indices in the poc_ds where the parent and owners are for this project_id
    parent_indices, owner_indices = numpy.where(tf_da.loc[dict(ProjectID=project_id)].values == True)
    #parent_indices, owner_indices = numpy.where(~poc_ds['ParentFraction'].loc[dict(ProjectID=project_id)].isnull())
    parent_indices = list(parent_indices)
    owner_indices = list(owner_indices)
    
    parent_indices_unique = list(set(parent_indices))
    owner_indices_unique = list(set(owner_indices))
    
    print(parent_indices)
    print(owner_indices) 
    owner_list_repeats = list(owners_da[owner_indices].values)
    parent_list_repeats = list(parents_da[parent_indices].values)
    
    owner_fractions_list_repeats = poc_ds['OwnerFraction'].loc[dict(ProjectID=project_id)].values[parent_indices, owner_indices]
    parent_fractions_list_repeats = poc_ds['ParentFraction'].loc[dict(ProjectID=project_id)].values[parent_indices, owner_indices]
    
    parent_fractions_list_repeats = parent_fractions_list_repeats * owner_fractions_list_repeats
    
    owner_frac_df = pandas.DataFrame({'Owners':owner_list_repeats,'OwnerFractions':owner_fractions_list_repeats})
    parent_frac_df = pandas.DataFrame({'Parents':parent_list_repeats,'ParentFractions':parent_fractions_list_repeats})
    owner_frac_df = pandas.DataFrame(owner_frac_df.groupby(by=['Owners'], dropna=False)['OwnerFractions'].sum(min_count=1))
    parent_frac_df = pandas.DataFrame(parent_frac_df.groupby(by=['Parents'], dropna=False)['ParentFractions'].sum(min_count=1))
    owner_frac_df.sort_values('OwnerFractions', ascending=False, inplace=True)
    parent_frac_df.sort_values('ParentFractions', ascending=False, inplace=True)

    parent_formatted_string = ('; ').join(list(parent_frac_df.index + (parent_frac_df['ParentFractions']*100).map(' [{:,.2f}%]'.format)))
    owner_formatted_string = ('; ').join(list(owner_frac_df.index + (owner_frac_df['OwnerFractions']*100).map(' [{:,.2f}%]'.format)))

    parent_formatted_string = parent_formatted_string.replace('nan%', 'unknown %')
    owner_formatted_string = owner_formatted_string.replace('nan%', 'unknown %')
    
    owner_parent_strings_df.loc[project_id,'OwnerString'] = owner_formatted_string
    owner_parent_strings_df.loc[project_id,'ParentString'] = parent_formatted_string
    
    owner_parent_strings_df.replace('','--',inplace=True)
    
    if count%100==0:
        print(count)
    count+=1

1
[372, 373]
[629, 635]
2
[1007]
[1138]
3
[404]
[700]
4
[333]
[70]
5
[152]
[545]
6
[333]
[70]
7
[670]
[307]
8
[152]
[545]
9
[66]
[152]
10
[846]
[894]
11


In [26]:
owner_parent_strings_df.loc[owner_parent_strings_df.index.isin(['P1554','P4439'])]

Unnamed: 0,OwnerString,ParentString
P4439,Unknown [0.00%],Unknown [unknown %]
P1554,Saibu Gas [0.00%],Saibu Gas [0.00%]


# write out data as Excel file

In [19]:
now_string = datetime.datetime.now().strftime('%Y-%m-%d')
owner_parent_strings_df[['OwnerString','ParentString','OwnerList','ParentList']].to_excel('GEM-pipelines-owner-parent-strings-'+now_string+'.xlsx')
#owner_parent_strings_df.to_excel('GEM-terminals-owner-parent-strings-'+now_string+'.xlsx')