In [1]:
import pandas
pandas.set_option("display.max_rows", 50, "display.max_columns", 50)

import numpy
import pygsheets
import datetime

In [2]:
pandas.io.formats.excel.ExcelFormatter.header_style = None

#fuel_type = 'Gas'
#fuel_type = 'Oil'
fuel_type = 'NGL'

if fuel_type=='Gas':
    excel_writer = pandas.ExcelWriter('GOIT-Summary-Sheets-Gas-'+str(datetime.date.today())+'.xlsx', engine='xlsxwriter')
if fuel_type=='NGL':
    excel_writer = pandas.ExcelWriter('GOIT-Summary-Sheets-NGL-'+str(datetime.date.today())+'.xlsx', engine='xlsxwriter')
if fuel_type=='Oil':
    excel_writer = pandas.ExcelWriter('GOIT-Summary-Sheets-Oil-'+str(datetime.date.today())+'.xlsx', engine='xlsxwriter')

import Pipelines_Current dataset

In [3]:
credentials_directory = '/Users/baird/Dropbox/_google-api/'
gc = pygsheets.authorize(client_secret=credentials_directory+'client_secret.json')
spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek')

gas_pipes = spreadsheet.worksheet('title', 'Gas pipelines').get_as_df()
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df()

gas_pipes = gas_pipes.drop('WKTFormat', axis=1) # delete WKTFormat column
oil_pipes = oil_pipes.drop('WKTFormat', axis=1)
pipes_df_orig = pandas.concat([oil_pipes, gas_pipes], ignore_index=True)

#get other relevant sheets
country_ratios_df = spreadsheet.worksheet('title', 'Country ratios by pipeline').get_as_df()
owners_df_orig = spreadsheet.worksheet('title', 'Pipeline operators/owners (1/3)').get_as_df()

# remove empty cells for pipes, owners
pipes_df_orig = pipes_df_orig[pipes_df_orig['PipelineName']!='']
pipes_df_orig = pipes_df_orig[pipes_df_orig['Wiki']!='']
owners_df_orig = owners_df_orig[owners_df_orig['ProjectID']!='']
owners_df_orig = owners_df_orig[owners_df_orig['Wiki']!='']
owners_df_orig = owners_df_orig[owners_df_orig.Status!='N/A']

owners_df_orig.set_index('ProjectID', inplace=True)

In [4]:
country_ratios_df.replace('--', numpy.nan, inplace=True)

owners_df_orig.replace('',numpy.nan,inplace=True)
owners_df_orig.replace('--',numpy.nan,inplace=True)


pipes_df_orig.replace('--',numpy.nan,inplace=True)

In [5]:
region_df_orig = spreadsheet.worksheet('title', 'Region dictionary').get_as_df()

In [6]:
region_df_eu = region_df_orig.loc[region_df_orig['EuropeanUnion']=='Yes']
region_df_egt = region_df_orig.loc[region_df_orig['EuroGasTracker']=='Yes']
region_df_europe = region_df_orig.loc[region_df_orig['Region']=='Europe']
region_df_eu_uk = region_df_orig.loc[(region_df_orig['Region']=='Europe') | 
                                          (region_df_orig['Country'].isin(['United Kingdom','Israel']))]
#region_df_global = region_df_orig.copy()

In [7]:
region_df_touse = region_df_orig.copy()

### create country-specific dataframes for region, country_ratios_df, owners_df

In [8]:
country_ratios_df_touse = country_ratios_df.copy()#.loc[country_ratios_df['Country'].str.contains(
                                           # '|'.join(region_df_touse['Country'].tolist()))]

owners_df_touse = owners_df_orig.copy()#.loc[owners_df_orig['Countries'].str.contains(
                                        #    '|'.join(region_df_touse['Country'].tolist()))]

pipes_df_touse = pipes_df_orig.copy()#loc[pipes_df_orig['Countries'].str.contains(
                                    #        '|'.join(region_df_touse['Country'].tolist()))]

### set up info to analyze Owners tab

In [9]:
# get a set list of all owners
owner_column_list = ['Owner1', 
                     'Owner2', 
                     'Owner3', 
                     'Owner4', 
                     'Owner5', 
                     'Owner6', 
                     'Owner7', 
                     'Owner8', 
                     'Owner9', 
                     'Owner10', 
                     'Owner11']

percent_column_list = ['Owner1%', 
                     'Owner2%', 
                     'Owner3%', 
                     'Owner4%', 
                     'Owner5%', 
                     'Owner6%', 
                     'Owner7%', 
                     'Owner8%', 
                     'Owner9%', 
                     'Owner10%', 
                     'Owner11%']

### sum MergedKmByCountry and MergedKmByRegion

In [10]:
status_list = ['Proposed', 
               'Construction', 
               'Shelved', 
               'Cancelled', 
               'Operating', 
               'Idle', 
               'Mothballed', 
               'Retired']
country_list = sorted(list(set(country_ratios_df_touse['Country'])))
region_list = sorted(list(set(country_ratios_df_touse['Region'])))

In [11]:
excel_status_list = ['Proposed', 
                     'Construction', 
                     'In Development (Proposed + Construction)', 
                     'Shelved', 
                     'Cancelled', 
                     'Operating', 
                     'Idle', 
                     'Mothballed', 
                     'Retired']

In [12]:
country_ratios_df_subset = country_ratios_df_touse.copy()[country_ratios_df_touse['Fuel']==fuel_type]

km_by_country = pandas.DataFrame(columns=status_list, index=country_list)
km_by_region = pandas.DataFrame(columns=status_list, index=region_list)

print('===country-level calculations===')
for status in status_list:
    print(status)
    country_ratios_df_subset_status = country_ratios_df_subset[country_ratios_df_subset['Status']==status]
    km_by_country[status] = country_ratios_df_subset_status.groupby('Country')['MergedKmByCountry'].sum()

print('===regional calculations===')
for status in status_list:
    print(status)
    country_ratios_df_subset_status = country_ratios_df_subset[country_ratios_df_subset['Status']==status]
    km_by_region[status] = country_ratios_df_subset_status.groupby('Region')['MergedKmByCountry'].sum()

# fille NaN with 0.0
km_by_region = km_by_region.fillna(0)
km_by_country = km_by_country.fillna(0)

km_by_region['In Development (Proposed + Construction)'] = km_by_region[['Proposed','Construction']].sum(axis=1)
km_by_country['In Development (Proposed + Construction)'] = km_by_country[['Proposed','Construction']].sum(axis=1)

km_by_country = km_by_country[excel_status_list]
km_by_region = km_by_region[excel_status_list]

km_by_region.index.name = 'Region'
km_by_country.index.name = 'Country'

totals_row = km_by_region.sum(axis=0)
totals_row.name = 'Total'
km_by_region = km_by_region.append(totals_row)

totals_row = km_by_country.sum(axis=0)
totals_row.name = 'Total'
km_by_country = km_by_country.append(totals_row)

km_by_region.to_excel(excel_writer, 'Kilometers by region')
km_by_country.to_excel(excel_writer, 'Kilometers by country')

===country-level calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired
===regional calculations===
Proposed
Construction
Shelved
Cancelled
Operating
Idle
Mothballed
Retired


In [13]:
km_by_region

Unnamed: 0_level_0,Proposed,Construction,In Development (Proposed + Construction),Shelved,Cancelled,Operating,Idle,Mothballed,Retired
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia and New Zealand,0.0,0.0,0.0,0.0,0.0,1453.0,0.0,0.0,0.0
East Asia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Eurasia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Europe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Latin America and the Caribbean,57.11,0.0,57.11,0.0,0.0,560.0,0.0,0.0,0.0
Middle East and North Africa,0.0,300.0,300.0,0.0,0.0,2363.0,0.0,0.0,0.0
North America,858.79,563.27,1422.06,0.0,1665.68,21027.35,0.0,0.0,0.0
SE Asia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
South Asia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sub-Saharan Africa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### create an excel file with all unique owners, not specific to fuel type

In [14]:
owner_list = []
for column in owner_column_list:
    owner_list+=list(owners_df_orig[column])
owner_list = list(set(owner_list))
# remove empty (no owner)
owner_list.remove(numpy.nan)
unique_owner_list = sorted(owner_list)

#pandas.Series(unique_owner_list).to_excel('AllFuels'+'UniqueOwnersList.xlsx')

## pipeline km by parent company (owner) and project status

In [16]:
set(owners_df_touse.index) ^ set(pipes_df_orig.ProjectID)

{'P3162', 'P3599', 'P3600', 'P3656', 'P3672'}

In [17]:
owners_df_touse[owners_df_touse.index.duplicated()]

Unnamed: 0_level_0,PipelineNetworkContainer,PipelineName,SegmentName,Countries,MergedKmByPipeline,Fuel,Status,Wiki,AggregateOwners,Researcher,LastUpdated,Notes,Operator,OperatorLocalLanguage,Owner1,Owner1%,Owner2,Owner2%,Owner3,Owner3%,Owner4,Owner4%,Owner5,Owner5%,Owner6,Owner6%,Owner7,Owner7%,Owner8,Owner8%,Owner9,Owner9%,Owner10,Owner10%,Owner11,Owner11%,OwnerQCC(业主单位),OwnerQCC(业主单位)%,Percentage Verification
ProjectID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1


In [18]:
owners_df_subset = owners_df_touse.loc[owners_df_touse['Fuel']==fuel_type]

owners_df_subset['Owner1'].replace(numpy.nan, '*** Unknown owner ***', inplace=True)

##################################################

owner_list_subset = []
for column in owner_column_list:
    owner_list_subset+=list(owners_df_subset[column])
owner_list_subset = list(set(owner_list_subset))
# remove empty (no owner)
owner_list_subset.remove(numpy.nan)
unique_owner_list_subset = sorted(owner_list_subset)

##################################################
# convert percents to fractions
##################################################
owners_df_fractions = owners_df_subset.copy()

for col in percent_column_list:
    #print(col)
    owners_df_fractions[col] = owners_df_fractions[col].apply(lambda x: numpy.nan if x in [numpy.nan] 
                                          else x[:-1]).astype(float)/100

##################################################
# create km count by owner, status
##################################################
no_owner_info_count = 0
owners_km_by_status_df = pandas.DataFrame(0, index=unique_owner_list_subset, columns=status_list, dtype=object)

for status in status_list:
    
    owners_df_temporary = owners_df_fractions.loc[owners_df_fractions['Status']==status]
    
    for idx,row in owners_df_temporary.iterrows():
        
        ### how many owners are there?
        row_owners = list(row[owner_column_list])
        row_owners = [i for i in row_owners if str(i)!='nan']
        n_owners = row_owners.__len__()
        row_fractions = list(row[percent_column_list])
        row_fractions = [i for i in row_fractions if str(i)!='nan']

        ### if there are no owners listed, continue to next loop iteration
        if n_owners==0:
            no_owner_info_count+=1
            continue # doesn't complete the rest of the ifs

        ### now if row fractions is an empty list, create equal fractions list instead
        if row_fractions==[]:
            row_fractions = [1/n_owners]*n_owners
        elif row_fractions.__len__()!=row_owners.__len__():
            print(row_owners, row_fractions)

        pipe_length = row['MergedKmByPipeline']
        # for pipelines where the known length is missing, and there's no route:
        #if numpy.isnan(pipe_length):
        #    print(row_owners)
        #    print(row.PipelineName, row.SegmentName)
        #    print()
        
        owner_km_fractions = numpy.array(row_fractions)*pipe_length

        for owner_idx in range(n_owners):
            #print(row_owners)
            owners_km_by_status_df.loc[row_owners[owner_idx]][status]+=owner_km_fractions[owner_idx]

owners_km_by_status_df.index.name = 'Owner'
#owners_km_by_status_df = owners_km_by_status_df*numpy.nan
owners_km_by_status_df['In Development (Proposed + Construction)'] = owners_km_by_status_df[['Proposed','Construction']].sum(axis=1)
owners_km_by_status_df = owners_km_by_status_df[excel_status_list]

# rearrange the order of the columns for output
owners_km_by_status_df = owners_km_by_status_df[excel_status_list]

totals_row = owners_km_by_status_df.sum(axis=0)
totals_row.name = 'Total'
owners_km_by_status_df = owners_km_by_status_df.append(totals_row)

#owners_km_by_status_df = owners_km_by_status_df.astype(int)
owners_km_by_status_df.to_excel(excel_writer, 'Kilometers by owner')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [19]:
owners_km_by_status_df

Unnamed: 0_level_0,Proposed,Construction,In Development (Proposed + Construction),Shelved,Cancelled,Operating,Idle,Mothballed,Retired
Owner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
APA Group,0.0,0.0,0.0,0,0.0,1375.0,0,0,0
BHP Billiton Group,0.0,0.0,0.0,0,0.0,39.0,0,0,0
Baymark Pipeline LLC,0.0,0.0,0.0,0,0.0,146.450304,0,0,0
Chaparral Pipeline Company LLC,0.0,0.0,0.0,0,0.0,1926.13824,0,0,0
Chevron,0.0,0.0,0.0,0,0.0,168.98112,0,0,0
DCP Midstream LLC,0.0,0.0,0.0,0,0.0,1158.72768,0,0,0
EIP Investment,0.0,0.0,0.0,0,0.0,53.875,0,0,0
EPIC Midstream Holdings,0.0,0.0,0.0,0,0.0,506.94336,0,0,0
"EPIC Y-Grade, LP",0.0,0.0,0.0,0,0.0,210.824064,0,0,0
Enbridge Frontier Inc,170.0,0.0,170.0,0,0.0,0.0,0,0,0


In [20]:
owners_km_by_status_df

Unnamed: 0_level_0,Proposed,Construction,In Development (Proposed + Construction),Shelved,Cancelled,Operating,Idle,Mothballed,Retired
Owner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
APA Group,0.0,0.0,0.0,0,0.0,1375.0,0,0,0
BHP Billiton Group,0.0,0.0,0.0,0,0.0,39.0,0,0,0
Baymark Pipeline LLC,0.0,0.0,0.0,0,0.0,146.450304,0,0,0
Chaparral Pipeline Company LLC,0.0,0.0,0.0,0,0.0,1926.13824,0,0,0
Chevron,0.0,0.0,0.0,0,0.0,168.98112,0,0,0
DCP Midstream LLC,0.0,0.0,0.0,0,0.0,1158.72768,0,0,0
EIP Investment,0.0,0.0,0.0,0,0.0,53.875,0,0,0
EPIC Midstream Holdings,0.0,0.0,0.0,0,0.0,506.94336,0,0,0
"EPIC Y-Grade, LP",0.0,0.0,0.0,0,0.0,210.824064,0,0,0
Enbridge Frontier Inc,170.0,0.0,170.0,0,0.0,0.0,0,0,0


### pipeline km by start year, type

In [21]:
pipes_started_eu = pipes_df_touse.copy()
#pipes_started_eu['StartYearLatest'].replace(numpy.nan,'',inplace=True)

if fuel_type == 'Gas':
    pipes_started_eu = pipes_started_eu[(pipes_started_eu['Status'].isin(['Operating'])) &
                              (pipes_started_eu['Fuel']=='Gas')]
if fuel_type == 'Oil':
    pipes_started_eu = pipes_started_eu[(pipes_started_eu['Status'].isin(['Operating'])) &
                              (pipes_started_eu['Fuel']=='Oil')]
if fuel_type == 'NGL':
    pipes_started_eu = pipes_started_eu[(pipes_started_eu['Status'].isin(['Operating'])) &
                              (pipes_started_eu['Fuel']=='NGL')]

pipes_started_eu_sum = pipes_started_eu.groupby('StartYearEarliest')['LengthMergedKm'].sum()

In [22]:
if fuel_type == 'Gas':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2022)), columns=['Gas pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['Gas pipeline km'] = pipes_started_eu_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

if fuel_type == 'Oil':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2022)), columns=['Oil pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['Oil pipeline km'] = pipes_started_eu_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

if fuel_type == 'NGL':
    km_by_start_year = pandas.DataFrame(index=list(range(1980,2022)), columns=['NGL pipeline km'])
    km_by_start_year.index.name = 'Start year'
    km_by_start_year['NGL pipeline km'] = pipes_started_eu_sum
    km_by_start_year.replace(numpy.nan,0,inplace=True)

totals_row = km_by_start_year.sum(axis=0)
totals_row.name = 'Total'
km_by_start_year = km_by_start_year.append(totals_row)

km_by_start_year.to_excel(excel_writer, 'Kilometers by start year')

## save excel file

In [23]:
excel_writer.save()

# calculating stats for landing page

In [24]:
# number of projects tracked in total
print(pipes_df_orig.loc[pipes_df_orig['Fuel']=='Oil'].shape[0], 'oil pipeline projects tracked')
print(pipes_df_orig.loc[pipes_df_orig['Fuel']=='Oil']['LengthMergedKm'].sum(), 'km tracked')

911 oil pipeline projects tracked
403468.61 km tracked


In [25]:
# number of projects tracked in total
print(pipes_df_orig.loc[pipes_df_orig['Fuel']=='NGL'].shape[0], 'NGL pipeline projects tracked')
print(pipes_df_orig.loc[pipes_df_orig['Fuel']=='NGL']['LengthMergedKm'].sum(), 'km tracked')

57 NGL pipeline projects tracked
28848.2 km tracked


In [None]:
pipes_df_oil = pipes_df_orig.loc[pipes_df_orig['Fuel']=='Oil']
pipes_df_oil.replace('',numpy.nan,inplace=True)

pipes_df_ngl = pipes_df_orig.loc[pipes_df_orig['Fuel']=='NGL']
pipes_df_ngl.replace('',numpy.nan,inplace=True)

In [None]:
stats_list = [
    'StartCountry',
    'EndCountry',
    'Fuel',
    'Status',
    'StartRegion',
    'EndRegion',
    'LengthEstimateKm',
    'LengthKnownKm',
    'Owner',
    'Capacity',
    'StartYearEarliest']

for col in stats_list:
    print(col)
    print(pipes_df_oil[col].isnull().sum())
    print(pipes_df_oil[col].isnull().sum()/pipes_df_oil[col].count() * 100.)
    print()

In [None]:
for col in stats_list:
    print(col)
    print(pipes_df_ngl[col].isnull().sum())
    print(pipes_df_ngl[col].isnull().sum()/pipes_df_ngl[col].count() * 100.)
    print()