# Import packages and data

In [1]:
# note that the order of points in GeoPandas is longitude, latitude 
# (opposite order from that of many data sets)
import geopandas
import shapely.geometry #import Point, LineString, MultiLineString, Polygon
import shapely.ops #import linemerge, cascaded_union, nearest_points
import pyproj
import pandas
import time
import numpy

import pygsheets

from colab

from colab

from colab

import EEZ file

In [2]:
# from https://www.marineregions.org/downloads.php
# in the section "Marine and land zones: the union of world country boundaries and EEZ's"
eez_file = '../data/EEZ_land_union_v2_201410/EEZ_land_v2_201410.shp'

In [3]:
eez_and_land_boundaries = geopandas.read_file(eez_file)
eez_and_land_boundaries = eez_and_land_boundaries.set_index('Country')
#eez_4087 = eez_and_land_boundaries.to_crs('epsg:4087')

In [4]:
# create one blob for all world land and EEZ boundaries, using Shapely function cascaded_union 
# whatever is left out is, presumably, international waters
# this is used below to determine whether any parts of pipelines are in international waters
world_eez_and_land_boundaries = shapely.ops.cascaded_union(eez_and_land_boundaries['geometry'])

#check type, should be multipolygon object
type(world_eez_and_land_boundaries)

shapely.geometry.multipolygon.MultiPolygon

# Import and clean data

In [5]:
# create a copy of the working/finalized pipelines dataset into the below folder as an Excel file
#path = '/content/drive/Shareddrives/GEM Shared Drive/Projects/Fossil Infrastructure (GFIT)/Pipelines/Pipeline Data Analysis/EEZ_land_union_v2_201410/'
#pipe = pd.read_excel(path + 'pipe_082021.xlsx')

credentials_directory = '/Users/baird/Dropbox/_google-api/'
gc = pygsheets.authorize(client_secret=credentials_directory+'client_secret.json')
spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek')
#spreadsheet[1] "Gas Pipelines" tab is the second index
gas_pipes = spreadsheet[1].get_as_df()
oil_pipes = spreadsheet[3].get_as_df()
#owners = spreadsheet[2].get_as_df()

gas_pipes = gas_pipes.drop('WKTFormat', axis=1) # delete WKTFormat column
oil_pipes = oil_pipes.drop('WKTFormat', axis=1)

#pipe = spreadsheet[1].get_as_df()
#pipe = pipe.drop('WKTFormat', axis=1)

## Specify Oil/NGL or Gas

In [6]:
#type = 'Oil'
#type = 'Gas'
type = 'Oil_and_Gas'

if type=='Oil':
    pipe = oil_pipes
elif type=='Gas':
    pipe = gas_pipes
elif type=='Oil_and_Gas':
    pipe = pandas.concat([oil_pipes, gas_pipes], ignore_index=True)

In [7]:
#clean pipelines dataset
pipe['PipelineName'] = pipe['PipelineName']

# get pipeline list, import as df
pipe['PipelineName'] = pipe['PipelineName'].str.strip()
pipe['SegmentName'] = pipe['SegmentName'].str.strip()
pipe['ProjectID'] = pipe['ProjectID'].str.strip()

# clean up column 'Fuel'
pipe['Fuel'] = pipe['Fuel'].str.strip()

# clean up column 'Route'
pipe['Route'] = pipe['Route'].str.strip()

# replace empty entries in Route with nan
pipe['Route'] = pipe['Route'].replace(r'^\s*$', numpy.nan, regex=True)

# replace empty entrys in LengthKnownKm with nan
pipe['LengthKnownKm'] = pipe['LengthKnownKm'].replace(r'^\s*$', numpy.nan, regex=True)

# drop rows with no routes available (routes were unavailable or not applicable)
#pipe = pipe.loc[pipe['Route'].isnull()==False]
#pipe = pipe[pipe['Route'].notna()]
pipe = pipe[pipe['Route'] != 'Unavailable']
pipe = pipe[pipe['Route'] != 'Capacity expansion only']
pipe = pipe[pipe['Route'] != 'Bidirectionality upgrade only']
pipe.shape

# REMOVED

# drop empty columns that come out as 'Unnamed'
#for col in pipe.columns:
#    if 'Unnamed: ' in col:
#        print(col)
#        pipe = pipe.drop(col, axis=1)
        
# replace blank 'Status' values by assuming 'Operating' status
#for col in ['Status']:
#    pipe[col] = pipe[col].fillna('Operating')
    
# drop rows with no name of pipeline (empty rows)
#pipe = pipe.loc[pipe['PipelineName'].isnull()==False]

(2940, 55)

In [8]:
pipe.columns

Index(['PipelineName', 'SegmentName', 'Wiki', 'Fuel', 'Countries',
       'Researcher', 'LastUpdated', 'ProjectID', 'OtherNames', 'Status',
       'Owner', 'Cost', 'CostUnits', 'ProposalYear', 'ConstructionYear',
       'StartYear1', 'StartYear2', 'StartYear3', 'StopYear', 'Capacity',
       'CapacityUnits', 'CapacityBOEd', 'LengthKnown', 'LengthKnownUnits',
       'LengthKnownKm', 'LengthEstimateKm', 'LengthMergedKm', 'Diameter',
       'DiameterUnits', 'Source', 'StartLocation', 'StartPrefecture/District',
       'StartState/Province', 'StartCountry', 'StartRegion', 'EndLocation',
       'EndPrefecture/District', 'EndState/Province', 'EndCountry',
       'EndRegion', 'NumberOfCountries', 'Route', 'RouteMapURL', 'Opposition',
       'FID', 'FIDYear', 'WriteDown', 'EuropeTracker', 'PCI',
       'OtherLanguagePipelineName', 'OtherLanguageSegmentName',
       'OtherLanguageWikiPage', 'ResearcherNotes', 'TotalNetworkLength',
       'TotalNetworkLengthUnits'],
      dtype='object')

# Length Calculation Functions

## convert gfit to linestring

In [9]:
def convert_gfit_to_linestring(coord_str, pipeline_name, segment_name, project_id, status, fuel, capacity, length):
    '''
    Takes string from GFIT column of coordinates for a single pipeline,
    converts that string into Shapely LineString or MultiLinestring for processing.
    '''

    #print(coord_str)
    if ':' in coord_str and ';' not in coord_str:
        # simple geometry; no branching
        # create nested list of lists, separating on colons        
        coord_list = coord_str.split(':')
        
        coord_list_tuples = []
        
        # non-branched pipeline (nested list with one level)
        # convert nested list of lists to list of tuples
        for element in coord_list:
            element_tuple = (float(element.split(',')[1]), 
                             float(element.split(',')[0]))
            coord_list_tuples.append(element_tuple)
            
        pipeline = shapely.geometry.LineString(coord_list_tuples)

    elif ':' in coord_str and ';' in coord_str:
        # create a nested list of lists, separating on semicolons
        coord_list = coord_str.split(';')
        
        # create a second level of nesting, separating on colons
        coord_list = [x.split(':') for x in coord_list]
        
        # branched pipeline (nested list with two levels)
        pipeline_ls_all = []
        
        for nested_list in coord_list:
            coord_list_tuples = []
            
            for element in nested_list:
                element_tuple = (float(element.split(',')[1]), 
                                 float(element.split(',')[0]))
                coord_list_tuples.append(element_tuple)
                
            # process coord_list_tuples
            try:
                pipeline_ls = shapely.geometry.LineString(coord_list_tuples)
                pipeline_ls_all.append(pipeline_ls)
            except:
                print(f"Exception for coord_list_tuples: {coord_list_tuples}") # for db
                pass
            
        pipeline = shapely.geometry.MultiLineString(pipeline_ls_all)
        
    else:
        # create empty MultiLineString; coordinates were missing or misformatted
        pipeline = shapely.geometry.MultiLineString([])
        
        print(f'Missing or misformatted coordinates for {pipeline_name} - {segment_name}')
        
    return pipeline

## pipeline total length and wiggle

In [10]:
def pipeline_total_length_and_wiggle(pipes_df):
    '''
    Iterate through each pipeline, calculating the total length and wiggle factor.
    
    Modifies the main df that was function argument, returning modified version.
    '''
    
    mask_route_1 = pipes_df['Route'].str.contains(',')
    mask_route_2 = pipes_df['Route'].str.contains(':')
    pipes_with_route = pipes_df.loc[(mask_route_1) & (mask_route_2)]
    
    for row in pipes_with_route.index:
        # get string with coordinates for route, convert to LineString (or MultiLineString)
        pipeline_name = pipes_with_route.at[row, 'PipelineName']
        segment_name = pipes_with_route.at[row, 'ProjectID']
        project_id = pipes_with_route.at[row, 'SegmentName']
        pipeline_str = pipes_with_route.at[row, 'Route']
        status = pipes_with_route.at[row, 'Status']
        fuel = pipes_with_route.at[row, 'Fuel']
        capacity = pipes_with_route.at[row, 'CapacityBOEd'] 
        length = pipes_with_route.at[row, 'LengthMergedKm']
        pipeline_ls = convert_gfit_to_linestring(pipeline_str, pipeline_name, segment_name, project_id, status, fuel, capacity, length)

        # calculate length of LineString (or MultiLineString)
        geodetic_computation = pyproj.Geod(ellps="WGS84")
        length_calc = geodetic_computation.geometry_length(pipeline_ls)/1000 # units km

        # get reported length of pipeline
        length_report = pipes_with_route.at[row, 'LengthKnownKm']

    #    if pandas.notnull(length_report):
    #        # calculate wiggle factor regardless of relationship,
    #        # whether length_report is > or < length_calc
    #        # if reported and calculated length both exist, calculate their ratio:
        try:
            wiggle_factor = length_report / length_calc
            pipes_df.at[row, 'WiggleFactor'] = wiggle_factor
        # if one doesn't exist, you get a TypeError when dividing; replace with
        except TypeError:
            print('TypeError, WiggleFactor set to 1.0')
            pipes_df.at[row, 'WiggleFactor'] = float(1)

    #    else:
    #        print('notnull)')
    #        # there was no reported length; assign wiggle_factor = 1.0
    #        pipes_df.at[row, 'WiggleFactor'] = float(1)

    return(pipes_df)

## pipeline within country

In [11]:
def pipeline_within_country(pipeline_ls, 
                            pipeline_name, 
                            segment_name, 
                            project_id, 
                            results_by_country, 
                            status, 
                            fuel, 
                            capacity, 
                            length, 
                            remainders, 
                            international):
    '''
    Iterate through all countries, to see if the specified pipeline 
    is within each country (at least partially).
    
    If there is a portion within a given country, 
    saves the country name and length of pipeline to a df
    '''
    
    pipeline_remainders = pipeline_ls # initialize
    # will progressively remove pieces of the pipeline, 
    # as they intersect with each country's land mass
    
    geodetic_computation = pyproj.Geod(ellps="WGS84") # initialize
    
    length_total = geodetic_computation.geometry_length(pipeline_ls)/1000 # units km
    
    for country in eez_and_land_boundaries.index:
        country_geom = eez_and_land_boundaries.loc[country, 'geometry']
        
        if country_geom.intersects(pipeline_ls)==True:
            pipeline_intersection = pipeline_ls.intersection(country_geom)
            pipeline_remainders = pipeline_remainders.difference(country_geom)
            
            length_per_country = geodetic_computation.geometry_length(pipeline_intersection)/1000 # units km
            length_per_country_fract = length_per_country / length_total
            
            one_result = (
                pipeline_name, 
                segment_name,
                project_id,
                country, 
                length_per_country, 
                length_per_country_fract,
                status,
                fuel,
                capacity,
                length)
            one_result_df = pandas.DataFrame(one_result).T
            one_result_df.columns = ['pipeline_name', 'segment_name', 'project_id', 'country', 
                                     'length_per_country', 'length_per_country_fract', 
                                     'status', 'fuel', 'capacity', 'length']
            
            results_by_country = results_by_country.append(
                pandas.DataFrame(one_result_df), 
                sort=False)
            
        else:
            pass

    results_by_country = results_by_country.reset_index(drop=True)
        
    if pipeline_remainders.is_empty==False:       
        remainders_length = geodetic_computation.geometry_length(pipeline_remainders)/1000 # units km
        
        if remainders_length > 0.01: # units: km
            remainders_tuple = (pipeline_name, remainders_length, pipeline_remainders)
            remainders = pandas.DataFrame(remainders_tuple).T
            remainders.columns = ['pipeline_name', 'segment_name', 'project_id', 'length', 'geometry']
        
            print(f"for {pipeline_name}, pipeline_remainders.is_empty==False") # for db
            print(f"remainders_length: {remainders_length}") # for db
        
    else:
#         print(f"for {pipeline_name}, pipeline_remainders.is_empty is NOT False") # for db
        pass
    
    # alternative method: pipeline that's in international waters (not in world_eez_and_land_boundaries)
    international_pipeline = pipeline_ls.difference(world_eez_and_land_boundaries)
    if international_pipeline.is_empty==False:
        international_length = geodetic_computation.geometry_length(international_pipeline)/1000 # units km
        international_tuple = (pipeline_name, international_length, international_pipeline)
        international = pandas.DataFrame(international_tuple).T
        international.columns = ['pipeline_name', 'length', 'geometry']
    else:
        pass
    
    return results_by_country, remainders, international

# Apply functions to data

In [12]:
#this step requires that there be no non-geometry data in the "Route" column, meaning no letters or extraneous symbols (ex: ";;", "::", "--", etc.)
pipe_gpd = geopandas.GeoDataFrame(pipe)
pipe_gpd['geometry'] = ''
for row in pipe_gpd.index:
    linestring = convert_gfit_to_linestring(
        str(pipe_gpd.at[row, 'Route']), 
        pipe_gpd.at[row, 'PipelineName'], 
        pipe_gpd.at[row, 'SegmentName'], 
        pipe_gpd.at[row, 'ProjectID'], 
        pipe_gpd.at[row, 'Status'], 
        pipe_gpd.at[row, 'Fuel'], 
        pipe_gpd.at[row, 'CapacityBOEd'], 
        pipe_gpd.at[row, 'LengthMergedKm'])
    pipe_gpd.at[row, 'geometry'] = linestring



Missing or misformatted coordinates for Midland-to-ECHO Pipeline System - Pipeline 3
Missing or misformatted coordinates for Midland-to-ECHO Pipeline System - Pipeline 4
Missing or misformatted coordinates for Nighthawk Lateral Oil Pipeline - 
Missing or misformatted coordinates for Pony Express Oil Pipeline - Pipeline Expansion Project
Missing or misformatted coordinates for Sea Port Oil Terminal (SPOT) Pipeline - 
Missing or misformatted coordinates for Seahawk Oil Pipeline - 
Missing or misformatted coordinates for Vito Oil Pipeline - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misf

In [75]:
# calculate total length and wiggle factor
pipe = pipeline_total_length_and_wiggle(pipe)

In [94]:
# calculate length by country

# get coord_str for each pipeline that has route coordinates

# choose pipeline_df to process
pipeline_df = pipe.copy()

# process the pipeline_df above
pipelines_with_routes = pipeline_df.loc[
    ~(pipeline_df['Route'].isin(['Unavailable'])) | 
    ~(pipeline_df['Route'].isnull())]

results_by_country = pandas.DataFrame(
    columns=['pipeline_name', 'segment_name', 'project_id', 'country', 'length_per_country', 'status', 'fuel', 'capacity', 'length']
)

remainders = geopandas.GeoDataFrame()
international = geopandas.GeoDataFrame()

for sel_index in pipelines_with_routes.index:
    pipeline_name = pipeline_df.at[sel_index, 'PipelineName']
    segment_name = pipeline_df.at[sel_index, 'SegmentName']
    project_id = pipeline_df.at[sel_index, 'ProjectID']

    status = pipeline_df.at[sel_index, 'Status']

    fuel = pipeline_df.at[sel_index, 'Fuel']

    capacity = pipeline_df.at[sel_index, 'CapacityBOEd']

    length = pipeline_df.at[sel_index, 'LengthMergedKm']
    
    coord_str = str(pipeline_df.at[sel_index, 'Route'])
    
    pipeline_ls = convert_gfit_to_linestring(coord_str, 
                                             pipeline_name, 
                                             segment_name, 
                                             project_id, 
                                             status, 
                                             fuel, 
                                             capacity, 
                                             length)
    
    results_by_country, remainders, international = pipeline_within_country(
        pipeline_ls, 
        pipeline_name, 
        segment_name, 
        project_id, 
        results_by_country, 
        status, 
        fuel, 
        capacity, 
        length, 
        remainders, 
        international)

Missing or misformatted coordinates for Midland-to-ECHO Pipeline System - Pipeline 3
Missing or misformatted coordinates for Midland-to-ECHO Pipeline System - Pipeline 4
Missing or misformatted coordinates for Nighthawk Lateral Oil Pipeline - 
Missing or misformatted coordinates for Pony Express Oil Pipeline - Pipeline Expansion Project
Missing or misformatted coordinates for Sea Port Oil Terminal (SPOT) Pipeline - 
Missing or misformatted coordinates for Seahawk Oil Pipeline - 
Missing or misformatted coordinates for Vito Oil Pipeline - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misformatted coordinates for  - 
Missing or misf

Missing or misformatted coordinates for Hebei–Nanjing Connecting Gas Pipeline Xuzhou Branch - Pizhou–Xuzhou Branch
Missing or misformatted coordinates for Sichuan–Shanghai Gas Pipeline - Baoying–Yancheng–Dafeng Port Branch 管线一期 (丹徒-江阴)
Missing or misformatted coordinates for Beijing–Shijiazhuang–Handan Parallel Gas Pipeline - 
Missing or misformatted coordinates for Beijing–Shijiazhuang–Handan Gas Pipeline - 
Missing or misformatted coordinates for Puyang–Fan County–Taiqian Gas Pipeline - 
Missing or misformatted coordinates for Yongqing–Baoding Parallel Gas Pipeline - Yongqing–Baoding Segment
Missing or misformatted coordinates for Yongqing–Baoding Parallel Gas Pipeline - Baoding North Segment
Missing or misformatted coordinates for Zhongxian-Wuhan Pipeline - Qianjiang–Xiangtan Branch
Missing or misformatted coordinates for Hunan Gas Pipeline Network - Changsha–Changde Gas Pipeline
Missing or misformatted coordinates for Hunan Gas Pipeline Network - Xiangtan–Hengyang Gas Pipeline
Miss

In [95]:
#print(results_by_country.head)
#print(results_by_country.shape)

# Check Mean Wiggle Factor, & Outliers

In [96]:
calc_length = pipe['LengthKnownKm'] / pipe['WiggleFactor']
wiggle_factor_weighted_mean = pipe['LengthKnownKm'].sum()/calc_length.sum()

In [97]:
pipe.loc[pipe['WiggleFactor']<0.95][['PipelineName', 'SegmentName', 'ProjectID', 'WiggleFactor', 'LengthKnownKm']]

Unnamed: 0,PipelineName,SegmentName,ProjectID,WiggleFactor,LengthKnownKm
3,Abqaiq plants-Qatif junction 3 Oil Pipeline,,P1974,0.773002,55.0
4,Abqaiq plants-Qatif junction 4 Oil Pipeline,,P1975,0.773002,55.0
5,Abqaiq plants-Qatif junction 5 Oil Pipeline,,P1976,0.773002,55.0
14,Afghanistan Oil Pipeline,,P0829,0.787438,1600.0
22,Alberta Clipper Oil Pipeline,Expansion,P2094,0.003071,4.8
34,Angola-Zambia Oil Pipeline,,P1838,0.67103,1400.0
38,Apsara Oil Pipeline,,P0887,0.073379,6.5
40,Arbuckle II Y-Grade Pipeline,,P2690,0.300688,193.0
52,Bab-Habshan–Jebel Dhana Oil Pipeline,,P0639,0.702446,79.0
69,Baltic Pipeline System 1,,P0651,0.935898,2157.0


In [98]:
pipe.columns

Index(['PipelineName', 'SegmentName', 'Wiki', 'Fuel', 'Countries',
       'Researcher', 'LastUpdated', 'ProjectID', 'OtherNames', 'Status',
       'Owner', 'Cost', 'CostUnits', 'ProposalYear', 'ConstructionYear',
       'StartYear1', 'StartYear2', 'StartYear3', 'StopYear', 'Capacity',
       'CapacityUnits', 'CapacityBOEd', 'LengthKnown', 'LengthKnownUnits',
       'LengthKnownKm', 'LengthEstimateKm', 'LengthMergedKm', 'Diameter',
       'DiameterUnits', 'Source', 'StartLocation', 'StartPrefecture/District',
       'StartState/Province', 'StartCountry', 'StartRegion', 'EndLocation',
       'EndPrefecture/District', 'EndState/Province', 'EndCountry',
       'EndRegion', 'NumberOfCountries', 'Route', 'RouteMapURL', 'Opposition',
       'FID', 'FIDYear', 'WriteDown', 'EuropeTracker', 'PCI',
       'OtherLanguagePipelineName', 'OtherLanguageSegmentName',
       'OtherLanguageWikiPage', 'ResearcherNotes', 'TotalNetworkLength',
       'TotalNetworkLengthUnits', 'geometry', 'WiggleFactor'],
 

# Clean and export results

In [99]:
results_by_country.head()

Unnamed: 0,pipeline_name,segment_name,project_id,country,length_per_country,status,fuel,capacity,length,length_per_country_fract
0,A'ershan–Saihan Tala Oil Pipeline,,P1504,China,308.650397,Operating,Oil,24575.0,360.0,1.0
1,Abadan-Ahvaz-Arak-Tehran Pipeline,,P2220,Iran,673.641518,Operating,Oil,300000.0,650.0,1.0
2,Abqaiq plants-Qatif junction 2 Oil Pipeline,,P1973,Saudi Arabia,71.151191,Operating,Oil,,71.0,1.0
3,Abqaiq plants-Qatif junction 3 Oil Pipeline,,P1974,Saudi Arabia,71.151191,Operating,Oil,,55.0,1.0
4,Abqaiq plants-Qatif junction 4 Oil Pipeline,,P1975,Saudi Arabia,71.151191,Operating,Oil,,55.0,1.0


In [100]:
# export length estimates by country and pipeline
results_by_country.rename(columns={'length_per_country':'LengthEstimateKm',
                                    'pipeline_name':'PipelineName',
                                    'segment_name':'SegmentName',
                                    'project_id':'ProjectID',
                                    'length_per_country':'LengthEstimateKm by Country',
                                    'country':'Country',
                                    'status':'Status',
                                    'fuel':'Fuel',
                                    'capacity':'Capacity',
                                    'length':'LengthEstimateKm',
                                    'length_per_country_fract':'LengthPerCountryFraction'}, inplace=True)

results_by_country['Country'].replace('United States', 'USA', inplace=True)
results_by_country['Country'].replace('Czech Republic', 'Czechia', inplace=True)

results_by_country.to_excel('Estimated_Length-Results_By_Country_'+type+'.xlsx')

In [101]:
results_by_pipeline = results_by_country.copy()[['ProjectID','LengthEstimateKm by Country']]

#results_by_pipeline = pandas.DataFrame(results_by_pipeline.groupby(by=['ProjectID'])['LengthEstimateKm'].sum())
results_by_pipeline=results_by_country.groupby("ProjectID")["LengthEstimateKm by Country"].sum()

In [102]:
results_by_pipeline.to_excel('Estimated_Length-Results_By_Pipeline_'+type+'.xlsx')