# need to fill in any routes with BOTH missing (known) length AND route (so that we can have a guesstimate for all)

# Import packages and data

In [1]:
# note that the order of points in GeoPandas is longitude, latitude 
# (opposite order from that of many data sets)

import geopandas
import shapely.geometry
import shapely.ops
import pyproj
import pandas

import time
import numpy

import pygsheets

import EEZ file

In [2]:
# from https://www.marineregions.org/downloads.php
# in the section "Marine and land zones: the union of world country boundaries and EEZ's"
eez_file = '/Users/baird/Dropbox/_gis-data/eez/EEZ_land_union_v2_201410/EEZ_land_v2_201410.shp'

In [3]:
eez_and_land_boundaries_gdf = geopandas.read_file(eez_file)
eez_and_land_boundaries_gdf = eez_and_land_boundaries_gdf.set_index('Country')
#eez_4087 = eez_and_land_boundaries.to_crs('epsg:4087')

### special cases for EEZs (Hong Kong, Macao...)

import natural earth data file to pick out hong kong, macao

In [4]:
nat_earth_file = '/Users/baird/Dropbox/_gis-data/_natural_earth_data/ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp'
nat_earth_gdf = geopandas.read_file(nat_earth_file)

In [5]:
# pull out shapely geometry polygons and multipolygons
china_geom = eez_and_land_boundaries_gdf.loc[eez_and_land_boundaries_gdf.index=='China']['geometry'].values[0]
hk_geom = nat_earth_gdf.loc[nat_earth_gdf.ADMIN=='Hong Kong S.A.R.']['geometry'].values[0]
macao_geom = nat_earth_gdf.loc[nat_earth_gdf.ADMIN=='Macao S.A.R']['geometry'].values[0]

In [6]:
china_new_geom = china_geom - china_geom.intersection(hk_geom)
china_new_geom = china_new_geom - china_new_geom.intersection(macao_geom)

check that the new geometry is smaller in area

now replace the original china in EEZ file, and add Hong Kong, Macao

In [7]:
empty_row_hk = geopandas.GeoDataFrame([[numpy.nan]*eez_and_land_boundaries_gdf.columns.size],
                             columns=eez_and_land_boundaries_gdf.columns, index=['Hong Kong'])
empty_row_hk['geometry'] = hk_geom

empty_row_macao = geopandas.GeoDataFrame([[numpy.nan]*eez_and_land_boundaries_gdf.columns.size],
                             columns=eez_and_land_boundaries_gdf.columns, index=['Macao'])
empty_row_macao['geometry'] = macao_geom

# add geometries to these rows
eez_and_land_boundaries_gdf = eez_and_land_boundaries_gdf.append(empty_row_hk)
eez_and_land_boundaries_gdf = eez_and_land_boundaries_gdf.append(empty_row_macao)
# replace with new version of China
eez_and_land_boundaries_gdf.loc[eez_and_land_boundaries_gdf.index=='China','geometry'] = china_new_geom

now create a blob for all boundaries

In [8]:
# create one blob for all world land and EEZ boundaries, using Shapely function cascaded_union 
# whatever is left out is, presumably, international waters
# this is used below to determine whether any parts of pipelines are in international waters

if 'world_eez_and_land_boundaries_gdf' not in locals(): # only do this if it hasn't been done already
    world_eez_and_land_boundaries_gdf = shapely.ops.cascaded_union(eez_and_land_boundaries_gdf['geometry'])

#check type, should be multipolygon object
type(world_eez_and_land_boundaries_gdf)

  world_eez_and_land_boundaries_gdf = shapely.ops.cascaded_union(eez_and_land_boundaries_gdf['geometry'])


shapely.geometry.multipolygon.MultiPolygon

# Import and clean data

In [9]:
# create a copy of the working/finalized pipelines dataset into the below folder as an Excel file
#path = '/content/drive/Shareddrives/GEM Shared Drive/Projects/Fossil Infrastructure (GFIT)/Pipelines/Pipeline Data Analysis/EEZ_land_union_v2_201410/'
#pipe = pd.read_excel(path + 'pipe_082021.xlsx')

gc = pygsheets.authorize(service_account_env_var='GDRIVE_API_CREDENTIALS')
spreadsheet = gc.open_by_key('1foPLE6K-uqFlaYgLPAUxzeXfDO5wOOqE7tibNHeqTek')

gas_pipes = spreadsheet.worksheet('title', 'Gas pipelines').get_as_df()
oil_pipes = spreadsheet.worksheet('title', 'Oil/NGL pipelines').get_as_df()

#gas_pipes.drop('WKTFormat', axis=1, inplace=True) # delete WKTFormat column
#oil_pipes.drop('WKTFormat', axis=1, inplace=True)

# delete columns that aren't the same in the sheets, to concatenate them...
columns_not_in_oil = list(set(gas_pipes.columns)-set(oil_pipes.columns))
columns_not_in_gas = list(set(oil_pipes.columns)-set(gas_pipes.columns))
gas_pipes.drop(columns=columns_not_in_oil, axis=1, inplace=True)
oil_pipes.drop(columns=columns_not_in_gas, axis=1, inplace=True)

In [10]:
region_df_orig = spreadsheet.worksheet('title', 'Region dictionary').get_as_df()

## replace eez_and_land_boundaries_gdf country names with the ones we use in GFIT, for consistency

In [11]:
rename_eez_df = region_df_orig.copy()
rename_eez_df = rename_eez_df[rename_eez_df['EEZNamesIfDifferent']!='']
rename_eez_dict = dict(zip(rename_eez_df.EEZNamesIfDifferent, rename_eez_df.Country))
eez_and_land_boundaries_gdf.rename(index=rename_eez_dict, inplace=True)

## Specify Oil/NGL or Gas

In [12]:
#type = 'Oil'
#type = 'Gas'
type = 'Oil_and_Gas'

if type=='Oil':
    pipe = oil_pipes
    #pipe.drop(column='CapacityBOEd', inplace=True)
elif type=='Gas':
    pipe = gas_pipes
    #pipe.drop('CapacityBcm/y', inplace=True)
elif type=='Oil_and_Gas':  
    pipe_orig = pandas.concat([oil_pipes, gas_pipes], ignore_index=True)

pipe_orig.replace('--', numpy.nan, inplace=True)

In [13]:
# get pipeline list, import as df
pipe_orig['PipelineName'] = pipe_orig['PipelineName'].str.strip()
pipe_orig['SegmentName'] = pipe_orig['SegmentName'].str.strip()
pipe_orig['ProjectID'] = pipe_orig['ProjectID'].str.strip()

# clean up column 'Route'
pipe_orig['Route'] = pipe_orig['Route'].str.strip()

# get rid of "N/A" and any empty routes (which would be empty rows)
pipe_orig = pipe_orig[pipe_orig['Route']!='N/A']
pipe_orig = pipe_orig[pipe_orig['Route']!='']

missing_route_options = ['Unavailable',
                         'Capacity expansion only',
                         'Bidirectionality upgrade only',
                         'Short route (< 100 km)']

pipes_noroute_df = pipe_orig.copy()[pipe_orig['Route'].isin(missing_route_options)]
pipes_withroute_df = pipe_orig.copy()[~pipe_orig['Route'].isin(missing_route_options)]

# Length Calculation Functions

## convert gfit to linestring

In [14]:
def convert_gfit_to_linestring(coord_str, pipeline_name, segment_name, project_id, status, fuel, length):
    '''
    Takes string from GFIT column of coordinates for a single pipeline,
    converts that string into Shapely LineString or MultiLinestring for processing.
    '''

    #print(pipeline_name, segment_name)
    #print(coord_str)
    if ':' in coord_str and ';' not in coord_str:
        # simple geometry; no branching
        # create nested list of lists, separating on colons        
        coord_list = coord_str.split(':')
        
        coord_list_tuples = []
        
        # non-branched pipeline (nested list with one level)
        # convert nested list of lists to list of tuples
        for element in coord_list:
            element_tuple = (float(element.split(',')[1]), 
                             float(element.split(',')[0]))
            coord_list_tuples.append(element_tuple)
            
        pipeline = shapely.geometry.LineString(coord_list_tuples)

    elif ':' in coord_str and ';' in coord_str:
        # create a nested list of lists, separating on semicolons
        coord_list = coord_str.split(';')
        
        # create a second level of nesting, separating on colons
        coord_list = [x.split(':') for x in coord_list]
        
        # branched pipeline (nested list with two levels)
        pipeline_ls_all = []
        
        for nested_list in coord_list:
            coord_list_tuples = []
            
            for element in nested_list:
                element_tuple = (float(element.split(',')[1]), 
                                 float(element.split(',')[0]))
                coord_list_tuples.append(element_tuple)
                
            # process coord_list_tuples
            try:
                pipeline_ls = shapely.geometry.LineString(coord_list_tuples)
                pipeline_ls_all.append(pipeline_ls)
            except:
                print(f"Exception for coord_list_tuples: {coord_list_tuples}") # for db
                pass
            
        pipeline = shapely.geometry.MultiLineString(pipeline_ls_all)
        
    else:
        # create empty MultiLineString; coordinates were missing or misformatted
        pipeline = shapely.geometry.MultiLineString([])
        
        print(f'Missing or misformatted coordinates for {pipeline_name} - {segment_name}')
        
    return pipeline

## pipeline total length and wiggle

In [15]:
def pipeline_total_length_and_wiggle(pipes_df):
    '''
    Iterate through each pipeline, calculating the total length and wiggle factor.
    
    Modifies the main df that was function argument, returning modified version.
    '''
    
    mask_route_1 = pipes_df['Route'].str.contains(',')
    mask_route_2 = pipes_df['Route'].str.contains(':')
    pipes_with_route = pipes_df.loc[(mask_route_1) & (mask_route_2)]
    
    for row in pipes_with_route.index:
        # get string with coordinates for route, convert to LineString (or MultiLineString)
        pipeline_name = pipes_with_route.at[row, 'PipelineName']
        segment_name = pipes_with_route.at[row, 'SegmentName']
        project_id = pipes_with_route.at[row, 'ProjectID']
        pipeline_str = pipes_with_route.at[row, 'Route']
        status = pipes_with_route.at[row, 'Status']
        fuel = pipes_with_route.at[row, 'Fuel']
        length = pipes_with_route.at[row, 'LengthMergedKm']
        pipeline_ls = convert_gfit_to_linestring(pipeline_str, pipeline_name, segment_name, 
                                                 project_id, status, fuel, length)

        # calculate length of LineString (or MultiLineString)
        geodetic_computation = pyproj.Geod(ellps="WGS84")
        length_calc = geodetic_computation.geometry_length(pipeline_ls)/1000 # units km

        # get reported length of pipeline
        length_report = pipes_with_route.at[row, 'LengthKnownKm']

        #print(pipeline_name, segment_name)
        
    #    if pandas.notnull(length_report):
    #        # calculate wiggle factor regardless of relationship,
    #        # whether length_report is > or < length_calc
    #        # if reported and calculated length both exist, calculate their ratio:
        try:
            wiggle_factor = length_report / length_calc
            pipes_df.at[row, 'WiggleFactor'] = wiggle_factor
        # if one doesn't exist, you get a TypeError when dividing; replace with
        except TypeError:
            print(pipeline_name, segment_name, project_id)
            print(length_report, length_calc)
            print('TypeError, WiggleFactor set to 1.0')
            pipes_df.at[row, 'WiggleFactor'] = float(1)

    #    else:
    #        print('notnull)')
    #        # there was no reported length; assign wiggle_factor = 1.0
    #        pipes_df.at[row, 'WiggleFactor'] = float(1)

    return(pipes_df)

## pipeline within country

In [16]:
eez_and_land_boundaries_gdf[eez_and_land_boundaries_gdf.index=='Hong Kong']

Unnamed: 0,OBJECTID,ISO_3digit,Changes,Shape_Leng,Shape_Area,geometry
Hong Kong,,,,,,"MULTIPOLYGON (((114.22983 22.55581, 114.23471 ..."


In [17]:
def pipeline_within_country(pipeline_ls, 
                            pipeline_name, 
                            segment_name, 
                            project_id, 
                            results_by_country, 
                            status, 
                            fuel, 
                            length, 
                            remainders, 
                            international):
    '''
    Iterate through all countries, to see if the specified pipeline 
    is within each country (at least partially).
    
    If there is a portion within a given country, 
    saves the country name and length of pipeline to a df
    '''
    
    pipeline_remainders = pipeline_ls # initialize
    # will progressively remove pieces of the pipeline, 
    # as they intersect with each country's land mass
    
    geodetic_computation = pyproj.Geod(ellps="WGS84") # initialize
    
    length_total = geodetic_computation.geometry_length(pipeline_ls)/1000 # units km
    
    for country in eez_and_land_boundaries_gdf.index:
        
        country_geom = eez_and_land_boundaries_gdf.loc[country, 'geometry']
        
        if country_geom.intersects(pipeline_ls)==True:
            pipeline_intersection = pipeline_ls.intersection(country_geom)
            pipeline_remainders = pipeline_remainders.difference(country_geom)
            
            length_per_country = geodetic_computation.geometry_length(pipeline_intersection)/1000 # units km
            length_per_country_fract = length_per_country / length_total
            
            one_result = (
                pipeline_name, 
                segment_name,
                project_id,
                country, 
                length_per_country, 
                length_per_country_fract,
                status,
                fuel,
                length)
            one_result_df = pandas.DataFrame(one_result).T
            one_result_df.columns = ['pipeline_name', 'segment_name', 'project_id', 'country', 
                                     'length_per_country', 'length_per_country_fract', 
                                     'status', 'fuel', 'length']
            
            results_by_country = results_by_country.append(
                pandas.DataFrame(one_result_df), 
                sort=False)
            
        else:
            pass

    results_by_country = results_by_country.reset_index(drop=True)
        
    if pipeline_remainders.is_empty==False:       
        remainders_length = geodetic_computation.geometry_length(pipeline_remainders)/1000 # units km
        
        if remainders_length > 0.01: # units: km
            remainders_tuple = (pipeline_name, remainders_length, pipeline_remainders)
            remainders = pandas.DataFrame(remainders_tuple).T
            remainders.columns = ['pipeline_name', 'segment_name', 'project_id', 'length', 'geometry']
        
            print(f"for {pipeline_name}, pipeline_remainders.is_empty==False") # for db
            print(f"remainders_length: {remainders_length}") # for db
        
    else:
#         print(f"for {pipeline_name}, pipeline_remainders.is_empty is NOT False") # for db
        pass
    
    # alternative method: pipeline that's in international waters (not in world_eez_and_land_boundaries_gdf)
    international_pipeline = pipeline_ls.difference(world_eez_and_land_boundaries_gdf)
    if international_pipeline.is_empty==False:
        international_length = geodetic_computation.geometry_length(international_pipeline)/1000 # units km
        international_tuple = (pipeline_name, international_length, international_pipeline)
        international = pandas.DataFrame(international_tuple).T
        international.columns = ['pipeline_name', 'length', 'geometry']
    else:
        pass
    
    return results_by_country, remainders, international

# Apply functions to data

In [18]:
#this step requires that there be no non-geometry data in the "Route" column, meaning no letters or extraneous symbols (ex: ";;", "::", "--", etc.)
pipe_gpd = geopandas.GeoDataFrame(pipes_withroute_df)
pipe_gpd['geometry'] = ''
for row in pipe_gpd.index:
    linestring = convert_gfit_to_linestring(
        str(pipe_gpd.at[row, 'Route']), 
        pipe_gpd.at[row, 'PipelineName'], 
        pipe_gpd.at[row, 'SegmentName'], 
        pipe_gpd.at[row, 'ProjectID'], 
        pipe_gpd.at[row, 'Status'], 
        pipe_gpd.at[row, 'Fuel'], 
        pipe_gpd.at[row, 'LengthMergedKm'])
    pipe_gpd.at[row, 'geometry'] = linestring



Exception for coord_list_tuples: [(119.608599, 39.935148)]


In [19]:
# calculate total length and wiggle factor
pipes_withroute_df = pipeline_total_length_and_wiggle(pipes_withroute_df)

Exception for coord_list_tuples: [(119.608599, 39.935148)]


In [20]:
# calculate length by country
# get coord_str for each pipeline that has route coordinates
# choose pipes_withroute_df to process
#pipeline_df = pipe.copy()

results_by_country = pandas.DataFrame(
    columns=['pipeline_name', 'segment_name', 'project_id', 'country', 'length_per_country', 'status', 'fuel', 'length']
)

remainders = geopandas.GeoDataFrame()
international = geopandas.GeoDataFrame()

for sel_index in pipes_withroute_df.index:
    pipeline_name = pipes_withroute_df.at[sel_index, 'PipelineName']
    segment_name = pipes_withroute_df.at[sel_index, 'SegmentName']
    project_id = pipes_withroute_df.at[sel_index, 'ProjectID']
    
    #print(pipeline_name, segment_name, project_id)
    
    status = pipes_withroute_df.at[sel_index, 'Status']
    fuel = pipes_withroute_df.at[sel_index, 'Fuel']
    length = pipes_withroute_df.at[sel_index, 'LengthMergedKm']
    coord_str = str(pipes_withroute_df.at[sel_index, 'Route'])
    
    pipeline_ls = convert_gfit_to_linestring(coord_str, 
                                             pipeline_name, 
                                             segment_name, 
                                             project_id, 
                                             status, 
                                             fuel, 
                                             length)
    
    results_by_country, remainders, international = pipeline_within_country(
        pipeline_ls, 
        pipeline_name, 
        segment_name, 
        project_id, 
        results_by_country, 
        status, 
        fuel, 
        length, 
        remainders, 
        international)

Exception for coord_list_tuples: [(119.608599, 39.935148)]


# Now go through all pipelines that DON'T have a route and fill in missing length info

In [21]:
for sel_index in pipes_noroute_df.index:
    
    # skip if the length is a nan value
    if pandas.isnull(pipes_noroute_df.at[sel_index, 'LengthMergedKm']):
        pass
        #continue
    
    # otherwise, get information
    else:
        pipeline_name = pipes_noroute_df.at[sel_index, 'PipelineName']
        segment_name = pipes_noroute_df.at[sel_index, 'SegmentName']
        project_id = pipes_noroute_df.at[sel_index, 'ProjectID']
        status = pipes_noroute_df.at[sel_index, 'Status']
        fuel = pipes_noroute_df.at[sel_index, 'Fuel']
        length = pipes_noroute_df.at[sel_index, 'LengthMergedKm']
        coord_str = str(pipes_noroute_df.at[sel_index, 'Route'])
    
        #print(length)
        #then ask how many countries it passes through
        ncountries = pipes_noroute_df.at[sel_index, 'NumberOfCountries']
        
        if ncountries>1:
            #print('more than 1 country')
            #print(length)
            
            country_list = pipes_noroute_df.at[sel_index, 'Countries'].split(',')
            country_list = [i.strip() for i in country_list]
            
            for country in country_list: 
                length_per_country = length/country_list.__len__()
                length_per_country_fract = 1/country_list.__len__()

                one_result = (
                    pipeline_name, 
                    segment_name,
                    project_id,
                    country, 
                    length_per_country, 
                    length_per_country_fract,
                    status,
                    fuel,
                    length)
                one_result_df = pandas.DataFrame(one_result).T
                one_result_df.columns = ['pipeline_name', 'segment_name', 'project_id', 'country', 
                                         'length_per_country', 'length_per_country_fract', 
                                         'status', 'fuel', 'length']

                results_by_country = results_by_country.append(
                    pandas.DataFrame(one_result_df), 
                    sort=False)

        else:
            country = pipes_noroute_df.at[sel_index, 'Countries']
            
            length_per_country = length
            length_per_country_fract = 1.0

            one_result = (
                pipeline_name, 
                segment_name,
                project_id,
                country, 
                length_per_country, 
                length_per_country_fract,
                status,
                fuel,
                length)
            one_result_df = pandas.DataFrame(one_result).T
            one_result_df.columns = ['pipeline_name', 'segment_name', 'project_id', 'country', 
                                     'length_per_country', 'length_per_country_fract', 
                                     'status', 'fuel', 'length']

            results_by_country = results_by_country.append(
                pandas.DataFrame(one_result_df), 
                sort=False)

In [22]:
results_by_country

Unnamed: 0,pipeline_name,segment_name,project_id,country,length_per_country,status,fuel,length,length_per_country_fract
0,Alberta Clipper Oil Pipeline,,P0001,Canada,1099.524365,Operating,Oil,1790.0,0.703393
1,Alberta Clipper Oil Pipeline,,P0001,United States,463.648701,Operating,Oil,1790.0,0.296608
2,Athabasca Oil Pipeline,,P0002,Canada,418.095904,Operating,Oil,542.35,1.0
3,Bakken Expansion Pipeline,,P0004,Canada,155.333654,Operating,Oil,260.71,0.592423
4,Bakken Expansion Pipeline,,P0004,United States,106.866868,Operating,Oil,260.71,0.407577
...,...,...,...,...,...,...,...,...,...
0,Bulgaria-Serbia Interconnector Gas Pipeline,Capacity expansion,P3725,Serbia,0.0,Construction,Gas,0.0,0.5
0,Wilhelmshaven LNG Terminal Pipeline,,P3853,Germany,30.0,Proposed,Gas,30.0,1.0
0,Batangas LNG Terminal Pipeline,,P3854,Philippines,1.3,Construction,Gas,1.3,1.0
0,Mozambique LNG Gas Pipeline,,P3855,Mozambique,45.0,Cancelled,Gas,45.0,1.0


# Check Mean Wiggle Factor, & Outliers

In [23]:
calc_length = pipes_withroute_df['LengthKnownKm'] / pipes_withroute_df['WiggleFactor']
wiggle_factor_weighted_mean = pipes_withroute_df['LengthKnownKm'].sum()/calc_length.sum()

In [24]:
pipes_withroute_df.loc[pipes_withroute_df['WiggleFactor']<0.95][['PipelineName', 'SegmentName', 'ProjectID', 'WiggleFactor', 'LengthKnownKm']]

Unnamed: 0,PipelineName,SegmentName,ProjectID,WiggleFactor,LengthKnownKm
8,Enbridge Line 1 Oil Pipeline,,P0010,0.929848,1654.41
15,Enbridge Line 7 Oil Pipeline,,P0017,0.775033,193.0
22,Keystone Oil Pipeline,,P0024,0.830736,3461.7
24,Lloydminster-Hardisty Oil Pipeline,,P0026,0.714422,80.0
30,Trans Mountain Oil Pipeline,Expansion,P0033,0.893043,980.0
...,...,...,...,...,...
3365,North Bakken Expansion Pipeline,Elkhorn Creek–Northern Border Pipeline,P3588,0.613073,0.48
3406,Moomba Sydney Pipeline System,Young to Lithgow and Bathurst Pipeline,P3635,0.926658,245.0
3416,Berri to Mildura Pipeline,,P3645,0.926649,148.0
3420,Eastern Goldfields Pipeline System,Murrin Murrin Lateral,P3650,0.694013,85.0


# Clean and export results

In [25]:
# export length estimates by country and pipeline
results_by_country.rename(columns={'length_per_country':'LengthEstimateKm',
                                    'pipeline_name':'PipelineName',
                                    'segment_name':'SegmentName',
                                    'project_id':'ProjectID',
                                    'length_per_country':'LengthEstimateKmByCountry',
                                    'country':'Country',
                                    'status':'Status',
                                    'fuel':'Fuel',
                                    'length':'LengthEstimateKm',
                                    'length_per_country_fract':'LengthPerCountryFraction'}, inplace=True)

results_by_country.sort_values('ProjectID', inplace=True)
# results_by_country['Country'].replace('United States', 'USA', inplace=True)
# results_by_country['Country'].replace('Czech Republic', 'Czechia', inplace=True)
# results_by_country['Country'].replace('Swaziland', 'Eswatini', inplace=True)
# results_by_country['Country'].replace('Congo', 'Republic of Congo', inplace=True)
# results_by_country['Country'].replace('Congo, DRC', 'DR Congo', inplace=True)
# results_by_country['Country'].replace('Sudan', 'Republic of Sudan', inplace=True)
# results_by_country['Country'].replace('Swaziland', 'Eswatini', inplace=True)
# results_by_country['Country'].replace('Guinea', 'Republic of Guinea', inplace=True)
# results_by_country['Country'].replace('Bosnia & Herzegovina', 'Bosnia and Herzegovina', inplace=True)
# results_by_country['Country'].replace('Trinidad & Tobago', 'Trinidad and Tobago', inplace=True)

results_by_country.to_excel('Estimated_Length-Results_By_Country_'+type+'.xlsx')

In [26]:
results_by_pipeline = results_by_country.copy()[['ProjectID','LengthEstimateKmByCountry']]

#results_by_pipeline = pandas.DataFrame(results_by_pipeline.groupby(by=['ProjectID'])['LengthEstimateKm'].sum())
results_by_pipeline=results_by_country.groupby("ProjectID")["LengthEstimateKmByCountry"].sum()

results_by_pipeline.to_excel('Estimated_Length-Results_By_Pipeline_'+type+'.xlsx')

# troubleshooting length mismatches

the full sum of km in the dataset doesn't equal the output from finding country ratios of pipelines

### make sure the EEZ countries are synced up with our countries

In [None]:
eez_countries = eez_and_land_boundaries_gdf.index.tolist()

In [None]:
ggit_countries = region_df_orig['Country'].tolist()

### check which ProjectIDs are missing, if any

In [None]:
new_pids = list(set(results_by_country[results_by_country['Fuel']=='Gas']['ProjectID']))

In [None]:
old_pids = list(set(pipe_orig[pipe_orig['Fuel']=='Gas']['ProjectID']))

In [None]:
these_pipes = list(set(old_pids)-set(new_pids))

In [None]:
pipe_orig[pipe_orig['ProjectID'].isin(these_pipes)]['LengthMergedKm'].sum()

## check which ProjectIDs are missing

In [None]:
pipe_noroute = pipe[pipe['Route'].isin(['Unavailable',
                                       'Capacity expansion only',
                                       'Bidirectionality upgrade only',
                                       'Short route (< 100 km)'])]

In [None]:
pipe_remove_capexp_only = pipe[~pipe['Route'].isin(['Capacity expansion only',
                                       'Bidirectionality upgrade only'])]

### count the number of pipelines in the database that don't have a route BUT we have length info on; also identify how many are missing

In [None]:
pipe_remove_capexp_only