## Pipelines QC code

In [1]:
import pandas as pd
import geopandas as gpd
from geopandas.tools import overlay

from shapely.geometry import Point, LineString, MultiLineString, Polygon
from shapely.ops import cascaded_union
from shapely import wkt

In [3]:
#gem_path = '/Users/masoninman/Dropbox/GEM/'
#gem_path = '/Users/baird/Dropbox/'

pipelines_path = gem_path + 'GFIT (Global Fossil Infrastructure Tracker)/GFIT Pipelines Current - versions saved/'
# pipelines_file = 'GFIT Pipelines_Current (dl 2021-07-22_1452).xlsx'
# pipelines_file = 'GFIT Pipelines_Current (dl 2021-08-09_1550).xlsx'
# pipelines_file = 'GFIT Pipelines_Current (dl 2021-08-12_1709).xlsx'
# pipelines_file = 'GFIT Pipelines_Current (dl 2021-09-08_1118).xlsx'
# pipelines_file = 'GFIT Pipelines_Current (dl 2021-09-17_0513).xlsx'
pipelines_file = 'GFIT Pipelines_Current (dl 2021-09-20_1136).xlsx'

# eez_path = 'EEZ_land_union_v2_201410/'
# eez_file = 'EEZ_land_v2_201410.shp'

# from https://www.marineregions.org/downloads.php
# In the section "Marine and land zones: the union of world country boundaries and EEZ's"
#eez_path = gem_path + 'EEZ_land_union_v3_202003/'
eez_path = 'EEZ_land_union_v3_202003/'
eez_file = 'EEZ_Land_v3_202030.shp'

In [6]:
pipelines_xl = pd.ExcelFile(pipelines_file)

In [7]:
status_in_dev = ['proposed', 'construction', 'shelved']

In [8]:
gas_pipes = pd.read_excel(pipelines_xl, sheet_name='Gas Pipelines')
gas_pipes = gas_pipes.dropna(subset=['ProjectID', 'Pipeline name'], how='any')

# clean up
gas_pipes = gas_pipes.rename(columns={
    'SegmentName': 'Segment name',
})
gas_pipes['Segment name'] = gas_pipes['Segment name'].fillna('')

gas_pipes = gas_pipes.drop('WKTFormat', axis=1)

In [9]:
no_route_options = [
    'Unavailable', 
    'Capacity expansion only', 
    'Bidirectionality upgrade only'
]

In [10]:
# filter for in-development only
gas_pipes = gas_pipes[gas_pipes['Status'].str.lower().isin(status_in_dev)]

In [11]:
oil_pipes = pd.read_excel(pipelines_xl, sheet_name='OilNGL Pipelines')
oil_pipes = oil_pipes.dropna(subset=['ProjectID'])

# exclude oil rows with "don't add" for ProjectID
oil_pipes = oil_pipes[oil_pipes['ProjectID']!="don't add"]

### check IDs
Does each ProjectID correspond to one combination of "Pipeline name" + "Segment name"?

In [12]:
def test_project_ids_unique(df):
    id_counts = df.groupby(['ProjectID'])['ProjectID'].count()
    id_count_multi = id_counts[id_counts > 1]
    if len(id_count_multi)==0:
        print("Test passed!")
    else:
        print(f"Test failed! Not all ProjectIDs were unique; there were {len(id_count_multi)} problems.")
        print(id_count_multi)

In [13]:
def test_each_projectid_matches_one_pipeline_segment_name(df):
    df['Combo name'] = (df['Pipeline name'] + ' ' + df['Segment name'].astype(str)).str.strip()

    df = df[['ProjectID', 'Combo name']]
    counts = df.groupby('ProjectID')['Combo name'].count()
    multiname = counts[counts > 1]

    if len(multiname)==0:
        print("Test passed!")
    else:
        print("Error!" + f" Problem with IDs; some ProjectID values matched more than one Pipeline name + Segment name:")
        print(multiname)

In [14]:
test_project_ids_unique(gas_pipes)
test_each_projectid_matches_one_pipeline_segment_name(gas_pipes)

Test passed!
Test passed!


## check names
* Is each Pipeline name + Segment name a unique combination?
* Does each pipeline name correspond to only one wiki page?
* Does each wiki page corresond to only one pipeline name?

In [15]:
def test_names_unique(df):
    # are the names unique? (Combination of 'Pipeline name' & 'Segment name')
    df['Combo name'] = (df['Pipeline name'] + ' ' + df['Segment name'].astype(str)).str.strip()
    id_counts = df.groupby(['Combo name'])['Combo name'].count()
    id_count_multi = id_counts[id_counts > 1]
    
    if len(id_count_multi)==0:
        print("Test passed! All names unique.")
    else:
        print("Test failed!" + f" There were names repeated:")
        keep_cols = ['Countries', 'Pipeline name', 'Segment name', 'ProjectID']
        print(df[df['Combo name'].isin(id_count_multi.index)][keep_cols].sort_values(by=keep_cols))

In [16]:
def check_per_wiki_only_one_pipeline_name(df):
    df['Wiki'] = df['Wiki'].str.split('#').str[0]
    df = df[['Pipeline name', 'Wiki']].drop_duplicates()
    
    wiki_counts = df.groupby(['Wiki'])[['Wiki']].count()
    wiki_count_multi = wiki_counts[wiki_counts['Wiki'] > 1]
    df2 = df[df['Wiki'].isin(wiki_count_multi.index)]
    
    if len(df2)==0:
        print("Test passed!")
    else:
        print(f"Test check_per_wiki_only_one_pipeline_name failed!")
        print()
        for wiki in df2['Wiki'].unique().tolist():
            print(f"For wiki {wiki} there were the following pipeline names:")
            print(df2[df2['Wiki']==wiki]['Pipeline name'].tolist())
            print()

In [17]:
def check_per_pipeline_name_only_one_wiki(df):
    df['Wiki'] = df['Wiki'].str.split('#').str[0]
    df = df[['Pipeline name', 'Wiki']].drop_duplicates()
    
    name_counts = df.groupby(['Pipeline name'])[['Pipeline name']].count()
    name_count_multi = name_counts[name_counts['Pipeline name'] > 1]
    df2 = df[df['Pipeline name'].isin(name_count_multi.index)]
    
    if len(df2)==0:
        print("Test passed!")
    else:
        print(f"Test check_per_pipeline_name_only_one_wiki failed!")
        print()
        for name in df2['Pipeline name'].unique().tolist():
            print(f"For pipeline name {name} there were the following wiki URLs:")
            print(df2[df2['Pipeline name']==name]['Wiki'].tolist())
            print()

In [18]:
# TO DO: Find cases in which pipeline name doesn't match wiki URL: To do comparison, need to handle unicode characters in URLs, turning them into regular characters. (Or, turning the pipeline name characters into unicode.)

In [19]:
test_names_unique(gas_pipes)

Test failed! There were names repeated:
     Countries                         Pipeline name  \
223      China            Anhui Gas Pipeline Network   
94       China            Anhui Gas Pipeline Network   
896      China            Anhui Gas Pipeline Network   
80       China            Anhui Gas Pipeline Network   
2153     China  Jiangxi Natural Gas Pipeline Network   
2154     China  Jiangxi Natural Gas Pipeline Network   

                                           Segment name ProjectID  
223             Changfeng–Jinzhai Gas Transmission Line     P3241  
94              Changfeng–Jinzhai Gas Transmission Line     P3247  
896   Northeast Anhui Natural Gas Pipeline Phase II ...     P3240  
80    Northeast Anhui Natural Gas Pipeline Phase II ...     P3246  
2153                       Phase I Yifeng-Tonggu Branch     P3322  
2154                       Phase I Yifeng-Tonggu Branch     P3323  


In [21]:
check_per_wiki_only_one_pipeline_name(gas_pipes)

Test check_per_wiki_only_one_pipeline_name failed!

For wiki https://www.gem.wiki/Florida_Gas_Transmission_Pipeline there were the following pipeline names:
['Florida Gas Transmission Pipeline', 'Western Division Expansion Project']

For wiki https://www.gem.wiki/Sichuan%E2%80%93Shanghai_Parallel_Gas_Pipeline there were the following pipeline names:
['Sichuan–Shanghai Parallel Gas Pipeline', 'Sichuan-Shanghai Parallel Gas Pipeline']



In [22]:
check_per_pipeline_name_only_one_wiki(gas_pipes)

Test check_per_pipeline_name_only_one_wiki failed!

For pipeline name Columbia Gas Transmission there were the following wiki URLs:
['https://www.gem.wiki/Columbia_Gulf_Transmission', 'https://www.gem.wiki/Columbia_Gas_Transmission']

For pipeline name Hungary-Slovenia-Italy Interconnector Gas Pipeline there were the following wiki URLs:
['https://www.gem.wiki/Hungary_Slovenia_Italy_Interconnection', 'https://www.gem.wiki/Hungary-Slovenia-Italy_Interconnector_Gas_Pipeline']



## check owners

In [23]:
# Check ProjectIDs in "Pipelines" sheet vs "Owners" sheet
# Any that are in one sheet and not the other?
owners = pd.read_excel(pipelines_xl, sheet_name='Owners')
owners = owners.dropna(subset=['ProjectID'])

In [24]:
gas_pipes_projectids = gas_pipes['ProjectID'].tolist()
oil_pipes_projectids = oil_pipes['ProjectID'].tolist()
all_pipes_projectids = gas_pipes_projectids + oil_pipes_projectids
owners_projectids = owners['ProjectID'].tolist()

In [25]:
# oil_pipes[oil_pipes['ProjectID'].astype(str)=="don't add"]

In [26]:
in_pipelines_not_owners = [x for x in all_pipes_projectids if x not in owners_projectids]
print(f"There were {len(in_pipelines_not_owners)} PipelineIDs in main sheets (gas & oil) that were not in Owners sheet:")
print(in_pipelines_not_owners)

There were 0 PipelineIDs in main sheets (gas & oil) that were not in Owners sheet:
[]


In [27]:
# in_owners_not_pipelines = [x for x in owners_projectids if x not in all_pipes_projectids]
# print(f"There were {len(in_owners_not_pipelines)} PipelineIDs in Owners sheet that were not in main sheets (gas & oil):")
# print(in_owners_not_pipelines)

## check countries
* Check entries in "Countries” column: Does it contain both the “StartCountry” and “EndCountry” entries?

In [28]:
def check_that_start_end_countries_in_countries_column(df):   
    missing_country = df[(df['StartCountry'].isna()) | (df['EndCountry'].isna())]
    if len(missing_country)>0:
        print("Error! There were missing entries for Start/End Country:")
        print(missing_country[['ProjectID', 'Countries', 'StartCountry', 'EndCountry']])
        print()
    
    for row in df.index:
        try:
            countries_list = df.at[row, 'Countries'].split(',')
            countries_list = [x.strip() for x in countries_list]
            start_country = df.at[row, 'StartCountry']
            end_country = df.at[row, 'EndCountry']
            project_id = df.at[row, 'ProjectID']
            if pd.isna(start_country)==False and start_country not in countries_list:
                print(f"Error! For {project_id}, StartCountry {start_country} not in countries list: {countries_list}")
            if pd.isna(end_country)==False and end_country not in countries_list:
                print(f"Error! For {project_id}, EndCountry {end_country} not in countries list: {countries_list}")
        except:
            print(f"Hit exception: df.at[row, 'Countries']: {df.at[row, 'Countries']}")

In [29]:
check_that_start_end_countries_in_countries_column(gas_pipes)

Error! There were missing entries for Start/End Country:
     ProjectID Countries StartCountry EndCountry
79       P3245     China          NaN        NaN
84       P3234     China          NaN      China
85       P3233     China          NaN      China
86       P3239     China          NaN      China
89       P3238     China          NaN      China
90       P3237     China          NaN      China
94       P3247     China          NaN        NaN
95       P3235     China          NaN      China
96       P3236     China          NaN      China
97       P3228     China          NaN        NaN
2144     P3313     China          NaN        NaN
2193     P3362  Cambodia          NaN        NaN

Error! For P3184, StartCountry Canada not in countries list: ['USA']
Error! For P3185, StartCountry Canada not in countries list: ['USA']
Error! For P3241, StartCountry Bangladesh not in countries list: ['China']
Error! For P3241, EndCountry Bangladesh not in countries list: ['China']
Error! For P3240, S

In [30]:
def same_state_province_as_country(df):
    """
    Look for those with state same as country.
    """
    same_start = df[df['StartState/Province']==df['StartCountry']]
    if len(same_start)>0:
        print("Some had the same start state/province as country; may be incorrect")
        print(same_start[['ProjectID', 'StartState/Province', 'StartCountry']].sort_values(by=['StartCountry', 'StartState/Province']))
        print()

    same_end = df[df['EndState/Province']==df['EndCountry']]
    if len(same_end)>0:
        print("Some had the same end state/province as country; may be incorrect")
        print(same_end[['ProjectID', 'EndState/Province', 'EndCountry']].sort_values(by=['EndCountry', 'EndState/Province']))
        print()

In [31]:
same_state_province_as_country(gas_pipes)

In [32]:
# look for outliers in location details
def location_outliers(df_arg):
    print("Check start state/province")
    df = df_arg.copy()
    # exclude those with same state/province as country
    df = df[df['StartState/Province']!=df['StartCountry']]
    df['StartState/Province'] = df['StartState/Province'].fillna('')
    df['Start_State_Country'] = df['StartState/Province'] + ', ' + df['StartCountry']
    df['Start_State_Country'] = df['Start_State_Country'].str.strip(', ')

    counts = df['Start_State_Country'].value_counts()
    singletons = counts[counts==1]
    sel_df = df[df['Start_State_Country'].isin(singletons.index)]
    for country in sel_df['StartCountry'].sort_values().unique().tolist():
        singleton_states = sel_df[sel_df['StartCountry']==country]['StartState/Province'].tolist()
        singleton_states = [x for x in singleton_states if x != '']
        singleton_states = list(set(singleton_states))
        singleton_states.sort()
        if len(singleton_states)>0:
            print(f"For country {country}, singleton states: {singleton_states}")
            print()
    
    print("------------------------")
    print("Check end state/province")
    
    df = df_arg.copy()
    # exclude those with same state/province as country
    df = df[df['EndState/Province']!=df['EndCountry']]
    df['EndState/Province'] = df['EndState/Province'].fillna('')
    df['End_State_Country'] = df['EndState/Province'] + ', ' + df['EndCountry']
    df['End_State_Country'] = df['End_State_Country'].str.strip(', ')
    
    counts = df['End_State_Country'].value_counts()
    singletons = counts[counts==1]
    sel_df = df[df['End_State_Country'].isin(singletons.index)]
    for country in sel_df['EndCountry'].sort_values().unique().tolist():
        singleton_states = sel_df[sel_df['EndCountry']==country]['EndState/Province'].tolist()
        singleton_states = [x for x in singleton_states if x != '']
        singleton_states = list(set(singleton_states))
        singleton_states.sort()
        if len(singleton_states)>0:
            print(f"For country {country}, singleton states: {singleton_states}")
            print()

In [33]:
location_outliers(gas_pipes)

Check start state/province
For country Albania, singleton states: ['Fier']

For country Algeria, singleton states: ['Laghouat', 'Tlemcen']

For country Argentina, singleton states: ['Buenos Aires', 'Formosa', 'Jujuy', 'Santa Cruz']

For country Australia, singleton states: ['Northern Territory']

For country Bolivia, singleton states: ['Santa Cruz']

For country Bosnia and Herzegovina, singleton states: ['Dalmatia']

For country Brazil, singleton states: ['São Paulo']

For country Canada, singleton states: ['Nova Scotia']

For country Chile, singleton states: ['Atacama']

For country China, singleton states: ['Heilongjiang', 'Jiangsu', 'Liaoning', 'Qinghai', 'Zhejiang']

For country Colombia, singleton states: ['Bolivar', 'Córdoba', 'Valle del Cauca']

For country Djibouti, singleton states: ['Somali']

For country Egypt, singleton states: ['Port Said']

For country Ghana, singleton states: ['Western']

For country Greece, singleton states: ['Eastern Macedonia and Thrace']

For country

In [34]:
# can also look like this to find typos and other small differences
sel_country = 'Russia'
gas_pipes[gas_pipes['StartCountry']==sel_country]['StartState/Province'].value_counts().sort_index()

Altai Krai                        3
Irkutsk Oblast                    1
Khabarovsk Krai                   1
Komi                              1
Leningrad Oblast                  1
Murmansk Oblast                   1
Nizhgorod Oblast                  1
Republic of Komi                  1
Republic of Sakha (Yakutia)       1
Respublic of Sakha (Yakutia)      1
Vologda Oblast                    1
Yamalo-Nenets Autonomous Okrug    2
Yamalo-Nenets Autonomus Okrug     1
Name: StartState/Province, dtype: int64

## check routes
* Check routes vs StartCountry & EndCountry

In [35]:
df = gas_pipes.copy()
df = df[~df['Route'].isin(no_route_options)]
df = df.dropna(subset=['Route'])
route_start = df['Route'].str.split(':').str[-1]
route_end = df['Route'].str.split(':').str[-1]

In [36]:
for row in df.index:
    start_country = df.at[row, 'StartCountry']
    end_country = df.at[row, 'EndCountry']

In [37]:
def read_eez_file(eez_path, eez_file):
    # use boundaries from MarineRegions.com
    # union of world country boundaries and Exclusive Economic Zones (2014)
    # http://www.marineregions.org/downloads.php#unioneezcountry
    # Note: Kosovo is part of Serbia in EEZ file

    df = gpd.read_file(eez_path + eez_file)
    df = df.rename(columns={'UNION': 'Country'})
    df['Country'] = df['Country'].replace({
        'Trinidad & Tobago': 'Trinidad and Tobago',
        'Bosnia & Herzegovina': 'Bosnia and Herzegovina',
        'Czech Republic': 'Czechia', # to match GFIT usage
        'Macedonia': 'North Macedonia', # to match GFIT usage
        'United States': 'USA',
        'East Timor': 'Timor-Leste',
        'Ivory Coast': "Cote d'Ivoire",
    })
    df = df.set_index('Country')

#     # create one blob for all world land and EEZ boundaries, using Shapely function cascaded_union 
#     # whatever is left out is, presumably, international waters
#     # this is used below to determine whether anything is in international waters
#     world_eez_and_land_boundaries = cascaded_union(df['geometry'])
    
    eez_and_land_boundaries = df
    return eez_and_land_boundaries #, world_eez_and_land_boundaries

In [38]:
eez_and_land_boundaries = read_eez_file(eez_path, eez_file)

In [39]:
eez_4087 = eez_and_land_boundaries.to_crs('epsg:4087')

In [40]:
df = gas_pipes.copy()
df = df[~df['Route'].isin(no_route_options)]
df = df.dropna(subset=['Route'])
df = df.reset_index(drop=True)

df['Route start'] = df['Route'].str.split(':').str[0]
df['Route_start_lat'] = df['Route start'].str.split(',').str[0].str.strip().astype(float)
df['Route_start_lon'] = df['Route start'].str.split(',').str[1].str.strip().astype(float)

df['Route end'] = df['Route'].str.split(':').str[-1]
df['Route_end_lat'] = df['Route end'].str.split(',').str[0].str.strip().astype(float)
df['Route_end_lon'] = df['Route end'].str.split(',').str[1].str.strip().astype(float)

gas_pipes_plus = df

route_starts = gpd.GeoDataFrame(
    gas_pipes_plus.set_index('ProjectID'), 
    geometry=gpd.points_from_xy(df.Route_start_lon, df.Route_start_lat)
).set_crs('epsg:4326')

route_starts_4087 = route_starts['geometry'].to_crs('epsg:4087')

route_ends = gpd.GeoDataFrame(
    gas_pipes_plus.set_index('ProjectID'), 
    geometry=gpd.points_from_xy(df.Route_end_lon, df.Route_end_lat)
).set_crs('epsg:4326')

route_ends_4087 = route_ends['geometry'].to_crs('epsg:4087')

In [41]:
# check the country polygons for validity
for country in eez_4087.index:
    eez = eez_4087.at[country, 'geometry']
    if eez.is_valid:
        pass
    else:
        print(f"For {country}, error: eez.is_valid = {eez.is_valid}")
        print(f"Excluded {country} from eez_4087")
        eez_4087 = eez_4087.drop(country)

In [42]:
def check_points_in_which_country(gas_pipes_plus, route_starts_4087, route_ends_4087, eez_4087):
    for row in gas_pipes_plus.index:
        project_id = gas_pipes_plus.at[row, 'ProjectID']
        pipeline_name = gas_pipes_plus.at[row, 'Pipeline name']
        project_id = gas_pipes_plus.at[row, 'ProjectID']
        start_country = gas_pipes_plus.at[row, 'StartCountry']
        end_country = gas_pipes_plus.at[row, 'EndCountry']
        
        start_point_4087 = route_starts_4087.at[project_id]
        end_point_4087 = route_ends_4087.at[project_id]
        for point_type in ['start', 'end']:
            if point_type=='start':
                point_4087 = start_point_4087
            elif point_type=='end':
                point_4087 = end_point_4087
            # process start point first
            # comparing against both start country & end country
            for country in eez_4087.index:
                if country=='Kosovo':
                    # EEZ has it as part of Serbia
                    country_eez_4087 = eez_4087.at['Serbia', 'geometry']
                else:
                    country_eez_4087 = eez_4087.at[country, 'geometry']
                    
                within = False # initialize
                try:
                    within = point_4087.within(country_eez_4087)
                except:
                    pass
                if within==True:
                    if country not in [start_country, end_country]:
                        if country=='Alaska' and 'USA' in [start_country, end_country]:
                            pass
                        else:
                            dist_threshold = 5 # km
                            if start_country=='Kosovo':
                                start_country_eez = eez_4087.at['Serbia', 'geometry']
                            else:
                                start_country_eez = eez_4087.at[start_country, 'geometry']
                            start_dist_away_km = start_country_eez.distance(point_4087)/1000
                            if end_country=='Kosovo':
                                end_country_eez = eez_4087.at['Serbia', 'geometry']
                            else:
                                end_country_eez = eez_4087.at[end_country, 'geometry']
                            end_dist_away_km = end_country_eez.distance(point_4087)/1000
                            min_dist_away_km = min([start_dist_away_km, end_dist_away_km])
                            if min_dist_away_km < dist_threshold:
                                pass
                            else:
                                print(f"For {pipeline_name} ({project_id}):")
                                print(f"Start country (spreadsheet): {start_country}")
                                print(f"End country (spreadsheet): {end_country}")
                                if point_type=='start':
                                    coords = f"{gas_pipes_plus.at[row, 'Route_start_lat']}, {gas_pipes_plus.at[row, 'Route_start_lon']}"
                                elif point_type=='end':
                                    coords = f"{gas_pipes_plus.at[row, 'Route_end_lat']}, {gas_pipes_plus.at[row, 'Route_end_lon']}"
                                print(f"However, found the {point_type} point in EEZ of {country}, at: {coords}; minimum distance: {round(min_dist_away_km,1)}.")
                                route = gas_pipes_plus.at[row, 'Route']
                                if ';' in route:
                                    print("Warning, this pipeline route has separate segments; may be branching or not in order.")
                                print()

In [43]:
check_points_in_which_country(gas_pipes_plus, route_starts_4087, route_ends_4087, eez_4087)

For Anamur to North Cyprus Gas Pipeline (P1324):
Start country (spreadsheet): Turkey
End country (spreadsheet): Turkey
However, found the end point in EEZ of Cyprus, at: 35.356076, 33.100935; minimum distance: 41.0.

For Baltic Pipe Project (P0684):
Start country (spreadsheet): Denmark
End country (spreadsheet): Norway
However, found the start point in EEZ of Poland, at: 53.8, 15.2599; minimum distance: 86.2.

For Hungary-Slovenia-Italy Interconnector Gas Pipeline (P1340):
Start country (spreadsheet): Hungary
End country (spreadsheet): Italy
However, found the start point in EEZ of Slovenia, at: 46.189904, 14.493566; minimum distance: 89.0.

For Hungary-Slovenia-Italy Interconnector Gas Pipeline (P1340):
Start country (spreadsheet): Hungary
End country (spreadsheet): Italy
However, found the end point in EEZ of Slovenia, at: 46.1899043, 14.4935662; minimum distance: 89.0.

For Liza Gas Pipeline (P0418):
Start country (spreadsheet): Guyana
End country (spreadsheet): Guyana
However, foun

KeyError: 'Joint Petroleum Development Area'

In [44]:
def convert_gfit_to_linestring(coord_str, pipeline_name):
    '''
    Takes string from GFIT column of coordinates for a single pipeline,
    converts that string into Shapely LineString or MultiLinestring.
    '''

    if ':' in coord_str and ';' not in coord_str:
        # simple geometry; no branching
        # create nested list of lists, separating on colons        
        coord_list = coord_str.split(':')
        
        coord_list_tuples = []
        
        # non-branched pipeline (nested list with one level)
        # convert nested list of lists to list of tuples
        try:
            for element in coord_list:
                element_tuple = (float(element.split(',')[1]), 
                                 float(element.split(',')[0]))
                coord_list_tuples.append(element_tuple)
        except:
            print(f"Exception for {pipeline_name}; element: {element}") # for db
                
        route_conv = LineString(coord_list_tuples)

    elif ':' in coord_str and ';' in coord_str:
        # create a nested list of lists, separating on semicolons
        coord_list = coord_str.split(';')
        
        # create a second level of nesting, separating on colons
        coord_list = [x.split(':') for x in coord_list]
        
        # branched pipeline (nested list with two levels)
        route_conv_ls_all = []
        
        for nested_list in coord_list:
            coord_list_tuples = []
            
            # process element
            try:
                for element in nested_list:
                    element_tuple = (float(element.split(',')[1]), 
                                     float(element.split(',')[0]))
                    coord_list_tuples.append(element_tuple)
            except:
                print(f"Exception for {pipeline_name}; element: {element}") # for db
                
            # process coord_list_tuples
            try:
                route_conv_ls = LineString(coord_list_tuples)
                route_conv_ls_all.append(route_conv_ls)
            except:
                print(f"Exception for {pipeline_name}; coord_list_tuples: {coord_list_tuples}") # for db
                pass
            
        route_conv = MultiLineString(route_conv_ls_all)
    
    elif coord_str in no_route_options:      
        # create empty MultiLineString; no coordinates
        route_conv = MultiLineString([])
        
        print(f'No coordinates for {pipeline_name}: {coord_str}')
    
    else:
        # create empty MultiLineString; coordinates were missing or misformatted
        route_conv = MultiLineString([])
        
        print(f'Missing or misformatted coordinates for {pipeline_name}')
        
    return route_conv

In [45]:
def convert_all_pipelines(df):
    """
    Apply the conversion function to all pipelines in the dataframe.
    """
    # create geometry column with empty strings
    df['geometry'] = ''
    
    # filter to keep only pipelines with routes
    mask_route_1 = df['Route'].str.contains(',')
    mask_route_2 = df['Route'].str.contains(':')
    pipes_with_route = df.loc[(mask_route_1) & (mask_route_2)]
    
    for row in pipes_with_route.index:
        route_str = df.at[row, 'Route']
        pipeline_name = df.at[row, 'Pipeline name']
        route_str_converted = convert_gfit_to_linestring(route_str, pipeline_name)
        df.at[row, 'geometry'] = route_str_converted
        
    return df

In [46]:
to_convert = gas_pipes.copy()
to_convert = to_convert.dropna(subset=['Route'])
to_convert = to_convert[~to_convert['Route'].isin(no_route_options)]
gas_pipes_wkt = convert_all_pipelines(to_convert)
gas_pipes_wkt = gas_pipes_wkt.reset_index(drop=True)

In [47]:
gas_pipes_wkt_gdf = gpd.GeoDataFrame(gas_pipes_wkt, geometry=gas_pipes_wkt['geometry'])
gas_pipes_wkt_gdf = gas_pipes_wkt_gdf.set_crs('epsg:4326')

In [48]:
gas_pipes_wkt_gdf_4087 = gas_pipes_wkt_gdf.to_crs('epsg:4087')

In [49]:
all_countries = pd.DataFrame() # initialize

gdf_sel = gas_pipes_wkt_gdf_4087[['ProjectID', 'Pipeline name', 'Segment name', 'geometry']]

for sel_country in eez_4087.index:
    one_country_overlay = overlay(
        gdf_sel, 
        eez_4087[eez_4087.index==sel_country][['geometry']], 
        how="intersection")

    one_country_overlay['length km in country'] = one_country_overlay.length / 1000
    one_country_overlay['Country'] = sel_country

    all_countries = all_countries.append(one_country_overlay, sort=False)

In [50]:
gas_pipes[gas_pipes['ProjectID']=='P3226']['Route']

78    NaN
Name: Route, dtype: object

In [51]:
df2 = gas_pipes_wkt.set_index('ProjectID')
df2 = df2.sort_values(by=['StartCountry', 'Pipeline name', 'Segment name'])
for project_id in df2.index:
    pipeline_name = df2.at[project_id, 'Pipeline name']
    segment_name = df2.at[project_id, 'Segment name']
    countries_list = df2.at[project_id, 'Countries'].split(',')
    countries_list = [x.strip() for x in countries_list]
    countries_list.sort()
    
    gis_sel = all_countries.copy()[all_countries['ProjectID']==project_id]
    gis_sel['length km in country'] = gis_sel['length km in country'].astype(int)
    
    # special handling for Alaska:
    gis_sel['Country'] = gis_sel['Country'].replace({
        'Alaska': 'USA',
        'Canary Islands': 'Spain',
    })
    group_cols = ['ProjectID', 'Pipeline name', 'Segment name', 'Country']
    gis_sel = gis_sel.groupby(group_cols)[['length km in country']].sum().reset_index()
    
    # filter out small values
    km_threshold = 5
    gis_sel = gis_sel[gis_sel['length km in country']>km_threshold]
    
    gis_sel_countries = gis_sel['Country'].tolist()
    
    if set(countries_list) != set(gis_sel_countries):
        print(f"For {pipeline_name} {segment_name} ({project_id}), mismatch in countries")
        print(f"List in GFIT: {countries_list}")
        print(f"GIS analysis:")
        print(gis_sel[['Country', 'length km in country']])
        print()

For Albania–Kosovo Gas Pipeline  (P3336), mismatch in countries
List in GFIT: ['Albania', 'Kosovo']
GIS analysis:
   Country  length km in country
0  Albania                   119
1   Serbia                   112

For Ionian Adriatic Gas Pipeline (IAP)  (P0702), mismatch in countries
List in GFIT: ['Albania', 'Bosnia and Herzegovina', 'Croatia']
GIS analysis:
      Country  length km in country
0     Albania                   175
1     Croatia                   300
2  Montenegro                   116

For GALSI Pipeline  (P0697), mismatch in countries
List in GFIT: ['Algeria', 'Italy']
GIS analysis:
   Country  length km in country
0  Algeria                   126
1   France                    64
2    Italy                   611
3  Tunisia                    35

For Vaca Muerta-Brazil Pipeline  (P2713), mismatch in countries
List in GFIT: ['Argentina', 'Brazil']
GIS analysis:
     Country  length km in country
0  Argentina                  1530
1     Brazil                   733
2    U

For BRUA Gas Pipeline Phase 3 (P2729), mismatch in countries
List in GFIT: ['Romania']
GIS analysis:
   Country  length km in country
0  Hungary                     8
1  Romania                   759

For Romania-Moldova Gas Pipeline Phase II (P2747), mismatch in countries
List in GFIT: ['Moldova', 'Romania']
GIS analysis:
   Country  length km in country
0  Moldova                   119

For Siret-Khotyn Gas Pipeline  (P1773), mismatch in countries
List in GFIT: ['Romania', 'Ukraine']
GIS analysis:
   Country  length km in country
1  Ukraine                    78

For Nord Stream 2 Gas Pipeline  (P0752), mismatch in countries
List in GFIT: ['Germany', 'Russia']
GIS analysis:
   Country  length km in country
0  Denmark                   224
1  Finland                   701
2  Germany                   125
3   Russia                   207
4   Sweden                   647

For Trans-Korea Gas Pipeline  (P1440), mismatch in countries
List in GFIT: ['South Korea']
GIS analysis:
       Coun

In [None]:
# TO DO: compare calculated length vs stated length. 
# Look into those with large discrepancies - over 50% difference

#BL:
#look at where calculated length 2x more than stated length - 90 total
#where route way too short too (half)
#some eia routes are very complex; need to bring in more complicated routes (e.g., in USA?)

In [None]:
# TO DO: Look for outlier values for length and diameter to try to catch errors.

In [None]:
# TO DO: Check capacity expansion projects: 
# Make sure all capacity expansion projects (which don’t involve laying more pipe) 
# have “capacity expansion only” in the column “Route”, which would make them not show up on the map. 
# Also make sure they don’t have a length listed.