# Modules

In [2]:
import pandas as pd
import numpy as np
import unidecode
from difflib import SequenceMatcher
import salem
import matplotlib.pyplot as plt 
import re
import os
% matplotlib inline

# File paths

In [475]:
# Base directory holding all input data sets. Adapt this single line when the
# data live somewhere else; every path below is derived from it.
DATA_DIR = 'C:\\Users\\jlandman\\Desktop'
FOG_DIR = os.path.join(DATA_DIR, 'DOI-WGMS-FoG-2015-11')  # directory with the FoG tables (A, B, D)

f_path = os.path.join(DATA_DIR, 'database_Fischer_et_al._2015_The_Cryosphere.txt')     # Fischer database with Swiss coordinates
fll_path = os.path.join(DATA_DIR, 'SGI2010wgs84_shapefiles', 'parameters_SGI2010.csv')  # Fischer database with lat/lon
a_path = os.path.join(FOG_DIR, 'WGMS-FoG-2015-11-A-GENERAL-INFORMATION.csv')            # FoG: A GENERAL
b_path = os.path.join(FOG_DIR, 'WGMS-FoG-2015-11-B-STATE.csv')                          # FoG: B STATE
d_path = os.path.join(FOG_DIR, 'WGMS-FoG-2015-11-D-CHANGE.csv')                         # FoG: D CHANGE

# Read files

In [476]:
# Every input table is read with the same (Latin-1) encoding.
INPUT_ENCODING = 'iso-8859-1'

# Fischer database: tab-separated text file
pdf = pd.read_csv(f_path, sep='\t', encoding=INPUT_ENCODING)

# Fischer lat/lon table: semicolon-separated; only the ID and x/y related
# columns (selected by position) are needed here
pdfll = pd.read_csv(fll_path, sep=';', encoding=INPUT_ENCODING, usecols=[2, 3, 6, 14, 15])

# FoG tables A (general), B (state) and D (change): plain comma-separated files
pda = pd.read_csv(a_path, encoding=INPUT_ENCODING)
pdb = pd.read_csv(b_path, encoding=INPUT_ENCODING)
pdd = pd.read_csv(d_path, encoding=INPUT_ENCODING)

# Preselect FoG IDs in Switzerland

In [477]:
# Keep only the Swiss entries of the GENERAL table and restrict the STATE and
# CHANGE tables to the WGMS IDs that remain in it.
pda = pda[pda.POLITICAL_UNIT == 'CH']
swiss_ids = pda.WGMS_ID.values
pdb = pdb[pdb.WGMS_ID.isin(swiss_ids)]
# The CHANGE table must be filtered against the Swiss IDs from `pda`;
# filtering it against its own IDs keeps every row and is therefore a no-op.
pdd = pdd[pdd.WGMS_ID.isin(swiss_ids)]

# Haversine function

In [483]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance between two points on a sphere (haversine formula).

    All four arguments are given in decimal degrees. They are passed through
    numpy functions only, so scalars and arrays can be mixed (e.g. one point
    against an array of points).

    Returns the distance in meters for a sphere of radius 6371000 m.
    """
    earth_radius_m = 6371000

    # work in radians
    lam1, phi1, lam2, phi2 = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))

    # haversine of the central angle
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2

    # central angle -> arc length
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return central_angle * earth_radius_m

# Some settings

In [484]:
# First WGMS ID that is handed out to a glacier not yet present in FoG
begin_new_id = 7000
# One candidate ID per row of `pdf`, plus one spare ID at the end
new_id_range = range(begin_new_id, begin_new_id + len(pdf) + 1)
new_id_range

range(7000, 8386)

# Correct missing underscores and column names

In [485]:
# The volume-uncertainty column carries a blank instead of an underscore.
unc_with_blank = 'uncertainty_dvol_between_t1_and_t2_mio m3'
unc_fixed = 'uncertainty_dvol_between_t1_and_t2_mio_m3'

pdf = pdf.rename(columns={unc_with_blank: unc_fixed})

# In the lat/lon table the 16th column has no header and is read as 'Unnamed: 15'.
# rename() ignores keys that are not present, so both renames can share one call.
pdfll = pdfll.rename(columns={unc_with_blank: unc_fixed,
                              'Unnamed: 15': 'Glacier_name_SGI2010'})

# Find matching glaciers based on FoG => compare names, area and location

### Dictionary of strings and symbols that need to be replaced prior to calculation of similarity score

In [486]:
# Substrings (left) and their replacements (right), applied to the lower-cased
# glacier names before the similarity score is computed. The replacements are
# applied in the order given here.
name_adjust = {
    # words for "glacier" in the different languages
    'gletscher': 'g',
    'glacier': 'g',
    'vadret': 'v',
    'ghiacciaio': 'g',
    'vedretta': 'v',
    'ferner': 'f',
    # umlauts, accents and sharp s
    'ä': 'ae',
    'ö': 'oe',
    'ü': 'ue',
    'é': 'e',
    'à': 'a',
    'è': 'e',
    'ô': 'o',
    'ß': 'ss',
}

### Introduce FoG column with new names

In [487]:
# Comparison name of every FoG glacier: lower-cased NAME with the
# `name_adjust` replacements applied.
pda['FoG_compname'] = ''

for fog_name in pda.NAME.values:
    simplified = fog_name.lower()
    # str.replace leaves the string untouched when the pattern does not occur
    for pattern, replacement in name_adjust.items():
        simplified = simplified.replace(pattern, replacement)
    # write the result to every row that carries this NAME
    pda.loc[pda.NAME == fog_name, 'FoG_compname'] = simplified

### Introduce some new columns

In [488]:
# Helper columns used by the matching below.
# They have to be created with item assignment: `frame.NEW_NAME = value` only
# sets a Python attribute on the DataFrame object and does NOT add a column.
pdf['COMPNAME'] = ''              # simplified name of Mauro's glacier

pda['MATCH_RATIO'] = np.nan       # string matching ratio
pda['MATCH_NAME'] = ''            # simplified name of Mauro's glacier that matches the FoG name best
pda['CLOSEST'] = ''               # simplified name of Mauro's glacier closest to the FoG point
pda['DIST_TO_CLOSEST'] = np.nan   # distance of the FoG point to the closest of Mauro's points
pda['AREA_CLOSEST'] = np.nan      # area of Mauro's glacier found by distance
pda['AREA_MATCH'] = np.nan        # area of Mauro's glacier found by string matching
pda['AREA'] = np.nan              # FoG area taken from the STATE table

### Adjust also Mauro's names to make them comparable

In [489]:
# Build the comparison name (COMPNAME) for every glacier of Mauro's database.
for _, sgi_row in pdf.iterrows():
    full_name = sgi_row.Glacier_name_SGI2010

    # lower-case and replace the glacier words / umlauts etc.
    simplified = full_name.lower()
    for pattern, replacement in name_adjust.items():
        simplified = simplified.replace(pattern, replacement)

    # drop everything from the first opening bracket on (improves the ratio),
    # then remove asterisks and surrounding whitespace
    simplified = simplified.split('(', 1)[0]
    simplified = simplified.replace('*', '').strip()

    pdf.loc[pdf.Glacier_name_SGI2010 == full_name, 'COMPNAME'] = simplified

### Find matching glaciers for the 159 Swiss glaciers in the FoG GENERAL table (`pda`)

# Trial run of the matching on the first ten rows of `pda` only.
# For each of these FoG glaciers the loop stores in `pda`:
#   AREA            - last non-NaN AREA value of the glacier in the STATE table `pdb`
#   MATCH_RATIO/-NAME/AREA_MATCH     - best string match among the COMPNAMEs of `pdf`
#   DIST_TO_CLOSEST/CLOSEST/AREA_CLOSEST - glacier of `pdf` with the smallest haversine distance
# Results are written back through boolean masks on WGMS_ID (AREA) and NAME (all others).
for fidx,fcols in pda[0:10].iterrows():
    
    # create an AREA column entry (from the "state" table)
    try: # take the last non-NaN area entry (in table order)
        area_match = pdb[pdb.WGMS_ID == fcols.WGMS_ID].AREA.values[~np.isnan(pdb[pdb.WGMS_ID == fcols.WGMS_ID].AREA.values)][-1]
    except IndexError:
        # no area value at all for this glacier
        area_match = np.nan
    pda.loc[pda.WGMS_ID == fcols.WGMS_ID, 'AREA'] = area_match
    
    
    # find biggest ratio of string matching and insert
    ratio = np.nan
    name = ''
    for cname in pdf['COMPNAME'].values:
        
        curr_ratio = SequenceMatcher(None, fcols.FoG_compname, cname).ratio()
        
        # strict ">" keeps the first candidate when several share the best ratio
        if curr_ratio > ratio or pd.isnull(ratio):  # the latter in order to catch the initial case
            ratio = curr_ratio
            name = cname
            
    pda.loc[pda.NAME == fcols.NAME, 'MATCH_RATIO'] = ratio
    pda.loc[pda.NAME == fcols.NAME, 'MATCH_NAME'] = name
    
    # insert the area (at t2, because this is the latest) of the glacier found by string matching
    # (first row of `pdf` that carries this COMPNAME)
    pda.loc[pda.NAME == fcols.NAME, 'AREA_MATCH'] = pdf[pdf['COMPNAME'] == name].area_t2_km2.iloc[0]
    
    
    # find closest pdf glacier
    dist = np.nan
    close_name = ''
    for pdf_idx, pdf_cols in pdf.iterrows():
        # `pdfll.Glacier_name_SGI2010` is the column renamed from 'Unnamed: 15';
        # it is compared against the SGI2010 code here to fetch the WGS84 coordinates
        lat = pdfll[pdfll.Glacier_name_SGI2010 == pdf_cols.Code_SGI2010]['y WGS84)'].values[0]
        lon = pdfll[pdfll.Glacier_name_SGI2010 == pdf_cols.Code_SGI2010]['Location(x WGS84)'].values[0]
        curr_dist = haversine(lon, lat, fcols.LONGITUDE, fcols.LATITUDE)
        
        if curr_dist < dist or pd.isnull(dist): # the second is for the initial loop
            dist = curr_dist
            close_name = pdf_cols.COMPNAME
    
    pda.loc[pda.NAME == fcols.NAME, 'DIST_TO_CLOSEST'] = dist
    pda.loc[pda.NAME == fcols.NAME, 'CLOSEST'] = close_name
    
    # progress output
    print(fcols.NAME)
    # insert the area (at t2, because this is the latest) of the glacier which is found to be the closest
    # (looked up by COMPNAME, i.e. the first row of `pdf` with that name)
    pda.loc[pda.NAME == fcols.NAME, 'AREA_CLOSEST'] = pdf[pdf['COMPNAME'] == close_name].area_t2_km2.iloc[0]

In [490]:
# Create the column with item assignment; `pdfll.COMPNAME = ''` would only set
# a Python attribute on the DataFrame object instead of adding a column.
pdfll['COMPNAME'] = ''              # simplified name

In [491]:
# Build the comparison name (COMPNAME) for every glacier of the lat/lon table.
for _, sgi_row in pdfll.iterrows():
    raw_name = sgi_row.Names

    # lower-case, then replace the glacier words / umlauts etc.
    comp = raw_name.lower()
    for old, new in name_adjust.items():
        comp = comp.replace(old, new)

    # keep only the part in front of the first opening bracket (improves the
    # ratio), drop asterisks and trim whitespace
    head, _bracket, _rest = comp.partition('(')
    comp = head.replace('*', '').strip()

    pdfll.loc[pdfll.Names == raw_name, 'COMPNAME'] = comp
  

In [492]:
#if os.path.exists('assigned_automated.csv'):
#    pass

#else:
# Match every FoG glacier in `pda` against the SGI2010 glaciers in `pdfll`,
# once by name similarity and once by distance. Per FoG glacier this stores:
#   AREA             last non-NaN AREA value (table order) of the glacier in `pdb`
#   MATCH_RATIO      best SequenceMatcher ratio between FoG_compname and any COMPNAME
#   MATCH_NAME       COMPNAME that reached this ratio (first one on ties)
#   AREA_MATCH       'area(km2)' of the first `pdfll` row with that COMPNAME
#   DIST_TO_CLOSEST  smallest haversine distance (m) to an SGI2010 point
#   CLOSEST          COMPNAME of that nearest SGI2010 glacier
#   AREA_CLOSEST     'area(km2)' of that nearest SGI2010 glacier
# Results are written through the row label `fidx`, so two FoG glaciers that
# share a NAME cannot overwrite each other.

# These arrays do not depend on the FoG glacier, so extract them once.
sgi_lon = pdfll['Location(x WGS84)'].values
sgi_lat = pdfll['y WGS84)'].values
sgi_compnames = pdfll['COMPNAME'].values
sgi_areas = pdfll['area(km2)'].values

for fidx, fcols in pda.iterrows():

    # create an AREA column entry (from the "state" table): last non-NaN value
    fog_areas = pdb.loc[pdb.WGMS_ID == fcols.WGMS_ID, 'AREA'].dropna()
    pda.loc[fidx, 'AREA'] = fog_areas.iloc[-1] if len(fog_areas) else np.nan

    # find biggest ratio of string matching and insert
    ratio = 0.0
    name = ''
    for cname in sgi_compnames:
        curr_ratio = SequenceMatcher(None, cname, fcols.FoG_compname).ratio()

        if curr_ratio > ratio or ratio == 0.0:  # the latter in order to catch the initial case
            ratio = curr_ratio
            name = cname

    pda.loc[fidx, 'MATCH_RATIO'] = ratio
    pda.loc[fidx, 'MATCH_NAME'] = name

    # insert the area of the glacier found by string matching
    pda.loc[fidx, 'AREA_MATCH'] = pdfll.loc[pdfll['COMPNAME'] == name, 'area(km2)'].iloc[0]

    # find the closest SGI2010 glacier: one vectorised haversine call replaces
    # the row-by-row search (which re-filtered `pdfll` for every single row)
    dists = haversine(sgi_lon, sgi_lat, fcols.LONGITUDE, fcols.LATITUDE)
    if np.all(np.isnan(dists)):
        # no usable coordinates (also covers an empty `pdfll`)
        dist, close_name, close_area = np.nan, '', np.nan
    else:
        nearest = np.nanargmin(dists)   # first minimum, NaN distances are ignored
        dist = dists[nearest]
        close_name = sgi_compnames[nearest]
        # take the area of the nearest row itself; looking it up by COMPNAME
        # could return another glacier, because COMPNAME need not be unique
        close_area = sgi_areas[nearest]

    pda.loc[fidx, 'DIST_TO_CLOSEST'] = dist
    pda.loc[fidx, 'CLOSEST'] = close_name
    pda.loc[fidx, 'AREA_CLOSEST'] = close_area

    # progress output
    print(fcols.NAME)

# write the result once, after all glaciers are processed (not on every pass)
pda.to_csv('assigned_automated.csv')

A NEUVE GL. L'
ADLER
ALBIGNA
ALLALIN
ALPETLI(KANDER)
ALTELS
AMMERTEN
AROLLA (BAS)
BALMHORN
BASODINO
BELLA TOLA
BIDER
BIFERTEN
BIRCH
BIS
BLUEMLISALP
BODMER
BOVEYRE
BREITHORN
BRENEY
BRESCIANA
BRUNEGG
BRUNNI
CALDERAS
CAMBRENA
CAVAGNOLI
CHEILLON
CLARIDENFIRN
CORBASSIERE
CORNO
CROSLINA
DAMMA
DOLENT GL. DU
DUNGEL
EIGER
EIGER (WEST)
EN DARREY
FEE NORTH
FERPECLE
FIESCHER
FINDELEN
FIRNALPELI
FLUCHTHORN GL.
FORNO
GAMCHI
GAULI
GELTEN
GIETRO
GLAERNISCH
GORNER
GRAND DESERT
GRAND PLAN NEVE
GRIES
GRIESS(KLAUSEN)
GRIESSEN(OBWA.)
GRIESSERNU
GROSSER ALETSCH
GRUBEN
GUTZ
HANGENDE
HINDRE SCHMADRI
HOHLAUB
HOMATTU
HUEFI
KALTWASSER
KEHLEN
KESSJEN
LAEMMERN (WILDSTRUBEL)
LANG
LAVAZ
LENTA
LIMMERN
LISCHANA
MAIGHELS EAST BRANCH
MAIGHELS WEST BRANCH
MARTINETS
MINSTIGER
MITTELALETSCH
MOIRY
MOMING
MONT DURAND
MONT FORT (ZINAL)
MONT MINE
MONTO MORO GL.
MORTERATSCH
MURTEL
MUTT
MUTTEN
OB.GRINDELWALD
OBERAAR
OBERALETSCH
OFENTAL
OTEMMA
PALUE
PANEYROSSE
PARADIES
PARADISINO (CAMPO)
PIERREDAR
PIZOL
PLAINE MORTE
PLATTALVA
POR

In [493]:
# Write the FoG GENERAL table, including the matching helper columns, to a CSV
# file in the current working directory.
pda.to_csv('assigned_automated.csv')

In [1251]:
# Show the name / distance / area matching results for the rows at positions 151-159
pda[['NAME', 'FoG_compname', 'MATCH_RATIO', 'MATCH_NAME', 'DIST_TO_CLOSEST', 'CLOSEST', 'AREA', 'AREA_MATCH', 'AREA_CLOSEST']][151:160]

Unnamed: 0,NAME,FoG_compname,MATCH_RATIO,MATCH_NAME,DIST_TO_CLOSEST,CLOSEST,AREA,AREA_MATCH,AREA_CLOSEST
844,WALLENBUR,wallenbur,0.818182,wallenburfirn,485.303345,brunnenfirn-s,1.7,1.41688,0.27859
845,WANNENHORN GL.,wannenhorn gl.,0.785714,wannenhorng-nw,771.817254,senfspitze-s,,0.49312,0.043
846,WANNENHORN GL. N,wannenhorn gl. n,0.8,wannenhorng-nw,898.209197,wannenhorng-nw,,0.49312,0.49312
847,WANNENHORN GL. S,wannenhorn gl. s,0.8,wannenhorng-se,771.817254,senfspitze-s,,0.38984,0.043
848,WITENWASSEREN,witenwasseren,0.962963,witenwassereng,126.686212,witenwasseren-w,,0.62719,0.13125
849,ZENBAECHEN GL.,zenbaechen gl.,0.88,zenbaecheng,963.879888,driestg,,0.85891,2.02625
850,ZINAL,zinal,0.666667,zinal g de,1232.717294,bouquetins-e,16.24,13.36,0.02552
851,ZMUTT,zmutt,0.909091,zmuttg,2320.009358,pointe de zinal-s-ii,17.22,13.73688,0.00343


In [1304]:
# FoG NAME of the glacier that is inspected in the following cells
i = 'ZMUTT'

In [1305]:
# WGMS ID of the first row of `pda` that carries the selected NAME
ID = pda[pda.NAME == i].WGMS_ID.iloc[0]

In [1306]:
# All columns of the selected glacier's row(s) in the FoG GENERAL table
pda[pda.NAME == i]#[['LATITUDE', 'LONGITUDE']]

Unnamed: 0,POLITICAL_UNIT,NAME,WGMS_ID,RIVER_BASIN,FREE_POSITION,LOCAL_CODE,LOCAL_PSFG,GEN_LOCATION,SPEC_LOCATION,LATITUDE,...,GEO-REGION_CODE,GEO-SUBREGION_CODE,FoG_compname,AREA,MATCH_RATIO,MATCH_NAME,AREA_MATCH,DIST_TO_CLOSEST,CLOSEST,AREA_CLOSEST
851,CH,ZMUTT,390,N0125,7,15,15,WESTERN ALPS,RHONE BASIN,46.0,...,CEU,CEU-01,zmutt,17.22,0.909091,zmuttg,13.73688,2320.009358,pointe de zinal-s-ii,0.00343


In [1307]:
# SGI2010 row(s) whose COMPNAME equals the MATCH_NAME stored for the selected glacier
pdfll[pdfll['COMPNAME'] == pda[pda.NAME==i].MATCH_NAME.iloc[0]]

Unnamed: 0,Location(x WGS84),y WGS84),area(km2),Names,Glacier_name_SGI2010,COMPNAME
981,7.59347,45.9795,13.73688,ZMUTTGLETSCHER (Nr. 15),B57/05,zmuttg


In [1308]:
# Last ten STATE-table rows of the selected glacier that have an AREA value
has_area = pdb.AREA.notnull()
pdb[(pdb.WGMS_ID == ID) & has_area].tail(10)

Unnamed: 0,POLITICAL_UNIT,NAME,WGMS_ID,YEAR,HIGHEST_ELEVATION,MEDIAN_ELEVATION,LOWEST_ELEVATION,ELEVATION_UNC,LENGTH,LENGTH_UNC,AREA,AREA_UNC,SURVEY_DATE,SURVEY_PLATFORM_METHOD,PUB_IN_FOG,INVESTIGATOR,SPONS_AGENCY,REFERENCE,REMARKS
3319,CH,ZMUTT,390,1973,,2980.0,,,8.5,,17.22,,19739999.0,,1975.0,,,,
3320,CH,ZMUTT,390,1975,4100.0,2980.0,2230.0,,8.5,,17.22,,19759999.0,,1975.0,,,,


In [1309]:
# Case-insensitive search for 'zmu' in the original SGI2010 names
pdfll[pdfll['Names'].str.contains('zmu', case=False)]

Unnamed: 0,Location(x WGS84),y WGS84),area(km2),Names,Glacier_name_SGI2010,COMPNAME
981,7.59347,45.9795,13.73688,ZMUTTGLETSCHER (Nr. 15),B57/05,zmuttg
991,7.65154,45.981201,0.03553,Zmuttgrat-W*,B57/20n,zmuttgrat-w


## A dictionary with links from FoG to Mauro's database (full names in FoG to full and short names in Mauro's DB)

In [1124]:
# Manually verified links from FoG to Mauro's database.
#   key   : glacier NAME as used in FoG
#   value : (full name in Mauro's DB, simplified name in Mauro's DB); ('', '') means
#           that no unambiguous link could be established (see the trailing comment).
#           Two entries (HANGENDE, MURTEL) carry a third element, presumably the
#           SGI2010 code of the linked polygon - TODO confirm.
# The simplified names follow the notebook's own normalisation (lower case,
# glacier words / umlauts replaced, bracket part and '*' removed).
# Area pairs in the comments are given as FoG / Mauro.
links = {
    'A NEUVE GL. L\'':('',''),   #unclear (no area)
    'ADLER':('ADLERGLETSCHER (Teilgl. von B56/03)','adlerg'),  # not proven (no area in FoG), but quite obvious
    'ALBIGNA':('ALBIGNA VADREC D\' (Nr. 116)','albigna vadrec d\''),  # not proven, maybe error in area in pdb
    'ALLALIN':('Allalingletscher* (Teilgl. von B52/66n)','allaling'), #ok
    'ALPETLI(KANDER)':('KANDERFIRN (Teilgl. von A55B/29n; Nr. 109)','kanderfirn'),  # ok (area:14/12)
    'ALTELS':('',''),     # no equivalent for whole glacier. there is Altels-S, Altels-NW and Altels-SE
    'AMMERTEN':('',''),   # no equivalent for whole glacier. there is Ammerten-W and AMMERTEN
    'AROLLA (BAS)':('',''),  # no equivalent
    'BALMHORN':('BALMHORNGLETSCHER (Teilgl. von A55B/42n)','balmhorng'),  # ok (area: 1.7/1.9)
    'BASODINO':('BASODINO GH. DEL (Nr. 104)','basodino gh. del'),  # ok (area:1.84/1.89). What to do with basodino-N and basodino-NW?
    'BIDER':('BIDERGLETSCHER','biderg'),  # ok (no area in FoG, but unique)
    'BIFERTEN':('BIFERTENFIRN (Nr. 77)','bifertenfirn'),  # ok (area: 2.5/2.86)
    'BIRCH':('',''),   # glacier has two parts; difficult (area:0.54/0.22, but only one entry each)
    'BIS':('BISGLETSCHER (Nr. 107)','bisg'),    # ok even though area 4.79/3.82
    'BLUEMLISALP':('BLÜMLISALPGLETSCHER (Nr. 64)','bluemlisalpg'), # ok area 2.22/2.98
    'BODMER':('BODMER','bodmerg'),   # Link should be okay but area 0.32/0.64. NOTE(review): full and short name do not correspond - confirm against the database
    'BOVEYRE':('BOVEIRE GLACIER DE (Nr. 41)','boveire g de'),  # ok (area 1.62/1.99)
    'BREITHORN':('',''),   # unclear (too many entries in Mauro's DB)
    'BRENEY':('BRENAY GLACIER DU (Nr. 36)','brenay g du'),   # ok (areas 9.8/7.1)
    'BRUNEGG':('BRUNEGGGLETSCHER (Teilgl. von B60/09; Nr. 20)','bruneggg'),  #ok (areas 6.1/5.5)
    'BRUNNI':('BRUNNIFIRN (Nr. 72)','brunnifirn'),  # ok (areas 2.99/2.31)
    'CALDERAS':('CALDERAS VADRET (Nr. 95)','calderas v'),   # ok even though areas 1.2/0.66
    'CAMBRENA':('Cambrena Vadret dal (Teilgl. von C93/09)','cambrena v dal'),  # probably right, but what about Cambrena-E* (Teilgl. von C93/09)? C93/09 not in SGI2010!
    'CAVAGNOLI':('CAVAGNÖÖ GH. DEL (Nr. 119)','cavagnoeoe gh. del'), # probably right due to lat/lon agreement even though area 1.32/0.44
    'CHEILLON':('CHEILON GLACIER DE (Nr. 29)','cheilon g de'), # okay even though area 4.73/3.6. Spelling?
    'CLARIDENFIRN':('',''),  # unclear case => too many equivalent in Mauro's DB and no area/length in FoG
    'CORBASSIERE':('Corbassière (Teilgl. von B83/03)*','corbassiere'),  # unclear if also Combin de Corbassière-E (Teilgl. von B83/03)* is meant (area: 0.4km2)
    'CORNO':('CORNO GH. DEL (Nr. 120)','corno gh. del'),  # ok even though area 0.27/0.1
    'DAMMA':('DAMMAGLETSCHER (Nr. 70)','dammag'),  # ok even though area 6.32/4.24
    'DOLENT GL. DU':('DOLENT GLACIER DU','dolent g du'),  # ok even though no area given in FoG
    'DUNGEL':('TUNGELGLETSCHER (Teilgl. von A56D/09n, Nr. 112)','tungelg'),  #ok area 1.2/0.93
    'EIGER':('',''),   # unclear (areas don't match)
    'EIGER (WEST)':('',''),  # unclear (areas don't match)
    'EN DARREY':('EN DARREY GLACIER DE L\' (Nr. 30)','en darrey g de l\''), #ok (areas 1.86/1.28)
    'FEE NORTH':('',''),  # unclear (probably even more than those mentioned in Mauro's DB)
    'FERPECLE':('FERPÈCLE GLACIER DE (Nr. 25)','ferpecle g de'),  # ok (areas 9.79/9.0)
    'FIESCHER':('FIESCHERGLETSCHER VS (Teilgl. von B40/14n, Nr. 4)','fiescherg vs'), # probably ok due to area
    'FIRNALPELI':('',''),  # three entries: FIRNALPELIGLETSCHER-E (Nr. 75), FIRNALPELIGLETSCHER-W, FIRNALPELIFIRN
    'FORNO':('FORNO VADREC DEL (Nr. 102)','forno vadrec del'), # possibly also Monte del Forno* (0.06), Sella del Forno* (0.02), but NOT Ofenhorn-W* (lat/lon!)
    'GAMCHI':('GAMCHIGLETSCHER (Nr. 61)','gamchig'),  # ok area (1.73/1.23)
    'GAULI':('GAULIGLETSCHER (Teilgl. von A54I/19n) Nr. 52','gaulig'),  # ok (area 13.7/11.4)
    'GELTEN':('GELTENGLETSCHER-E (Nr. 113)','gelteng-e'),  # ok (area 1.17/0.81) but maybe also GELTENGLETSCHER-W (area 0.45)
    'GIETRO':('GIETRO GLACIER DU (Nr. 37)','gietro g du'),  # ok (area 5.54/5.16)
    'GLAERNISCH':('GLÄRNISCHFIRN (Nr. 80)','glaernischfirn'), # ok (area 2.09/1.41)
    'GORNER':('',''),  # problem: gorner not in Mauro's DB; Grenz is too big (whole glacier)
    'GRAND PLAN NEVE':('PLAN NEVE-W','plan neve-w'), # must be plan neve-w (area 0.2/0.18), taking also plan neve-e (0.12km2) would exceed FoG area
    'GRIES':('GRIESGLETSCHER (Nr. 3)','griesg'),  # ok (area 4.83/4.78)
    'GRIESSEN(OBWA.)':('GRIESSENFIRN','griessenfirn'),  # ok (area 1.27/0.86)
    'GRIESSERNU':('GRIESSERNU GLETSCHER','griessernu g'), # ok even though no area in FoG
    'GROSSER ALETSCH':('GROSSER ALETSCH GLETSCHER (Teilgl. von B36/49n) Nr. 5','grosser aletsch g'), # ok (area 81.3/78.3)
    'GRUBEN':('',''), # should be all GRÜEBUGLETSCHER-N-II (Teilgl. von B51/17n), GRÜEBUGLETSCHER-S (Teilgl. von B51/17n), GRÜEBUGLETSCHER-N-I (Teilgl. von B51/17n), NAME SHOULD  BE CHANGED IN FoG
    'GUTZ':('GUTZGLETSCHER','gutzg'),  # ok even though no area in FoG
    'HANGENDE':('HANGENDE GLETSCHER','hangende g', 'B52/27'), # ok due to lat/lon
    'HOHLAUB':('',''), # both Hohlaubgrat-E* (Teilgl. von B52/67n) and Hohlaub Gletscher* (Teilgl. von B52/67n)
    'HOMATTU':('',''), #  HOMATTUGLETSCHER-II and HOMATTUGLETSCHER-I
    'HUEFI':('HÜFIFIRN (Nr. 73)','huefifirn'),  # ok (area 13.73/12.72)
    'KALTWASSER':('CHALTWASSERGLETSCHER (Nr. 7)','chaltwasserg'),  # ok (areas 18.5/1.49)
    'KEHLEN':('CHELENGLETSCHER (Totalgl.; Nr. 68)','cheleng'),  # ok even though area 1.73/3.15
    'KESSJEN':('CHESSJENGLETSCHER-E (Nr. 12)','chessjeng-e'), #ok (areas 0.19/0.19)
    'LAEMMERN (WILDSTRUBEL)':('WILDSTRUBELGLETSCHER (Teilgl. von A55C/24n) Nr. 63','wildstrubelg'), # ok even though area 3.15/2.34
    'LANG':('Langgletscher (Totalgl.; Nr. 18)','langg'),  # ok areas 10.03/8.26
    'LAVAZ':('',''),  # LAVAZ GLATSCHER DA (Nr. 82) and Lavaz-W*
    'LIMMERN':('LIMMERNFIRN (Nr. 78)','limmernfirn'), # ok (area 2.41/1.89)
    'LISCHANA':('TRIAZZA VADRET DA','triazza v da'),  # ok v d triazza is one of the two remnants (one is no longer mapped)
    'MAIGHELS EAST BRANCH':('MAIGHELS GLATSCHER DA-E','maighels glatscher da-e'),  # ok but no area in FoG
    'MAIGHELS WEST BRANCH':('MAIGHELS GLATSCHER DA-W','maighels glatscher da-w'),  # ok but no area in FoG
    'MARTINETS':('MARTINETS GLACIER DES (Nr. 46)','martinets g des'),
    'MINSTIGER':('MINSTIGERGLETSCHER','minstigerg'),  # ok areas 3.09/2.25
    'MITTELALETSCH':('MITTELALETSCHGLETSCHER (Teilgl. von B36/49n) Nr. 106','mittelaletschg'),
    'MOIRY':('MOIRY GLACIER DE (Nr. 24)','moiry g de'), # ok area 6.11/4.89
    'MOMING':('MOMING GLACIER DE (Nr. 23)','moming g de'),  # ok , maybe also Pointe Nord de Moming-SE* and Blanc de Moming-W*
    'MONT FORT (ZINAL)':('',''),  # not clear, probably PETIT M. FORT GLACIER DU (B75/07), but area is still too low
    'MONT MINE':('MONT MINÉ GLACIER DU (Nr. 26)','mont mine g du'), # ok areas 10.3/9.9
    'MONTO MORO GL.':('Monte Moro-W*','monte moro-w'),  # only a remnant obviously
    'MORTERATSCH':('MORTERATSCH VADRET DA (Totalgl.; Nr. 94)','morteratsch v da'), # ok areas 17/15
    'MURTEL':('MURTEL VADRET DAL','murtel v dal', 'E23/16'),  # ok, strange second polygon E24/04
    'MUTT':('MUTTGLETSCHER (Nr. 2)','muttg'), # ok, area 0.57/0.36
    'MUTTEN':('Muttengletscher* (Teilgl. von A51E/23)','mutteng'), #ok, no areas in FoG
    'OB.GRINDELWALD':('OBERER GRINDELWALDGLETSCHER (Nr. 57)','oberer grindelwaldg'), # ok areas 10.07/8.41
    'OBERAAR':('OBERAARGLETSCHER (Teilgl. von A54G/35n) Nr. 50','oberaarg'), # ok areas 5.23/4.10
    'OFENTAL':('OFENTAL GLETSCHER (Nr. 9)','ofental g'),  #ok even though areas 0.4/0.04...possibly one remnant missing in Mauro's DB (see swisstopo)
    'OTEMMA':('Otemma (Teilgl. von B82/27)*','otemma'),  # ok areas 16.55/12.59
    'PALUE':('Palü Vadret da (Teilgl. von C93/04)','palue v da'),  # ok areas 6.62/5.26
    'PANEYROSSE':('PANEIROSSE GLACIER DE (Nr. 44)','paneirosse g de'),  # ok areas 0.45/0.3
    'PARADIES':('PARADIESGLETSCHER (Nr. 86)','paradiesg'),  # ok 4.6/1.8
    'PARADISINO (CAMPO)':('',''),  # unclear, possibly CAMP VEDREIT DA (Nr. 101)
    'PIERREDAR':('PIERREDAR GLACIER DE (Nr. 49)','pierredar g de'), # ok areas 0.67/0.3
    'PIZOL':('PIZOLGLETSCHER (Nr. 81)','pizolg'), # ok areas 0.32/0.08
    'PLAINE MORTE':('',''), # multiple: GLACIER DE LA PLAINE MORTE (Nr. 65), PLAINE MORTE-W GLACIER DE LA and PLAINE MORTE-E GLACIER DE LA, area in FoG missing!!!
    'PORCHABELLA':('PORCHABELLA VADRET DA (Nr. 88)','porchabella v da'),  # ok area 2.59/1.67
    'PRAPIO':('PRAPIO GLACIER DU (Nr. 48)','prapio g du'),   # ok area 0.36/0.21
    'PUNTEGLIAS':('Fuorcla Punteglias*','fuorcla punteglias'),  #  ok area 0.93/0.25 => debris cover problem
    'RAETZLI':('',''),  # no equivalent in Mauro's DB...only plaine morte present
    'RHONE':('Rhonegletscher* (Teilgl. von B43/03)','rhoneg'),  # ok areas 15.8/15.31
    'RIED':('RIEDGLETSCHER (Nr. 17)','riedg'),  # ok areas 8.26/7.31
    'ROSEG':('ROSEG VADRET DA (Nr. 92)','roseg v da'),  # ok areas 8.71/6.81
    'ROSENLAUI':('ROSENLAUIGLETSCHER (Nr. 56)','rosenlauig'),  # ok areas 5.9/5.4
    'ROSSBODEN':('ROSSBODEGLETSCHER (Nr. 105)','rossbodeng'),  # ok areas 1.89/1.18. NOTE(review): full and short name do not correspond - confirm against the database
    'ROTTAL':('ROTTALGLETSCHER-NW (Teilgl. von B52/02)','rottalg-nw'),  #  ok no area in FoG
    'SALEINA':('',''),  # is Saleina* (Teilgl. von B85/16), but area is too high (6.54) compared to FoG (5.03)
    'SANKT ANNA':('ST. ANNAFIRN (Nr. 67)','st. annafirn'),  # ok even though areas 0.41/0.22 (might be debris problem)
    'SARDONA':('',''),  # should be Sardonagletscher-II*, but area is too high (0.45, FoG only 0.38)
    'SCALETTA':('SCALETTAGLETSCHER (Nr. 115)','scalettag'),  # ok even though area 0.66/0.21 (debris?)
    'SCHOENBUEHL GL.':('',''),  # is probably both SCHÖNBÜHLGLETSCHER-SE and SCHÖNBÜHLGLETSCHER-NW (no FoG area given)
    'SCHWARZ':('SCHWARZGLETSCHER (Nr. 62)','schwarzg'),  #  okay areas 1.6/1.09
    'SCHWARZBACH':('SCHWARZBACHFIRN','schwarzbachfirn'),  #  okay (not area in FoG)
    'SCHWARZBERG':('Schwarzberggletscher* (Teilgl. von B52/24)','schwarzbergg'),  #  ok area 5.17/5.33
    'SESVENNA':('Sesvenna Vadret da-E (Teilgl. von E03/04)','sesvenna v da-e'),  #  ok area 0.67/0.33
    'SEX ROUGE':('SEX ROUGE GLACIER DU (Nr. 47)','sex rouge g du'), # ok even though area 0.72/0.27
    'SILLERN':('SILLERE GLETSCHER','sillere g'),  #  ok (no area in FoG)
    'SILVRETTA':('SILVRETTAGLETSCHER (Nr. 90)','silvrettag'),  # ok areas 2.74/2.67
    'SIRWOLTE':('',''),  #unclear: might be norther polygon of Griessernuhorn-N* (c03/04)
    'STEIN':('STEINGLETSCHER (Nr. 53)','steing'),  # ok even though area in Mauro's DB (5.68) slightly bigger than in FoG (5.6)
    'STEINLIMMI':('STEINLIMIGLETSCHER (Nr. 54)','steinlimig'),  # ok areas 2.21/1.59
    'SULZ':('',''),  #  must be HINTERSULZFIRN (Nr. 79), A50I/02,  (lat/lon), but area is bigger (0.26) than in FoG (0.2)
    'SURETTA':('',''), # must be SURETTAGLETSCHER-E (Piz Por*), SURETTAGLETSCHER-W (Hauptgl., Nr. 87) and maybe also Suretta Lückli*. FoG point unclear
    'TIATSCHA':('TIATSCHA VADRET (La Cudera, Nr. 96)','tiatscha v'), # okay areas 2.11/1.82
    'TIEFEN':('TIEFENGLETSCHER (Nr. 66)','tiefeng'), # ok even though area 3.17/1.99
    'TOURNELON BLANCE':('',''),  # should be all (no FoG area): Tournelon Blanc-SE*, Tournelon Blanc-E*, Tournelon Blanc-NE* and Col du Tournelon Blanc*
    'TRIENT':('TRIENT GLACIER DU (Nr. 43)','trient g du'),  # ok area 6.58/5.82
    'TRIEST GL.':('DRIESTGLETSCHER','driestg'),  # ok no area in FoG
    'TRIFT (GADMEN)':('TRIFTGLETSCHER (Nr. 55)','triftg'),  # ok area 15.33/14.9
    'TSANFLEURON':('TSANFLEURON GLACIER DE (Nr. 33)','tsanfleuron g de'),  # area 3.78/2.64
    'TSCHIERVA':('',''),  # probably both TSCHIERVA VADRETTIN DA (0.41) and TSCHIERVA VADRET DA (Nr. 93) (5.09), FoG area is 6.83
    'TSCHINGEL':('',''),  # is TSCHINGELFIRN (Nr. 60) (A54/M21),  and maybe also Tschingelspitz-S* (A54M/51n) and Tschingelgrat-S* (A54M/52n)
    'TSIDJIORE NOUVE':('TSIJIORE NOUVE GLACIER DE (Nr. 28)','tsijiore nouve g de'), # ok even though area 3.12/2.72
    'TURTMANN (WEST)':('',''),  # unclear: TURTMANNGLETSCHER (Teilgl. von B60/09) is only about half (5.5) of turtmann-w in FoG (11km2)
    'UNT.GRINDELWALD':('',''),  # might be OBERS ISCHMEER (Teilgl. von A54L/19)
    'UNTERAAR':('UNTERAARGLETSCHER (Teilgl.von A54G/50n) Nr. 51','unteraarg'), # ok area 22.7/22.5
    'VAL TORTA':('VALTORTA VADRET','valtorta v'),  # ok area 0.17/0.06
    'VALLEGGIA':('VALLEGGIA GH. DI (Nr. 117)','valleggia gh. di'),  # ok area 0.59/0.30
    'VALSOREY':('Valsorey (Teilgl. von B84/15)*','valsorey'),  #  ok area 2.34/1.9
    'VERSTANKLA':('Verstanclagletscher (Teilgl. von A10G/08)','verstanclag'), #ok area 1.06/0.71
    'VORAB':('VORAB GLATSCHER DIL (Nr. 85)','vorab glatscher dil'), # ok area 2.51/1.22, could also be Vorabsattel-W* (but drain ins other valley)
    'WALLENBUR':('WALLENBURFIRN (Nr. 71)','wallenburfirn'), # ok area 1.7/1.41
    'WANNENHORN GL.':('',''),  # should be both WANNENHORNGLETSCHER-NW (Teilgl. von B36/57n) and WANNENHORNGLETSCHER-SE (Teilgl. von B36/57n) together...check PARENT ID!!!
    'WANNENHORN GL. N':('WANNENHORNGLETSCHER-NW (Teilgl. von B36/57n)','wannenhorng-nw'), # must be this one due to FoG lat/lon
    'WANNENHORN GL. S':('WANNENHORNGLETSCHER-SE (Teilgl. von B36/57n)','wannenhorng-se'), # must be this one due to FoG lat/lon
    'WITENWASSEREN':('WITENWASSERENGLETSCHER','witenwassereng'), # no FoG area given....could also include Witenwasseren-W*(but drains in another valley)
    'ZENBAECHEN GL.':('ZENBAECHENGLETSCHER','zenbaecheng'), # no FoG area given
    'ZINAL':('ZINAL GLACIER DE (Nr. 22)','zinal g de'),  # ok area 16/13.3 debris?
    'ZMUTT':('ZMUTTGLETSCHER (Nr. 15)','zmuttg'),  # ok area 17.4/13.7 debris?
}

# Have a look at the area (Mauro) / area (FoG) difference for the matching glaciers

In [None]:
# Flag the glaciers whose area at t2 is close to the FoG area found by string
# matching: |area_t2 - FoG area| <= 0.4 + 0.15 * |FoG area|  (np.isclose).
# NOTE(review): the 'FoG_AREA_MATCH' column is not created in the cells shown
# here - it has to exist in `pdf` before this cell runs.
pdf['match_area_isclose'] = np.isclose(pdf['area_t2_km2'], pdf['FoG_AREA_MATCH'], rtol=0.15, atol=0.4)
groups = pdf.groupby('match_area_isclose').groups

# One scatter plot, deviating glaciers in red, agreeing ones in blue
fig, ax = plt.subplots(figsize=(5, 5), dpi=200)
for is_close, colour in ((False, 'DarkRed'), (True, 'DarkBlue')):
    pdf.loc[groups[is_close]].plot(x='area_t2_km2', y='FoG_AREA_MATCH', kind='scatter', ax=ax, color=colour, label=str(is_close));

In [None]:
# Same comparison for the FoG glacier found by distance:
# |area_t2 - FoG area| <= 0.4 + 0.15 * |FoG area|  (np.isclose).
# NOTE(review): the 'FoG_AREA_CLOSEST' column is not created in the cells
# shown here - it has to exist in `pdf` before this cell runs.
pdf['closest_area_isclose'] = np.isclose(pdf['area_t2_km2'], pdf['FoG_AREA_CLOSEST'], rtol=0.15, atol=0.4)
groups = pdf.groupby('closest_area_isclose').groups

# One scatter plot, deviating glaciers in red, agreeing ones in blue
fig, ax = plt.subplots(figsize=(5, 5), dpi=200)
for is_close, colour in ((False, 'DarkRed'), (True, 'DarkBlue')):
    pdf.loc[groups[is_close]].plot(x='area_t2_km2', y='FoG_AREA_CLOSEST', kind='scatter', ax=ax, color=colour, label=str(is_close));

In [None]:
# Columns of interest for judging the matches
overview_columns = ['Glacier_name_SGI2010', 'COMPNAME', 'MATCH_NAME', 'MATCH_RATIO',
                    'match_area_isclose', 'closest_area_isclose',
                    'area_t2_km2', 'FoG_AREA_MATCH', 'FoG_AREA_CLOSEST']

# Glaciers for which both area checks succeed. Only the value of a cell's last
# expression is displayed, so this selection is evaluated but not shown.
both_close = (pdf.match_area_isclose == True) & (pdf.closest_area_isclose == True)
pdf.loc[both_close][overview_columns].head(20)

# First 20 glaciers, regardless of the area checks (this is what is displayed)
pdf[overview_columns].head(20)

# Begin to establish new dataset containing the columns from the CHANGE file

In [None]:
# Empty frames with the column layout of the FoG CHANGE (pdd) and GENERAL (pda)
# tables; the new entries are collected in them.
entries_new_pdd, entries_new_pda = (
    pd.DataFrame(columns=template.columns.values) for template in (pdd, pda)
)

In [None]:
def _records_to_frame(records, base_columns):
    """Turn a list of row dicts into a DataFrame.

    The columns of ``base_columns`` come first (in the given order); keys that
    occur only in the records are appended behind them so that no value is lost.
    """
    columns = list(base_columns)
    for record in records:
        for key in record:
            if key not in columns:
                columns.append(key)
    return pd.DataFrame(records, columns=columns)


# Build one new CHANGE (pdd) row and one new GENERAL (pda) row per glacier of
# Mauro's database. Rows are collected as dicts and converted to DataFrames
# once at the end: growing a frame with DataFrame.append inside the loop is
# quadratic, and that method no longer exists in pandas >= 2.0.
pdd_columns = list(pdd.columns.values)
pda_columns = list(pda.columns.values)
pdd_records = []
pda_records = []
codes_without_coordinates = []   # SGI2010 codes that have no entry in the lat/lon table

for idx, cols in pdf.iterrows():
    # TODO: here comes Horst's name test: if the glacier is already present,
    # use its ID, leave the GENERAL file as is and fill in the MB data only;
    # else create a new entry in the GENERAL file and in the CHANGE file.
    # So far every glacier gets a new entry.

    # create new ID (uses the row label of `pdf` as position in the ID range)
    gid = new_id_range[idx]

    # WGS84 coordinates; `pdfll.Glacier_name_SGI2010` is the column that is
    # compared against the SGI2010 code
    sgi_point = pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]
    if len(sgi_point):
        latitude = sgi_point['y WGS84)'].values[0]
        longitude = sgi_point['Location(x WGS84)'].values[0]
    else:
        # keep going instead of failing with an IndexError, but remember the code
        latitude = longitude = np.nan
        codes_without_coordinates.append(cols.Code_SGI2010)

    # the REMARKS of the CHANGE entry
    REMARKS_pdd = ('Survey date method: PL.'
                   ' Reference date method: PM.'
                   ' ID SGI 1973: %s.' % cols.ID_SGI1973
                   + ' ID SGI 2010: %s.' % cols.Code_SGI2010)

    # CHANGE entry: every column starts as an empty string
    glacier_pdd = dict.fromkeys(pdd_columns, '')
    glacier_pdd.update({
        'POLITICAL_UNIT': 'CH',
        'NAME': cols.Glacier_name_SGI2010.upper(),
        'WGMS_ID': gid,
        'YEAR': cols.t2_year,
        'LOWER_BOUND': 9999,
        'UPPER_BOUND': 9999,
        'AREA_SURVEY_YEAR': cols.area_t2_km2,
        'AREA_CHANGE': (cols.area_t2_km2 - cols.area_t1_km2) * 1000,  # *1000: unit difference (1000m2 and km2)
        'AREA_CHANGE_UNC': np.nan,
        'THICKNESS_CHG': np.nan,                  # can be calculated???
        'THICKNESS_CHG_UNC': np.nan,
        'VOLUME_CHANGE': cols.dvol_mio_m3_between_t1_and_t2 * 1000,   # *1000: unit difference (1000m3 / 1000000m3)
        # the uncertainty needs the same unit conversion as the volume change itself
        'VOLUME_CHANGE_UNC': cols.uncertainty_dvol_between_t1_and_t2_mio_m3 * 1000,
        'SURVEY_DATE': int(cols.t2_year * 10000 + 9999),     # in order to fulfill the requirements (month/day unknown)
        'SD_PLATFORM_METHOD': 'aC',   # must be determined
        'REFERENCE_DATE': int(cols.t1_year * 10000 + 9999),  # in order to fulfill the requirements (month/day unknown)
        'RD_PLATFORM_METHOD': 'aC',   # must be determined
        'PUB_IN_FOG': 2016,
        'INVESTIGATOR': 'Mauro Fischer, Matthias Huss, Martin Hoelzle',
        'SPONS_AGENCY': 'Department of Geosciences, University of Fribourg, 1700 Fribourg, Switzerland',
        'REFERENCE': 'Fischer et al. (2015); The Cryosphere, 9, 2016',
        'REMARKS': REMARKS_pdd,
    })

    # GENERAL entry: every column starts as an empty string.
    # (The key must read POLITICAL_UNIT; a misspelt name would not fill the column.)
    glacier_pda = dict.fromkeys(pda_columns, '')
    glacier_pda.update({
        'POLITICAL_UNIT': 'CH',
        'NAME': cols.Glacier_name_SGI2010.upper(),
        'WGMS_ID': int(gid),
        'LATITUDE': latitude,
        'LONGITUDE': longitude,
        'REGION': 'Central Europe',
        'SUBREGION': 'Alps',
    })

    pdd_records.append(glacier_pdd)
    pda_records.append(glacier_pda)

# Assigning fresh frames (instead of appending to existing ones) keeps the cell
# idempotent: running it twice does not duplicate the entries.
entries_new_pdd = _records_to_frame(pdd_records, pdd_columns)
entries_new_pda = _records_to_frame(pda_records, pda_columns)

In [None]:
# Columns of the new CHANGE entries that must hold integers
int_columns = ['WGMS_ID', 'YEAR', 'LOWER_BOUND', 'UPPER_BOUND', 'SURVEY_DATE', 'REFERENCE_DATE', 'PUB_IN_FOG']
entries_new_pdd[int_columns] = entries_new_pdd[int_columns].astype(int)
entries_new_pdd.head()

In [None]:
# Number of rows in Mauro's database
len(pdf)

In [None]:
# First rows of the new GENERAL entries
entries_new_pda.head()
# latitude lookup for the row last held in `cols`, kept for debugging:
#pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]['y WGS84)'].values[0]

In [None]:
# First rows of the lat/lon table
pdfll.head()

In [None]:
# SGI2010 code of the row that `cols` currently refers to (left over from the last loop)
cols.Code_SGI2010

In [None]:
# Rows of the lat/lon table that belong to this SGI2010 code (empty if the code is missing there)
pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]