# Modules

In [23]:
import pandas as pd
import numpy as np
import unidecode
from difflib import SequenceMatcher
import salem

# File paths

In [64]:
f_path = 'C:\\Users\\jlandman\\Desktop\\database_Fischer_et_al._2015_The_Cryosphere.txt'                 # Fischer database with swiss coordinates 
fll_path = 'C:\\Users\\jlandman\\Desktop\\SGI2010wgs84_shapefiles\\parameters_SGI2010.csv'               # Fischer database with lat/lon
a_path = 'C:\\Users\\jlandman\\Desktop\\DOI-WGMS-FoG-2015-11\\WGMS-FoG-2015-11-A-GENERAL-INFORMATION.csv'# FoG: A GENERAL 
d_path = 'C:\\Users\\jlandman\\Desktop\\DOI-WGMS-FoG-2015-11\\WGMS-FoG-2015-11-D-CHANGE.csv'             # FoG: D CHANGE

# Read files

# Preselect FoG IDs in Switzerland

In [105]:
pda = pda[pda.POLITICAL_UNIT == 'CH']

# Haversine function

In [106]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between one point 
    on the earth and an array of points (specified in decimal degrees)
    """
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in meters
    return c * r

# Some settings

In [107]:
begin_new_id = 7000  # where to start with assigning new WGMS IDs
new_id_range = range(begin_new_id, begin_new_id+len(pdf)+1,1)
new_id_range

range(7000, 8386)

# Correct missing underscores and column names

In [108]:
pdf = pdf.rename(columns={'uncertainty_dvol_between_t1_and_t2_mio m3': 'uncertainty_dvol_between_t1_and_t2_mio_m3'})
pdfll = pdfll.rename(columns={'uncertainty_dvol_between_t1_and_t2_mio m3': 'uncertainty_dvol_between_t1_and_t2_mio_m3'})

pdfll = pdfll.rename(columns={'Unnamed: 15': 'Glacier_name_SGI2010'})

# Take over the FoG ID for all Fischer/PDA duplicates => compare names and location

In [109]:
# dictionary of strings and symbols that need to be replaced prior to calculation of similarity score
name_adjust = {'gletscher':'', 'glacier':'' , 'vadret':'', 'ghiacciaio':'', 'vedretta':'', 'ferner':'', 'ä':'ae', 'ö':'oe', 'ü':'ue', 'é':'e', 'à':'a', 'è':'e', 'ô':'o', 'ß':'ss'} 

In [110]:
# introduce FoG column with corrected names
pda['FoG_compname'] = ''

for initname in [n for n in pda.NAME.values]:
    name = initname.lower()
    for key in name_adjust:
        if key in name:
            name = name.replace(key, name_adjust[key])
    #pda.FoG_compname[pda.NAME == initname] = name
    pda.loc[pda.NAME == initname, 'FoG_compname'] = name

In [129]:
pdf.COMPNAME = ''
pdf.MATCH_RATIO = np.nan
pdf.MATCH_NAME = ''
pdf.CLOSEST = ''
pdf.DIST_TO_CLOSEST = np.nan

# adjust Mauro's names
for idx,cols in pdf.iterrows():
    # simplify name
    compname_mau = cols.Glacier_name_SGI2010.lower()
    for key in name_adjust:
        if key in compname_mau:
            compname_mau = compname_mau.replace(key, name_adjust[key])
            
    pdf.loc[pdf.Glacier_name_SGI2010 == cols.Glacier_name_SGI2010, 'COMPNAME'] = compname_mau
    
    ratio = 0.0
    name = ''
    for cname in pda.FoG_compname.values:
        curr_ratio = SequenceMatcher(None, compname_mau, cname).ratio()
        
        if curr_ratio > ratio:
            ratio = curr_ratio
            name = cname
            
    pdf.loc[pdf.Glacier_name_SGI2010 == cols.Glacier_name_SGI2010, 'MATCH_RATIO'] = ratio
    pdf.loc[pdf.Glacier_name_SGI2010 == cols.Glacier_name_SGI2010, 'MATCH_NAME'] = name
    
    lat = pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]['y WGS84)'].values[0]
    lon = pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]['Location(x WGS84)'].values[0]
    
    dist = np.nan
    close_name = ''
    for pda_idx, pda_cols in pda.iterrows():
        curr_dist = haversine(lon, lat, pda_cols.LONGITUDE, pda_cols.LATITUDE)
        
        if curr_dist < dist or pd.isnull(dist):
            dist = curr_dist
            close_name = pda_cols.NAME
            
    pdf.loc[pdf.Glacier_name_SGI2010 == cols.Glacier_name_SGI2010, 'DIST_TO_CLOSEST'] = dist
    pdf.loc[pdf.Glacier_name_SGI2010 == cols.Glacier_name_SGI2010, 'CLOSEST'] = close_name

In [130]:
pdf[['Glacier_name_SGI2010', 'COMPNAME', 'MATCH_RATIO', 'MATCH_NAME', 'DIST_TO_CLOSEST', 'CLOSEST']].head(50)

Unnamed: 0,Glacier_name_SGI2010,COMPNAME,MATCH_RATIO,MATCH_NAME,DIST_TO_CLOSEST,CLOSEST
0,Leidhorn*,leidhorn*,0.666667,breithorn,6756.075703,VERSTANKLA
1,SEEGLETSCHER,see,0.6,tseudet,6613.557695,SILVRETTA
2,SILVRETTAGLETSCHER (Nr. 90),silvretta (nr. 90),0.666667,silvretta,760.841538,SILVRETTA
3,CHAMMGLETSCHER (Teilgl. von A10G/08),chamm (teilgl. von a10g/08),0.457143,cheillon,757.002576,SILVRETTA
4,Chlein Wintertaelli*,chlein wintertaelli*,0.47619,laemmern (wildstrubel),1382.392239,VERSTANKLA
5,ROGGEN GLETSCHER,roggen,0.5,rhone,4341.712935,VERSTANKLA
6,VERNELA GLETSCHER,vernela,0.666667,verstankla,650.965735,VERSTANKLA
7,ZADRELL GLETSCHER,zadrell,0.5,arolla (bas),2279.548915,TIATSCHA
8,Plattenhoerner*,plattenhoerner*,0.482759,wannenhorn gl.,3182.350557,VERSTANKLA
9,JÖRIGLETSCHER,joeri,0.545455,bodmer,7642.41257,SCALETTA


# Begin to establish new dataset containing the columns from the CHANGE file

In [30]:
entries_new_pdd = pd.DataFrame(columns=pdd.columns.values)
entries_new_pda = pd.DataFrame(columns=pda.columns.values)

In [31]:
for idx,cols in pdf.iterrows():
    # here comes Horst's name test: if the glacier is already present, use its ID, leave the GENERAL file as is and fill
    # in the MB data only
    # else: create new entry in GENERAL file and in CHANGE file
    
    # if   :
    
    # else:
    
    # create new ID
    gid = new_id_range[idx]
    
    # set the REMARKS (added at the end)
    REMARKS_pdd = ''
    REMARKS_pda = ''
    
    # two new DFs for the different FoG files
    glacier_pdd = pd.DataFrame([['']*len(pdd.columns.values)], columns=pdd.columns.values)
    glacier_pda = pd.DataFrame([['']*len(pda.columns.values)], columns=pda.columns.values)
    
    glacier_pdd.POLITICAL_UNIT = 'CH'
    glacier_pdd.NAME = cols.Glacier_name_SGI2010.upper()
    glacier_pdd.WGMS_ID = gid
    glacier_pdd.YEAR = cols.t2_year
    glacier_pdd.LOWER_BOUND = 9999
    glacier_pdd.UPPER_BOUND = 9999
    glacier_pdd.AREA_SURVEY_YEAR = cols.area_t2_km2
    glacier_pdd.AREA_CHANGE = (cols.area_t2_km2 - cols.area_t1_km2) * 1000  # *1000: unit difference (1000m2 and km2)
    glacier_pdd.AREA_CHANGE_UNC = np.nan
    glacier_pdd.THICKNESS_CHG = np.nan                  # can be calculated???
    glacier_pdd.THICKNESS_CHG_UNC = np.nan
    glacier_pdd.VOLUME_CHANGE = cols.dvol_mio_m3_between_t1_and_t2 *1000   # *1000: unit difference (1000m3 / 1000000m3)
    glacier_pdd.VOLUME_CHANGE_UNC = cols.uncertainty_dvol_between_t1_and_t2_mio_m3 # *1000:unit difference (1000m3 / 1000000m3)
    glacier_pdd.SURVEY_DATE = int(cols.t2_year *10000 + 9999)  # in order to fulfill the requirements (month/day unknown)
    glacier_pdd.SD_PLATFORM_METHOD =  'aC'   # must be determined
    REMARKS_pdd = REMARKS_pdd + 'Survey date method: PL.'
    glacier_pdd.REFERENCE_DATE = int(cols.t1_year *10000 + 9999)  # in order to fulfill the requirements (month/day unknown)
    glacier_pdd.RD_PLATFORM_METHOD = 'aC'   # must be determined
    REMARKS_pdd = REMARKS_pdd + ' Reference date method: PM.'
    glacier_pdd.PUB_IN_FOG = int(2016)
    glacier_pdd.INVESTIGATOR = 'Mauro Fischer, Matthias Huss, Martin Hoelzle'
    glacier_pdd.SPONS_AGENCY = 'Department of Geosciences, University of Fribourg, 1700 Fribourg, Switzerland'
    glacier_pdd.REFERENCE = 'Fischer et al. (2015); The Cryosphere, 9, 2016'
    REMARKS_pdd = REMARKS_pdd + ' ID SGI 1973: %s.' % cols.ID_SGI1973
    REMARKS_pdd = REMARKS_pdd + ' ID SGI 2010: %s.' % cols.Code_SGI2010
    # at the very end 
    glacier_pdd.REMARKS = REMARKS_pdd
    
    
    glacier_pda.POLTITICAL_UNIT = 'CH'
    glacier_pda.NAME = cols.Glacier_name_SGI2010.upper()
    glacier_pda.WGMS_ID = int(gid) 
    glacier_pda.LATITUDE = pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]['y WGS84)'].values[0]
    glacier_pda.LONGITUDE = pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]['Location(x WGS84)'].values[0]
    glacier_pda['REGION'] = 'Central Europe'
    glacier_pda['SUBREGION'] = 'Alps'

    
    entries_new_pdd = entries_new_pdd.append(glacier_pdd, ignore_index=True)
    entries_new_pda = entries_new_pda.append(glacier_pda, ignore_index=True)

In [33]:
entries_new_pdd[['WGMS_ID', 'YEAR', 'LOWER_BOUND', 'UPPER_BOUND', 'SURVEY_DATE', 'REFERENCE_DATE', 'PUB_IN_FOG']] = entries_new_pdd[['WGMS_ID', 'YEAR', 'LOWER_BOUND', 'UPPER_BOUND', 'SURVEY_DATE', 'REFERENCE_DATE', 'PUB_IN_FOG']].astype(int)
entries_new_pdd.head()

Unnamed: 0,POLITICAL_UNIT,NAME,WGMS_ID,YEAR,LOWER_BOUND,UPPER_BOUND,AREA_SURVEY_YEAR,AREA_CHANGE,AREA_CHANGE_UNC,THICKNESS_CHG,...,VOLUME_CHANGE_UNC,SURVEY_DATE,SD_PLATFORM_METHOD,REFERENCE_DATE,RD_PLATFORM_METHOD,PUB_IN_FOG,INVESTIGATOR,SPONS_AGENCY,REFERENCE,REMARKS
0,CH,LEIDHORN*,7000,2009,9999,9999,0.0081,-70.7,,,...,0.4166,20099999,aC,19739999,aC,2016,"Mauro Fischer, Matthias Huss, Martin Hoelzle","Department of Geosciences, University of Fribo...","Fischer et al. (2015); The Cryosphere, 9, 2016",Survey date method: PL. Reference date method:...
1,CH,SEEGLETSCHER,7001,2009,9999,9999,0.2581,-332.5,,,...,0.9841,20099999,aC,19919999,aC,2016,"Mauro Fischer, Matthias Huss, Martin Hoelzle","Department of Geosciences, University of Fribo...","Fischer et al. (2015); The Cryosphere, 9, 2016",Survey date method: PL. Reference date method:...
2,CH,SILVRETTAGLETSCHER (NR. 90),7002,2008,9999,9999,2.665,-585.0,,,...,3.0078,20089999,aC,19859999,aC,2016,"Mauro Fischer, Matthias Huss, Martin Hoelzle","Department of Geosciences, University of Fribo...","Fischer et al. (2015); The Cryosphere, 9, 2016",Survey date method: PL. Reference date method:...
3,CH,CHAMMGLETSCHER (TEILGL. VON A10G/08),7003,2008,9999,9999,0.1281,-83.8,,,...,1.5588,20089999,aC,19859999,aC,2016,"Mauro Fischer, Matthias Huss, Martin Hoelzle","Department of Geosciences, University of Fribo...","Fischer et al. (2015); The Cryosphere, 9, 2016",Survey date method: PL. Reference date method:...
4,CH,CHLEIN WINTERTAELLI*,7004,2008,9999,9999,0.0694,-38.7,,,...,0.6192,20089999,aC,19629999,aC,2016,"Mauro Fischer, Matthias Huss, Martin Hoelzle","Department of Geosciences, University of Fribo...","Fischer et al. (2015); The Cryosphere, 9, 2016",Survey date method: PL. Reference date method:...


In [20]:
len(pdf)

1385

In [35]:
entries_new_pda.head()
#pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]['y WGS84)'].values[0]

Unnamed: 0,EXPOS_ABL_AREA,EXPOS_ACC_AREA,FORM,FREE_POSITION,FRONTAL_CHARS,GEN_LOCATION,GEO-REGION_CODE,GEO-SUBREGION_CODE,LATITUDE,LOCAL_CODE,...,NAME,PARENT_GLACIER,POLITICAL_UNIT,PRIM_CLASSIFIC,REGION,REMARKS,RIVER_BASIN,SPEC_LOCATION,SUBREGION,WGMS_ID
0,,,,,,,,,46.881199,,...,LEIDHORN*,,,,Central Europe,,,,Alps,7000.0
1,,,,,,,,,46.8932,,...,SEEGLETSCHER,,,,Central Europe,,,,Alps,7001.0
2,,,,,,,,,46.854198,,...,SILVRETTAGLETSCHER (NR. 90),,,,Central Europe,,,,Alps,7002.0
3,,,,,,,,,46.843201,,...,CHAMMGLETSCHER (TEILGL. VON A10G/08),,,,Central Europe,,,,Alps,7003.0
4,,,,,,,,,46.836899,,...,CHLEIN WINTERTAELLI*,,,,Central Europe,,,,Alps,7004.0


In [14]:
pdfll.head()

Unnamed: 0,Location(x WGS84),y WGS84),Glacier_name_SGI2010
0,9.99469,46.881199,A10F/01
1,10.0202,46.8932,A10G/02
2,10.0721,46.854198,A10G/05
3,10.0795,46.843201,A10G/07
4,10.0424,46.836899,A10G/09


In [15]:
cols.Code_SGI2010

'E73/08n'

In [16]:
pdfll[pdfll.Glacier_name_SGI2010 == cols.Code_SGI2010]

Unnamed: 0,Location(x WGS84),y WGS84),Glacier_name_SGI2010
1418,10.2343,46.895302,E73/08n
