# Fix SUMup errors

#### Author: Megan Thompson-Munson
#### Date created: 24 September 2021

In [1]:
import numpy as np
import cartopy.crs as ccrs
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import pickle

## 1. Open dataset and create dataframe of raw data

In [2]:
# Location of the SUMup density NetCDF file (relative path, resolved from the working directory)
sumup_path = 'sumup_density_2020_v060121.nc'
sumup = xr.open_dataset(sumup_path)

In [3]:
# Elevation and latitude are read in full first because the quality mask is built from them
su_elev = sumup['Elevation'].values
su_lat = sumup['Latitude'].values

# Keep only records with a positive elevation (ignores sea ice) and a latitude
# strictly between -91 and 91 (ignores erroneous entries)
condition = (su_elev > 0) & (su_lat > -91) & (su_lat < 91)


def _select(variable):
    """Return the values of the named SUMup variable for the rows that pass `condition`."""
    return sumup[variable].values[condition]


# Extract every remaining variable through the same mask so all arrays stay aligned
su_lon = _select('Longitude')
su_depth0 = _select('Start_Depth')
su_depth1 = _select('Stop_Depth')
su_midpoint = _select('Midpoint')
su_density = _select('Density')
su_citation = _select('Citation')
su_date = _select('Date')
su_method = _select('Method')
su_elev = su_elev[condition]
su_lat = su_lat[condition]

# Assemble the filtered arrays; the key order here sets the dataframe's column order
su_data = {
    'Citation': su_citation,
    'Timestamp': su_date,
    'Latitude': su_lat,
    'Longitude': su_lon,
    'Elevation': su_elev,
    'Midpoint': su_midpoint,
    'StartDepth': su_depth0,
    'StopDepth': su_depth1,
    'Density': su_density,
    'Method': su_method,
}

df = pd.DataFrame(data=su_data)
df

Unnamed: 0,Citation,Timestamp,Latitude,Longitude,Elevation,Midpoint,StartDepth,StopDepth,Density,Method
0,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.050,0.00,0.10,0.3680,1.0
1,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.150,0.10,0.20,0.3810,1.0
2,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.250,0.20,0.30,0.3680,1.0
3,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.350,0.30,0.40,0.3630,1.0
4,3.0,20111228.0,-79.446800,-117.963501,1619.0,0.450,0.40,0.50,0.3890,1.0
...,...,...,...,...,...,...,...,...,...,...
2105887,187.0,20130604.0,72.579781,-38.458630,3210.0,0.765,0.75,0.78,0.2740,3.0
2105888,187.0,20130604.0,72.579781,-38.458630,3210.0,0.795,0.78,0.81,0.2977,3.0
2105889,187.0,20130604.0,72.579781,-38.458630,3210.0,0.825,0.81,0.84,0.3080,3.0
2105890,187.0,20130604.0,72.579781,-38.458630,3210.0,0.855,0.84,0.87,0.3056,3.0


## 2. Fix issues in dataset

### 2.1. Fix date issues

Some dates are just years (e.g., '19990000'), some have a day of '32' listed, and a few are just plain incorrect.

In [None]:
# Raw date strings that appear to be very incorrect, mapped to their corrected form
_DATE_CORRECTIONS = {
    '19999000.0': '19990000.0',
    '20089620.0': '20080620.0',
}


def _clean_date(raw_date):
    """Return a raw SUMup date as a float YYYYMMDD value with known problems repaired.

    The date is handled through its string form: known-bad values are replaced,
    a month or day of '00' becomes '01' (year-only dates become Jan 1), and a
    day of '32' becomes '31'.
    """
    text = str(raw_date)
    text = _DATE_CORRECTIONS.get(text, text)

    year, month, day = text[0:4], text[4:6], text[6:8]

    if month == '00':
        month = '01'

    if day == '00':
        day = '01'
    elif day == '32':
        day = '31'

    return float(year + month + day)


su_timestamp = np.array([_clean_date(raw_date) for raw_date in su_date])

df['Timestamp'] = su_timestamp
df

### 2.2. Add core IDs

In [None]:
# Create a unique index for each core.
# A new core starts at the first row and wherever any identifying field
# (citation, latitude, longitude, method, timestamp) differs from the previous row.
# The first row is never compared against the last row, and the last row is
# compared against its predecessor like every other row.
core_keys = (su_citation, su_lat, su_lon, su_method, su_timestamp)
n_rows = len(su_citation)

is_new_core = np.zeros(n_rows, dtype=bool)
if n_rows > 0:
    is_new_core[0] = True
    for key in core_keys:
        key = np.asarray(key)
        # `!=` is True for NaN vs NaN, so rows with missing keys each start a new core
        is_new_core[1:] |= key[1:] != key[:-1]

# Running count of core starts gives zero-based, contiguous core IDs
core_ids = np.cumsum(is_new_core) - 1
id0 = core_ids.tolist()

# Give each datapoint within a core its own index (0, 1, 2, ... restarting at each core):
# row position minus the position of the first row of that row's core
core_starts = np.flatnonzero(is_new_core)
id1 = (np.arange(n_rows) - core_starts[core_ids]).tolist()

# Set indices in dataframe
df['CoreID'] = id0
df['CoreIdx'] = id1

df

### 2.3. Create list of dataframes

In [None]:
# Split dataframe into a list of dataframes, one per core, ordered by CoreID.
# A single groupby pass replaces re-scanning the whole dataframe once per core;
# with CoreIDs numbered 0..n-1 the list position of each core equals its CoreID.
# Each per-core dataframe gets a fresh 0-based row index and loses the CoreIdx column.
dfs = [
    core.reset_index(drop=True).drop(columns=['CoreIdx'])
    for core_id, core in df.groupby('CoreID', sort=True)
]

### 2.4. Fix thickness, midpoint, and density

#### 2.4.1. Calculate thickness and midpoint

In [None]:
for core in dfs:

    start_depth = core['StartDepth'].to_numpy()
    stop_depth = core['StopDepth'].to_numpy()
    mid_depth = core['Midpoint'].to_numpy()

    # Thickness taken from the reported depth interval of each sample
    thickness_from_depths = stop_depth - start_depth

    # Thickness taken from midpoint spacing; the first sample uses its own midpoint value
    thickness_from_midpoints = np.concatenate((mid_depth[:1], np.diff(mid_depth)))

    # A zero-extent first interval selects the midpoint-based thickness for the whole core
    if thickness_from_depths[0] == 0:
        core['Thickness'] = thickness_from_midpoints
    else:
        core['Thickness'] = thickness_from_depths

    # A first midpoint of -9999 triggers recomputing all midpoints from the depth interval
    if mid_depth[0] == -9999:
        core['Midpoint'] = (stop_depth + start_depth) / 2

#### 2.4.2. Remove incorrect thickness values

In [52]:
# Columns (and their order) kept in each cleaned per-core dataframe
columns_out = ['CoreID', 'Citation', 'Method', 'Timestamp', 'Latitude', 'Longitude', 'Elevation',
               'Midpoint', 'StartDepth', 'StopDepth', 'Thickness', 'Density']

# Drop every data point whose thickness is not strictly positive or is known to be incorrect
for core_number, core in enumerate(dfs):

    # The first line of entry 533 is incorrect, so it is skipped
    if core_number == 533:
        core = core.iloc[1:]

    # Filter on thickness, renumber the rows from 0, then select/order the columns
    valid_rows = core[core.Thickness > 0].reset_index(drop=True)
    dfs[core_number] = valid_rows[columns_out]

#### 2.4.3. Fix units in density

In [54]:
# Find the cores where any density is greater than 1 (i.e., where the units are wrong).
# The result is a sorted array of positions into `dfs`.
error_density = np.array(
    [i for i, core in enumerate(dfs) if (core.Density > 1).any()],
    dtype=int,
)

# Replace incorrect density units with correct units by dividing the whole core by 1000.
# A loop-local name is used so the notebook-level `df` is not overwritten, and a new
# dataframe is stored back into `dfs` instead of assigning into a filtered frame.
for i in error_density:
    core = dfs[i]
    dfs[i] = core.assign(Density=core.Density / 1000)

## 3. Create and save dictionaries

### 3.1. Create dictionaries for each ice sheet

In [56]:
AISdicts = []
GrISdicts = []

# A loop-local name is used so the notebook-level `df` is not overwritten
for core in dfs:

    # A core left with no rows after cleaning has no metadata to read; skip it
    if core.empty:
        continue

    # Metadata (taken from the first row of the core)
    latitude = core.Latitude.to_numpy()[0]
    longitude = core.Longitude.to_numpy()[0]
    elevation = core.Elevation.to_numpy()[0]
    # Only the first timestamp is used, so only the first row is parsed
    timestamp = pd.to_datetime(core.Timestamp.iloc[:1], format='%Y%m%d').to_numpy()[0]
    citation = core.Citation.to_numpy()[0]
    coreid = core.CoreID.to_numpy()[0]
    method = core.Method.to_numpy()[0]

    # Data (one value per sample in the core)
    midpoint = np.array(core.Midpoint)
    startdepth = np.array(core.StartDepth)
    stopdepth = np.array(core.StopDepth)
    thickness = np.array(core.Thickness)
    density = np.array(core.Density)

    # Density is stored multiplied by 1000
    dictionary = {'CoreID': coreid, 'Citation': citation, 'Method': method,
                  'Latitude': latitude, 'Longitude': longitude, 'Elevation': elevation,
                  'Timestamp': timestamp,
                  'Midpoint': midpoint, 'StartDepth': startdepth, 'StopDepth': stopdepth,
                  'Thickness': thickness, 'Density': density * 1000}

    # AIS: negative latitude; GrIS: positive latitude (a latitude of exactly 0 goes to neither)
    if latitude < 0:
        AISdicts.append(dictionary)
    elif latitude > 0:
        GrISdicts.append(dictionary)

### 3.2. Pickle the dictionaries

In [57]:
# Write each list of core dictionaries to its own pickle file.
# The `with` blocks make sure each file is flushed and closed once it has been written.
with open('AIS_SUMup.p', 'wb') as ais_file:
    pickle.dump(AISdicts, ais_file)

with open('GrIS_SUMup.p', 'wb') as gris_file:
    pickle.dump(GrISdicts, gris_file)