# Testing logratio transformations

In [3]:
%matplotlib inline
import geopandas as gpd
import pandas as pd
import numpy as np
import sys

sys.path.insert(0, "..")
from eis_toolkit.transformations.coda.alr import alr_transform, _alr_transform
from eis_toolkit.transformations.coda.clr import clr_transform, inverse_clr
from eis_toolkit.transformations.coda.ilr import single_ilr_transform
from eis_toolkit.transformations.coda.pairwise import single_pairwise_logratio
from eis_toolkit.transformations.coda.plr import plr_transform, single_plr_transform

GEOCHEMICAL_DATA = "../tests/data/local/coda/IOCG_CLB_Till_Geochem_reg_511p.shp"

In [4]:
# Change option to display all columns (the geochemical data in question has a lot of columns)

pd.set_option('display.max_columns', None)

## Testing with a simple, single-row composition

In [5]:
# A simple example composition consisiting of the parts a, b and c.
# Components are expressed in percentages, and the parts sum to 100%.
# The example is from Pawlowsky-Glahn & Egozcue 2006.

c_arr = np.array([80, 15, 5])
C = pd.DataFrame(c_arr[None], columns=["a", "b", "c"])

In [7]:
pair_a_b = single_pairwise_logratio(float(C.iloc[0, 0]), float(C.iloc[0, 1]))
pair_a_c = single_pairwise_logratio(float(C.iloc[0, 0]), float(C.iloc[0, 2]))
pair_b_c = single_pairwise_logratio(float(C.iloc[0, 1]), float(C.iloc[0, 2]))

C_alr = alr_transform(C)
C_clr = clr_transform(C)
C_plr = plr_transform(C)

C_clr_inv, _ = inverse_clr(C_clr)

C_ilr_ab = single_ilr_transform(C, ["a"], ["b"])
C_ilr_ab_c = single_ilr_transform(C, ["a", "b"], ["c"])

In [9]:
# Expected result: 1.67
pair_a_b

1.6739764335716716

In [10]:
# Expected result: 2.77
pair_a_c

2.772588722239781

In [11]:
# Expected result: 1.10
pair_b_c

1.0986122886681098

In [12]:
# Expected result: [2.77, 1.10]
C_alr

Unnamed: 0,a,b
0,2.772589,1.098612


In [13]:
# Expected result: [1.48, -0.19, -1.29]
C_clr

Unnamed: 0,a,b,c
0,1.482188,-0.191788,-1.2904


In [14]:
# Expected result: [1.82, 0.78]
C_plr

Unnamed: 0,a,b
0,1.815303,0.776836


In [15]:
# Renormalized to 100%, CLR inverse should show the original data: [80, 15, 5]
C_clr_inv * 100

Unnamed: 0,a,b,c
0,80.0,15.0,5.0


In [16]:
# Expected result: 1.18
C_ilr_ab

0    1.18368
dtype: float64

In [17]:
# Expected result: 1.58
C_ilr_ab_c

0    1.580411
dtype: float64

### Testing with example data

In [18]:
# Define some constants

ppm = 1e-6
million = 1e6

In [19]:
# Names of all the columns that contain concentration data

elements = ['Al_ppm_511', 'Ba_ppm_511', 'Ca_ppm_511', 'Co_ppm_511', 'Cr_ppm_511', 'Cu_ppm_511', 'Fe_ppm_511', 'K_ppm_511P', 'La_ppm_511', 'Li_ppm_511', 'Mg_ppm_511', 'Mn_ppm_511', 'Ni_ppm_511', 'P_ppm_511P', 'Sc_ppm_511', 'Sr_ppm_511', 'Th_ppm_511', 'Ti_ppm_511', 'V_ppm_511P', 'Y_ppm_511P', 'Zn_ppm_511', 'Zr_ppm_511']

In [20]:
# A subcomposition (selected at random)

elements_to_analyze = ['Al_ppm_511', 'Ca_ppm_511', 'Fe_ppm_511', 'Mg_ppm_511']

In [21]:
df_all = gpd.read_file(GEOCHEMICAL_DATA)
#df_all = pd.DataFrame(df.drop(columns='geometry'))
df_all.head()

Unnamed: 0,OBJECTID,Year,Observatio,Northing,Easting,Northing_k,Easting_kk,Map_sheet_,Map_sheet1,Material,Soil_horiz,Sampler,Parallel_s,Combined_s,Fraction,Second_ana,Analysis_d,Al_ppm_511,Al_511P_T,Ba_ppm_511,Ba_511P_T,Ca_ppm_511,Ca_511P_T,Co_ppm_511,Co_511P_T,Cr_ppm_511,Cr_511P_T,Cu_ppm_511,Cu_511P_T,Fe_ppm_511,Fe_511P_T,K_ppm_511P,K_511P_T,La_ppm_511,La_511P_T,Li_ppm_511,Li_511P_T,Mg_ppm_511,Mg_511P_T,Mn_ppm_511,Mn_511P_T,Ni_ppm_511,Ni_511P_T,P_ppm_511P,P_511P_T,Sc_ppm_511,Sc_511P_T,Sr_ppm_511,Sr_511P_T,Th_ppm_511,Th_511P_T,Ti_ppm_511,Ti_511P_T,V_ppm_511P,V_511P_T,Y_ppm_511P,Y_511P_T,Zn_ppm_511,Zn_511P_T,Zr_ppm_511,Zr_511P_T,geometry
0,2259,81,49114,7526963,404203,7530108,3404337,2741,10,till,C,percussion drilling/Cobra,,Y,"sieved, <0.06 mm (fine)",,1986-07-01,27600.0,,20.6,,40200.0,,69.7,,73.4,,100.0,,83200.0,,664.0,,11.3,,14.3,,17200.0,,2250.0,,61.4,,735.0,,24.3,,22.6,,2.97,,2590.0,,354.0,,12.2,,86.7,,0.0,<,POINT (404203.131 7526963.173)
1,3328,81,49282,7530498,406827,7533644,3406962,2741,11,till,C,percussion drilling/Cobra,,Y,"sieved, <0.06 mm (fine)",,1986-07-24,14100.0,,22.6,,5000.0,,20.5,,41.0,,60.8,,28300.0,,405.0,,10.8,,8.16,,7520.0,,370.0,,34.2,,566.0,,5.7,,12.0,,1.74,,2640.0,,76.9,,9.0,,58.5,,0.0,<,POINT (406827.101 7530497.757)
2,3507,81,49269,7536878,406536,7540027,3406671,2741,11,till,C,percussion drilling/Cobra,,Y,"sieved, <0.06 mm (fine)",,1986-06-24,7880.0,,22.9,,3070.0,,6.94,,33.0,,24.7,,14500.0,,710.0,,11.7,,7.58,,4540.0,,152.0,,17.9,,458.0,,3.12,,8.36,,5.44,,1470.0,,46.3,,6.9,,28.7,,12.3,,POINT (406536.240 7536878.222)
3,4936,81,49245,7524138,401544,7527282,3401677,2741,7,till,C,percussion drilling/Cobra,,Y,"sieved, <0.06 mm (fine)",,1986-06-12,7300.0,,25.1,,3290.0,,8.38,,25.0,,28.4,,14600.0,,836.0,,10.6,,5.62,,3240.0,,156.0,,13.8,,744.0,,2.28,,6.9,,4.42,,1050.0,,42.2,,5.03,,16.7,,7.9,,POINT (401544.178 7524138.307)
4,4937,81,49283,7530830,405584,7533976,3405718,2741,11,till,C,percussion drilling/Cobra,,Y,"sieved, <0.06 mm (fine)",,1986-07-10,12500.0,,25.1,,3600.0,,29.3,,38.9,,88.7,,31500.0,,1260.0,,10.9,,6.66,,8020.0,,392.0,,36.4,,592.0,,4.17,,8.25,,2.58,,1530.0,,69.7,,7.74,,60.6,,0.0,<,POINT (405583.597 7530829.630)


In [22]:
# (Testing)
# A function for checking if zeros should be considered to be rounded

def contains_values_below_detection_limit(series):
    return "<" in series.values

contains_values_below_detection_limit(df_all["Zr_511P_T"])
contains_values_below_detection_limit(df_all["Y_511P_T"])

False

In [23]:
# Read the vector file into a dataframe

df = gpd.read_file(GEOCHEMICAL_DATA, include_fields=elements)
df = pd.DataFrame(df.drop(columns='geometry'))

# TODO: add a column for the residual

df["sum"] = np.nansum(df)
print(df["sum"][0])

#df.dropna()
df.head()

#df.iloc[0,:]

60591619.90400001


Unnamed: 0,Al_ppm_511,Ba_ppm_511,Ca_ppm_511,Co_ppm_511,Cr_ppm_511,Cu_ppm_511,Fe_ppm_511,K_ppm_511P,La_ppm_511,Li_ppm_511,Mg_ppm_511,Mn_ppm_511,Ni_ppm_511,P_ppm_511P,Sc_ppm_511,Sr_ppm_511,Th_ppm_511,Ti_ppm_511,V_ppm_511P,Y_ppm_511P,Zn_ppm_511,Zr_ppm_511,sum
0,27600.0,20.6,40200.0,69.7,73.4,100.0,83200.0,664.0,11.3,14.3,17200.0,2250.0,61.4,735.0,24.3,22.6,2.97,2590.0,354.0,12.2,86.7,0.0,60591620.0
1,14100.0,22.6,5000.0,20.5,41.0,60.8,28300.0,405.0,10.8,8.16,7520.0,370.0,34.2,566.0,5.7,12.0,1.74,2640.0,76.9,9.0,58.5,0.0,60591620.0
2,7880.0,22.9,3070.0,6.94,33.0,24.7,14500.0,710.0,11.7,7.58,4540.0,152.0,17.9,458.0,3.12,8.36,5.44,1470.0,46.3,6.9,28.7,12.3,60591620.0
3,7300.0,25.1,3290.0,8.38,25.0,28.4,14600.0,836.0,10.6,5.62,3240.0,156.0,13.8,744.0,2.28,6.9,4.42,1050.0,42.2,5.03,16.7,7.9,60591620.0
4,12500.0,25.1,3600.0,29.3,38.9,88.7,31500.0,1260.0,10.9,6.66,8020.0,392.0,36.4,592.0,4.17,8.25,2.58,1530.0,69.7,7.74,60.6,0.0,60591620.0


In [24]:
# Check if the sum of each composition is the same

def all_values_same(series):
    arr = series.to_numpy()
    return (arr[0] == arr).all()

all_values_same(df["sum"])

True

In [25]:
df.describe().loc["mean",:]

Al_ppm_511    1.379913e+04
Ba_ppm_511    7.498031e+01
Ca_ppm_511    3.700737e+03
Co_ppm_511    1.226708e+01
Cr_ppm_511    5.402185e+01
Cu_ppm_511    4.358362e+01
Fe_ppm_511    2.484711e+04
K_ppm_511P    2.198773e+03
La_ppm_511    2.629311e+01
Li_ppm_511    1.198001e+01
Mg_ppm_511    6.560192e+03
Mn_ppm_511    2.628523e+02
Ni_ppm_511    3.002676e+01
P_ppm_511P    8.609685e+02
Sc_ppm_511    3.952198e+00
Sr_ppm_511    1.316853e+01
Th_ppm_511    1.011247e+01
Ti_ppm_511    1.859682e+03
V_ppm_511P    6.610621e+01
Y_ppm_511P    1.099033e+01
Zn_ppm_511    2.998996e+01
Zr_ppm_511    1.194834e+01
sum           6.059162e+07
Name: mean, dtype: float64

In [26]:
np.sum(df.describe().loc["mean",:])

60646108.77082014