# Testing CoDA transformations in package pyrolite

In [210]:
%matplotlib inline
import geopandas as gpd
import pandas as pd

## Add test data

In [211]:
example_file = "../tests/data/local/coda/IOCG_CLB_Till_Geochem_reg_511p.shp"
full_data = gpd.read_file(example_file)

In [212]:
full_data.columns

Index(['OBJECTID', 'Year', 'Observatio', 'Northing', 'Easting', 'Northing_k',
       'Easting_kk', 'Map_sheet_', 'Map_sheet1', 'Material', 'Soil_horiz',
       'Sampler', 'Parallel_s', 'Combined_s', 'Fraction', 'Second_ana',
       'Analysis_d', 'Al_ppm_511', 'Al_511P_T', 'Ba_ppm_511', 'Ba_511P_T',
       'Ca_ppm_511', 'Ca_511P_T', 'Co_ppm_511', 'Co_511P_T', 'Cr_ppm_511',
       'Cr_511P_T', 'Cu_ppm_511', 'Cu_511P_T', 'Fe_ppm_511', 'Fe_511P_T',
       'K_ppm_511P', 'K_511P_T', 'La_ppm_511', 'La_511P_T', 'Li_ppm_511',
       'Li_511P_T', 'Mg_ppm_511', 'Mg_511P_T', 'Mn_ppm_511', 'Mn_511P_T',
       'Ni_ppm_511', 'Ni_511P_T', 'P_ppm_511P', 'P_511P_T', 'Sc_ppm_511',
       'Sc_511P_T', 'Sr_ppm_511', 'Sr_511P_T', 'Th_ppm_511', 'Th_511P_T',
       'Ti_ppm_511', 'Ti_511P_T', 'V_ppm_511P', 'V_511P_T', 'Y_ppm_511P',
       'Y_511P_T', 'Zn_ppm_511', 'Zn_511P_T', 'Zr_ppm_511', 'Zr_511P_T',
       'geometry'],
      dtype='object')

In [213]:
elements = ['Al_ppm_511', 'Ba_ppm_511', 'Ca_ppm_511', 'Co_ppm_511', 'Cr_ppm_511', 'Cu_ppm_511', 'Fe_ppm_511', 'K_ppm_511P', 'La_ppm_511', 'Li_ppm_511', 'Mg_ppm_511', 'Mn_ppm_511', 'Ni_ppm_511', 'P_ppm_511P', 'Sc_ppm_511', 'Sr_ppm_511', 'Th_ppm_511', 'Ti_ppm_511', 'V_ppm_511P', 'Y_ppm_511P', 'Zn_ppm_511', 'Zr_ppm_511']

Select only a few columns for analysis:

In [214]:
interesting_elements = ['Al_ppm_511', 'Ca_ppm_511', 'Mg_ppm_511', 'Zr_ppm_511']

In [215]:
compositional_data = gpd.read_file(example_file, include_fields=interesting_elements)
compositional_data = pd.DataFrame(compositional_data.drop(columns='geometry'))
compositional_data.dropna()

Unnamed: 0,Al_ppm_511,Ca_ppm_511,Mg_ppm_511,Zr_ppm_511
0,27600.0,40200.0,17200.0,0.0
1,14100.0,5000.0,7520.0,0.0
2,7880.0,3070.0,4540.0,12.3
3,7300.0,3290.0,3240.0,7.9
4,12500.0,3600.0,8020.0,0.0
...,...,...,...,...
1107,18600.0,11600.0,8790.0,20.9
1108,33400.0,2390.0,18500.0,28.9
1109,38300.0,4070.0,27400.0,0.0
1110,32600.0,8630.0,18700.0,39.0


Remove zero values, since logratio transformations cannot be performed if the data contains zeros.

In [216]:
compositional_data = compositional_data.loc[(compositional_data != 0).all(axis = 1)]

In [217]:
compositional_data.head()

Unnamed: 0,Al_ppm_511,Ca_ppm_511,Mg_ppm_511,Zr_ppm_511
2,7880.0,3070.0,4540.0,12.3
3,7300.0,3290.0,3240.0,7.9
5,9470.0,624.0,960.0,0.102
6,7560.0,3160.0,4020.0,12.8
7,8000.0,3510.0,4490.0,31.9


In [218]:
compositional_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, 2 to 1110
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Al_ppm_511  930 non-null    float64
 1   Ca_ppm_511  930 non-null    float64
 2   Mg_ppm_511  930 non-null    float64
 3   Zr_ppm_511  930 non-null    float64
dtypes: float64(4)
memory usage: 36.3 KB


#### Normalize the data

Normalize the data (optional, as any subcomposition passed to the transformations will normalize them to 1):

In [219]:
data_normalized = compositional_data.pyrocomp.renormalise(components=interesting_elements, scale=1)

In [220]:
data_normalized.head()

Unnamed: 0,Al_ppm_511,Ca_ppm_511,Mg_ppm_511,Zr_ppm_511
2,0.508312,0.198035,0.29286,0.000793
3,0.527537,0.237753,0.23414,0.000571
5,0.856696,0.05645,0.086846,9e-06
6,0.512445,0.214197,0.272491,0.000868
7,0.499005,0.218938,0.280067,0.00199


In [221]:
data_normalized_2 = full_data.loc[(full_data != 0).all(axis = 1)]
data_normalized_2 = full_data.pyrocomp.renormalise(components=interesting_elements, scale=100000)

In [222]:
data_normalized_2.head()

Unnamed: 0,OBJECTID,Year,Observatio,Northing,Easting,Northing_k,Easting_kk,Map_sheet_,Map_sheet1,Material,...,Ti_511P_T,V_ppm_511P,V_511P_T,Y_ppm_511P,Y_511P_T,Zn_ppm_511,Zn_511P_T,Zr_ppm_511,Zr_511P_T,geometry
0,2259,81,49114,7526963,404203,7530108,3404337,2741,10,till,...,,354.0,,12.2,,86.7,,0.0,<,POINT (404203.131 7526963.173)
1,3328,81,49282,7530498,406827,7533644,3406962,2741,11,till,...,,76.9,,9.0,,58.5,,0.0,<,POINT (406827.101 7530497.757)
2,3507,81,49269,7536878,406536,7540027,3406671,2741,11,till,...,,46.3,,6.9,,28.7,,79.343065,,POINT (406536.240 7536878.222)
3,4936,81,49245,7524138,401544,7527282,3401677,2741,7,till,...,,42.2,,5.03,,16.7,,57.089587,,POINT (401544.178 7524138.307)
4,4937,81,49283,7530830,405584,7533976,3405718,2741,11,till,...,,69.7,,7.74,,60.6,,0.0,<,POINT (405583.597 7530829.630)


## Additive logratio transform

In [223]:
# The pandas API version of pyrolite.comp:
from pyrolite.comp import pyrocomp

# If working with ndarrays, can use the versions from pyrolite.comp.codata, eg.:
# from pyrolite.comp.codata import ALR, inverse_ALR

Data transformed by using Zr concentrations as the denominator:

In [224]:
ALR_transformed = compositional_data.pyrocomp.ALR(ind='Zr_ppm_511', null_col=True, label_mode='latex')

In [225]:
ALR_transformed.head()

Unnamed: 0,$\ln{\left(\frac{Al_{ppm 511}}{Zr_{ppm 511}} \right)}$,$\ln{\left(\frac{Ca_{ppm 511}}{Zr_{ppm 511}} \right)}$,$\ln{\left(\frac{Mg_{ppm 511}}{Zr_{ppm 511}} \right)}$,$0$
2,6.462484,5.519834,5.911083,0.0
3,6.828767,6.03178,6.016466,0.0
5,11.438667,8.718933,9.149716,0.0
6,6.381181,5.508882,5.749592,0.0
7,5.524591,4.700765,4.947002,0.0


In [226]:
ALR_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, 2 to 1110
Data columns (total 4 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   $\ln{\left(\frac{Al_{ppm 511}}{Zr_{ppm 511}} \right)}$  930 non-null    float64
 1   $\ln{\left(\frac{Ca_{ppm 511}}{Zr_{ppm 511}} \right)}$  930 non-null    float64
 2   $\ln{\left(\frac{Mg_{ppm 511}}{Zr_{ppm 511}} \right)}$  930 non-null    float64
 3   $0$                                                     930 non-null    float64
dtypes: float64(4)
memory usage: 36.3 KB


#### Inverse ALR

Transform back:

In [227]:
ALR_inverse = ALR_transformed.pyrocomp.inverse_ALR(ind='ALR(Zr_ppm_511 / Zr_ppm_511)', null_col=True)

In [228]:
ALR_inverse.head()

Unnamed: 0,Al_ppm_511,Ca_ppm_511,Mg_ppm_511,Zr_ppm_511
2,0.508312,0.198035,0.29286,0.000793
3,0.527537,0.237753,0.23414,0.000571
5,0.856696,0.05645,0.086846,9e-06
6,0.512445,0.214197,0.272491,0.000868
7,0.499005,0.218938,0.280067,0.00199


In [229]:
ALR_inverse.info()

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, 2 to 1110
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Al_ppm_511  930 non-null    float64
 1   Ca_ppm_511  930 non-null    float64
 2   Mg_ppm_511  930 non-null    float64
 3   Zr_ppm_511  930 non-null    float64
dtypes: float64(4)
memory usage: 36.3 KB


## Center logratio transform

In [230]:
CLR_transformed = compositional_data.pyrocomp.CLR(label_mode='latex')

In [231]:
CLR_transformed.head()

Unnamed: 0,$\ln{\left(\frac{Al_{ppm 511}}{γ} \right)}$,$\ln{\left(\frac{Ca_{ppm 511}}{γ} \right)}$,$\ln{\left(\frac{Mg_{ppm 511}}{γ} \right)}$,$\ln{\left(\frac{Zr_{ppm 511}}{γ} \right)}$
2,1.989134,1.046483,1.437733,-4.47335
3,2.109514,1.312527,1.297213,-4.719253
5,4.111838,1.392104,1.822887,-7.326829
6,1.971267,1.098968,1.339678,-4.409914
7,1.731501,0.907676,1.153912,-3.79309


#### Inverse CLR

In [232]:
CLR_inverse = CLR_transformed.pyrocomp.inverse_CLR()

In [233]:
CLR_inverse.head()

Unnamed: 0,Al_ppm_511,Ca_ppm_511,Mg_ppm_511,Zr_ppm_511
2,0.508312,0.198035,0.29286,0.000793
3,0.527537,0.237753,0.23414,0.000571
5,0.856696,0.05645,0.086846,9e-06
6,0.512445,0.214197,0.272491,0.000868
7,0.499005,0.218938,0.280067,0.00199


## Isometric logratio transform

In [234]:
ILR_transformed = compositional_data.pyrocomp.ILR(label_mode='latex')

In [235]:
ILR_transformed.head()

Unnamed: 0,$\frac{\sqrt{2}}{2} \cdot \ln{\left(\frac{Al_{ppm 511}}{Ca_{ppm 511}} \right)}$,$\frac{\sqrt{6}}{6} \cdot \ln{\left(\frac{Al_{ppm 511} \cdot Ca_{ppm 511}}{Mg_{ppm 511}^{2}} \right)}$,$\frac{\sqrt{3}}{6} \cdot \ln{\left(\frac{Al_{ppm 511} \cdot Ca_{ppm 511} \cdot Mg_{ppm 511}}{Zr_{ppm 511}^{3}} \right)}$
2,0.666554,0.065382,5.16538
3,0.563555,0.337873,5.449324
5,1.923142,0.758594,8.460293
6,0.616809,0.159576,5.09213
7,0.582533,0.135274,4.379883


#### Inverse ILR

In [236]:
ILR_inverse = ILR_transformed.pyrocomp.inverse_ILR()

In [237]:
ILR_inverse.head()

Unnamed: 0,Al_ppm_511,Ca_ppm_511,Mg_ppm_511,Zr_ppm_511
2,0.508312,0.198035,0.29286,0.000793
3,0.527537,0.237753,0.23414,0.000571
5,0.856696,0.05645,0.086846,9e-06
6,0.512445,0.214197,0.272491,0.000868
7,0.499005,0.218938,0.280067,0.00199


# Handle zeros in compositional data

Extract all the element concentration data in the given file into a new dataframe:

In [239]:
concentrations = full_data.loc[:,elements].copy()
concentrations.dropna()

Unnamed: 0,Al_ppm_511,Ba_ppm_511,Ca_ppm_511,Co_ppm_511,Cr_ppm_511,Cu_ppm_511,Fe_ppm_511,K_ppm_511P,La_ppm_511,Li_ppm_511,...,Ni_ppm_511,P_ppm_511P,Sc_ppm_511,Sr_ppm_511,Th_ppm_511,Ti_ppm_511,V_ppm_511P,Y_ppm_511P,Zn_ppm_511,Zr_ppm_511
0,27600.0,20.6,40200.0,69.70,73.4,100.0,83200.0,664.0,11.3,14.30,...,61.4,735.0,24.30,22.60,2.97,2590.0,354.0,12.20,86.7,0.0
1,14100.0,22.6,5000.0,20.50,41.0,60.8,28300.0,405.0,10.8,8.16,...,34.2,566.0,5.70,12.00,1.74,2640.0,76.9,9.00,58.5,0.0
2,7880.0,22.9,3070.0,6.94,33.0,24.7,14500.0,710.0,11.7,7.58,...,17.9,458.0,3.12,8.36,5.44,1470.0,46.3,6.90,28.7,12.3
3,7300.0,25.1,3290.0,8.38,25.0,28.4,14600.0,836.0,10.6,5.62,...,13.8,744.0,2.28,6.90,4.42,1050.0,42.2,5.03,16.7,7.9
4,12500.0,25.1,3600.0,29.30,38.9,88.7,31500.0,1260.0,10.9,6.66,...,36.4,592.0,4.17,8.25,2.58,1530.0,69.7,7.74,60.6,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107,18600.0,299.0,11600.0,16.00,47.5,76.1,39300.0,3910.0,51.5,22.50,...,28.2,2770.0,6.15,32.80,9.93,3690.0,120.0,18.90,56.5,20.9
1108,33400.0,664.0,2390.0,42.30,32.4,71.7,69400.0,9880.0,15.3,21.70,...,43.7,595.0,15.20,5.96,3.48,3840.0,184.0,12.30,30.1,28.9
1109,38300.0,704.0,4070.0,48.30,115.0,158.0,77600.0,7370.0,42.1,26.30,...,96.3,1030.0,3.36,10.50,3.48,2860.0,167.0,16.30,362.0,0.0
1110,32600.0,979.0,8630.0,62.30,33.9,561.0,69300.0,13700.0,33.1,63.70,...,96.9,1330.0,3.45,12.30,2.52,4340.0,155.0,16.40,686.0,39.0


In [258]:
concentrations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1112 entries, 0 to 1111
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Al_ppm_511  1112 non-null   float64
 1   Ba_ppm_511  1112 non-null   float64
 2   Ca_ppm_511  1112 non-null   float64
 3   Co_ppm_511  1112 non-null   float64
 4   Cr_ppm_511  1112 non-null   float64
 5   Cu_ppm_511  1112 non-null   float64
 6   Fe_ppm_511  1112 non-null   float64
 7   K_ppm_511P  1112 non-null   float64
 8   La_ppm_511  1112 non-null   float64
 9   Li_ppm_511  1112 non-null   float64
 10  Mg_ppm_511  1112 non-null   float64
 11  Mn_ppm_511  1112 non-null   float64
 12  Ni_ppm_511  1112 non-null   float64
 13  P_ppm_511P  1112 non-null   float64
 14  Sc_ppm_511  1112 non-null   float64
 15  Sr_ppm_511  1112 non-null   float64
 16  Th_ppm_511  1112 non-null   float64
 17  Ti_ppm_511  1112 non-null   float64
 18  V_ppm_511P  1112 non-null   float64
 19  Y_ppm_511P  1112 non-null  

## Replace zeros with NaN

Ensure all the colums have float data:

In [267]:
import numpy as np

float_columns = [name for (name, type) in zip(concentrations.columns, concentrations.dtypes) if issubclass(type.type, np.floating)]

Replace 0 values with NaN:

In [268]:
concentrations.loc[:, float_columns] = np.where(np.isclose(concentrations[float_columns].values, 0.0, rtol=1e-5, atol=1e-8), np.nan, concentrations[float_columns].values)

In [269]:
concentrations.head()

Unnamed: 0,Al_ppm_511,Ba_ppm_511,Ca_ppm_511,Co_ppm_511,Cr_ppm_511,Cu_ppm_511,Fe_ppm_511,K_ppm_511P,La_ppm_511,Li_ppm_511,...,Ni_ppm_511,P_ppm_511P,Sc_ppm_511,Sr_ppm_511,Th_ppm_511,Ti_ppm_511,V_ppm_511P,Y_ppm_511P,Zn_ppm_511,Zr_ppm_511
0,27600.0,20.6,40200.0,69.7,73.4,100.0,83200.0,664.0,11.3,14.3,...,61.4,735.0,24.3,22.6,2.97,2590.0,354.0,12.2,86.7,
1,14100.0,22.6,5000.0,20.5,41.0,60.8,28300.0,405.0,10.8,8.16,...,34.2,566.0,5.7,12.0,1.74,2640.0,76.9,9.0,58.5,
2,7880.0,22.9,3070.0,6.94,33.0,24.7,14500.0,710.0,11.7,7.58,...,17.9,458.0,3.12,8.36,5.44,1470.0,46.3,6.9,28.7,12.3
3,7300.0,25.1,3290.0,8.38,25.0,28.4,14600.0,836.0,10.6,5.62,...,13.8,744.0,2.28,6.9,4.42,1050.0,42.2,5.03,16.7,7.9
4,12500.0,25.1,3600.0,29.3,38.9,88.7,31500.0,1260.0,10.9,6.66,...,36.4,592.0,4.17,8.25,2.58,1530.0,69.7,7.74,60.6,


In [270]:
concentrations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1112 entries, 0 to 1111
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Al_ppm_511  1112 non-null   float64
 1   Ba_ppm_511  1112 non-null   float64
 2   Ca_ppm_511  1112 non-null   float64
 3   Co_ppm_511  1111 non-null   float64
 4   Cr_ppm_511  1111 non-null   float64
 5   Cu_ppm_511  1112 non-null   float64
 6   Fe_ppm_511  1112 non-null   float64
 7   K_ppm_511P  1110 non-null   float64
 8   La_ppm_511  1112 non-null   float64
 9   Li_ppm_511  1112 non-null   float64
 10  Mg_ppm_511  1112 non-null   float64
 11  Mn_ppm_511  1112 non-null   float64
 12  Ni_ppm_511  1111 non-null   float64
 13  P_ppm_511P  1112 non-null   float64
 14  Sc_ppm_511  1112 non-null   float64
 15  Sr_ppm_511  1112 non-null   float64
 16  Th_ppm_511  1086 non-null   float64
 17  Ti_ppm_511  1112 non-null   float64
 18  V_ppm_511P  1112 non-null   float64
 19  Y_ppm_511P  1112 non-null  

(Note that the zero_to_nan function from pyrolite.util does not work as such. It expects the datatype to be float.)

In [271]:
concentrations.dropna()

Unnamed: 0,Al_ppm_511,Ba_ppm_511,Ca_ppm_511,Co_ppm_511,Cr_ppm_511,Cu_ppm_511,Fe_ppm_511,K_ppm_511P,La_ppm_511,Li_ppm_511,...,Ni_ppm_511,P_ppm_511P,Sc_ppm_511,Sr_ppm_511,Th_ppm_511,Ti_ppm_511,V_ppm_511P,Y_ppm_511P,Zn_ppm_511,Zr_ppm_511
2,7880.0,22.9,3070.0,6.94,33.0,24.70,14500.0,710.0,11.70,7.58,...,17.90,458.0,3.12,8.36,5.44,1470.0,46.3,6.90,28.70,12.300
3,7300.0,25.1,3290.0,8.38,25.0,28.40,14600.0,836.0,10.60,5.62,...,13.80,744.0,2.28,6.90,4.42,1050.0,42.2,5.03,16.70,7.900
5,9470.0,26.0,624.0,1.70,15.1,3.47,8640.0,382.0,9.55,1.94,...,3.48,487.0,0.23,1.64,2.63,325.0,15.3,1.80,6.41,0.102
6,7560.0,26.9,3160.0,8.05,44.7,22.20,14500.0,1020.0,13.00,6.56,...,20.70,459.0,2.81,10.20,6.40,1330.0,37.8,5.96,25.90,12.800
7,8000.0,27.2,3510.0,10.40,28.4,24.00,20200.0,835.0,11.90,8.25,...,20.60,575.0,2.64,9.66,4.47,1660.0,43.0,6.99,25.70,31.900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,16000.0,268.0,3970.0,15.90,66.8,147.00,49600.0,3080.0,27.40,16.70,...,77.10,1220.0,4.65,32.80,6.06,1750.0,89.7,12.20,81.80,23.300
1106,13000.0,272.0,4210.0,29.20,69.7,290.00,33800.0,5560.0,38.80,11.40,...,70.80,1000.0,4.58,9.54,3.66,1810.0,83.4,13.10,36.00,18.500
1107,18600.0,299.0,11600.0,16.00,47.5,76.10,39300.0,3910.0,51.50,22.50,...,28.20,2770.0,6.15,32.80,9.93,3690.0,120.0,18.90,56.50,20.900
1108,33400.0,664.0,2390.0,42.30,32.4,71.70,69400.0,9880.0,15.30,21.70,...,43.70,595.0,15.20,5.96,3.48,3840.0,184.0,12.30,30.10,28.900


In [273]:
concentrations_sanitized = concentrations.dropna()

In [274]:
concentrations_sanitized.info()

<class 'pandas.core.frame.DataFrame'>
Index: 914 entries, 2 to 1110
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Al_ppm_511  914 non-null    float64
 1   Ba_ppm_511  914 non-null    float64
 2   Ca_ppm_511  914 non-null    float64
 3   Co_ppm_511  914 non-null    float64
 4   Cr_ppm_511  914 non-null    float64
 5   Cu_ppm_511  914 non-null    float64
 6   Fe_ppm_511  914 non-null    float64
 7   K_ppm_511P  914 non-null    float64
 8   La_ppm_511  914 non-null    float64
 9   Li_ppm_511  914 non-null    float64
 10  Mg_ppm_511  914 non-null    float64
 11  Mn_ppm_511  914 non-null    float64
 12  Ni_ppm_511  914 non-null    float64
 13  P_ppm_511P  914 non-null    float64
 14  Sc_ppm_511  914 non-null    float64
 15  Sr_ppm_511  914 non-null    float64
 16  Th_ppm_511  914 non-null    float64
 17  Ti_ppm_511  914 non-null    float64
 18  V_ppm_511P  914 non-null    float64
 19  Y_ppm_511P  914 non-null    float

## Check for outliers