In [None]:
import rmsp
import sys
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from matplotlib import pyplot as plt
import numpy as np
import math
import shutil
import copy
import pygeostat as gs
from tqdm.notebook import trange
%load_ext autoreload
%autoreload 2

In [None]:
# Activate rmsp before any other rmsp call in this notebook
# (presumably a license/session activation — confirm against the rmsp docs).
rmsp.activate()

In [None]:
# Put ../Tools first on the import path so the project helper modules below can be imported.
# The relative path is resolved against the current working directory, so the notebook
# has to be run from its own folder.
sys.path.insert(0, os.path.abspath('../Tools'))
from file_export import PickleExporter, FigureExporter, TableExporter
from utility import create_axes

# Introduction

This notebook contains the workflow used to build the data inventory for the lateritic nickel deposit data set.


# Settings

In [None]:
# Input folder. NWTData.7z can be downloaded from the Google Drive under the CCG folder.
inputdir = './data/'

# Folder that receives this notebook's outputs; gs.mkdir is called on it right away.
outdir = 'Output/DataInventory/'
gs.mkdir(outdir)

# pygeostat sentinel values for missing data
# (presumably: values below data.tmin are trimmed and data.null is the null
# placeholder — confirm against the pygeostat Parameters documentation).
gs.Parameters['data.tmin'] = -998
gs.Parameters['data.null'] = -999

In [None]:
# Exporters that write into this notebook's own output folder.
pickle_data = PickleExporter(outdir)
save_figure = FigureExporter(outdir)
save_table = TableExporter(outdir)

# Exporters that write directly into the journal paper's LaTeX folders.
paper_figures_dir = "../../JournalPapers/ImputationUsingLambdaDistAndMl/Latex/elsarticle-template/Figures_Ni/"
paper_tables_dir = "../../JournalPapers/ImputationUsingLambdaDistAndMl/Latex/elsarticle-template/Tables_Ni/"
save_figure_paper = FigureExporter(paper_figures_dir)
save_table_paper = TableExporter(paper_tables_dir)

# Data Exploration

There are three types of data collection (Water, Stream, and HMC) that have different scales and accuracy. In this section, the overlap between the different data sources is studied.

In [None]:
# Read the GSLIB file, wrap it as point data with its coordinate / drill-hole
# columns, and store the rock type codes as integers.
input_path = inputdir+'original.dat'
data_input = rmsp.PointData(
    rmsp.read_gslib(input_path),
    x='X',
    y='Y',
    z='Depth',
    dhid='Bore Hole ID',
).astype({"Rocktype": int})
data_input

In [None]:
# Bar plot of the proportion of samples in each rock type.
# NOTE(review): plot_count is passed the string 'True', not the boolean True —
# confirm which one rmsp's barplot expects.
p =rmsp.Proportion(data_input['Rocktype'])
p.barplot(plot_count='True', ylim=(0,0.45))

In [None]:
# From here on, only the samples logged as rock type 4 are kept.
is_rocktype_4 = data_input['Rocktype'] == 4
data_input = data_input[is_rocktype_4]
data_input

# Missing Mechanism

In [None]:
# Variables of interest: Ni stays fully sampled, Fe and SiO2 get missing values later.
response_variables = ['Ni']
missing_variables = ['Fe', 'SiO2']
variables = response_variables+missing_variables

# Names of the meta columns that are flagged as existing on the point data.
# NOTE(review): 'Exists' is compared with the string 'True', not a boolean —
# confirm the dtype returned by get_meta(ret_exists=True).
meta_table = data_input.get_meta(ret_exists=True)
meta = meta_table[meta_table['Exists']=='True']['Value'].values.tolist()

# Work on a copy restricted to the meta columns plus the selected variables.
data_export = data_input.copy()[meta+variables]
data_export.describe()

##  Missing at Random

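Samples are removed based on the values of a variable that is fully sampled (here Ni): Fe and SiO2 are more likely to be removed where Ni is below its median than where it is above.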

In [None]:
# Seed NumPy's global random generator so the random removals below are repeatable.
# Re-run this cell before re-running any cell that calls np.random, otherwise
# the draws continue from the current generator state.
np.random.seed(69067)

### Missing Dependent

In [None]:
def _blank_nested_subsets(frame, candidates, sio2_divisor, fe_divisor):
    """Blank SiO2, and Fe on a nested subset, for randomly drawn rows of ``frame``.

    ``int(len(candidates) / sio2_divisor)`` labels are drawn without replacement
    from ``candidates`` and have SiO2 set to NaN. From those labels,
    ``int(len(candidates) / fe_divisor)`` are drawn again and have Fe set to NaN,
    so every row blanked for Fe here is also blanked for SiO2.
    ``frame`` is modified in place; nothing is returned.
    """
    n_candidates = len(candidates)
    drop_sio2 = np.random.choice(candidates, size=int(n_candidates/sio2_divisor), replace=False)
    drop_fe = np.random.choice(drop_sio2, size=int(n_candidates/fe_divisor), replace=False)
    frame.loc[drop_fe, 'Fe'] = np.nan
    frame.loc[drop_sio2, 'SiO2'] = np.nan


# Ni is never blanked, so one median serves both halves.
# Rows whose Ni equals the median belong to neither half and are left untouched.
ni_median = np.median(data_export['Ni'])

# Ni below the median: SiO2 removed from 1/4 of the rows, Fe from 1/6.
_blank_nested_subsets(data_export, data_export[data_export['Ni']<ni_median].index, 4, 6)

# Ni above the median: SiO2 removed from 1/8 of the rows, Fe from 1/12.
_blank_nested_subsets(data_export, data_export[data_export['Ni']>ni_median].index, 8, 12)

### Missing Independent

In [None]:
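# NOTE: this whole cell is commented out and does nothing when run. It is the
# alternative scheme in which Fe and SiO2 are removed by separate random draws
# (same fractions as the "Missing Dependent" cell) instead of Fe being drawn
# from the rows already selected for SiO2.
# # missingness for Fe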
# print('Removing samples below median (1/6)')
# idx = data_export[data_export['Ni']<np.median(data_export['Ni'])].index
# idx_removed = np.random.choice(idx, size=int(len(idx)/6), replace=False)
# print(f'Total subset {len(idx)}. Remove number {len(idx_removed)}. Fraction {len(idx_removed)/len(idx):.2f}')
# data_export.loc[idx_removed, 'Fe'] = np.nan

# print('Removing samples above median (1/12)')
# idx = data_export[data_export['Ni']>np.median(data_export['Ni'])].index
# idx_removed = np.random.choice(idx, size=int(len(idx)/12), replace=False)
# print(f'Total subset {len(idx)}. Remove number {len(idx_removed)}. Fraction {len(idx_removed)/len(idx):.2f}')
# data_export.loc[idx_removed, 'Fe'] = np.nan


# already_missing = data_export[data_export['Fe'].isna()].index

# # missingness for SiO2
# print('Removing samples below median (1/4)')
# idx = data_export[data_export['Ni']<np.median(data_export['Ni'])].index
# idx_removed = np.random.choice(idx, size=int(len(idx)/4), replace=False)
# print(f'Total subset {len(idx)}. Remove number {len(idx_removed)}. Fraction {len(idx_removed)/len(idx):.2f}')
# data_export.loc[idx_removed, 'SiO2'] = np.nan

# print('Removing samples above median (1/8)')
# idx = data_export[data_export['Ni']>np.median(data_export['Ni'])].index
# idx_removed = np.random.choice(idx, size=int(len(idx)/8), replace=False)
# print(f'Total subset {len(idx)}. Remove number {len(idx_removed)}. Fraction {len(idx_removed)/len(idx):.2f}')
# data_export.loc[idx_removed, 'SiO2'] = np.nan

In [None]:
# Summary statistics after the removals; the Fe and SiO2 counts should now be lower than Ni's.
data_export.describe()

In [None]:
# Apply the pygeostat plot style BEFORE creating the figure. rcParams are read when
# artists are created, so updating them afterwards styled this figure only on a
# re-run of the cell, not on a fresh-kernel run.
gs.PlotStyle.update_mplrcParams()

fig, ax = plt.subplots(1, 1, figsize=(9, 11))

# A sample is heterotopic when at least one of its columns is missing, and
# homotopic when every column is present. The mask is computed once and vectorized,
# replacing two row-wise Python-level apply() passes.
has_missing = data_export.isna().any(axis=1)
data_homotopic = data_export[~has_missing]
data_heterotopic = data_export[has_missing]

# Complete samples: larger grey markers with a black outline.
ax.scatter(
    data_homotopic.X,
    data_homotopic.Y,
    facecolor="gray",
    edgecolor="black",
    s=20,
    lw=0.8,
    label="Homotopic (%i)" % (len(data_homotopic)),
)

# Incomplete samples: smaller orange markers drawn on top.
ax.scatter(
    data_heterotopic.X,
    data_heterotopic.Y,
    facecolor="orange",
    s=12,
    label="Heterotopic (%i)" % (len(data_heterotopic)),
)
ax.set_aspect("equal")
ax.grid()
legend = ax.legend(fontsize=12, markerscale=2)

# Two separate statements instead of an accidental tuple expression.
ax.set_xlabel("Easting (m)", fontsize=None)
ax.set_ylabel("Northing (m)", fontsize=None)
_ = ax.set_title("Location Map for pooled data set", fontsize=14)

# save_figure_paper("CSlocmap1.png")

In [None]:
# NOTE(review): ScaleBar is imported here but not referenced anywhere in this cell.
from matplotlib_scalebar.scalebar import ScaleBar

rmsp.GlobalParams["core.enable_beta"] = True

# Build one 3D scene: homotopic samples in grey, heterotopic samples in orange.
viewer = data_homotopic.view3d(color="#808080", label="Homotopic")
viewer._global_settings["scale"] = [1, 1, 5]  # presumably x/y/z scaling (5x vertical) — confirm
viewer._global_settings["background"] = "#FFFFFF"
data_heterotopic.view3d(color="#FFA500", label="Heterotopic", viewer=viewer)
viewer.set_camera(
    view_matrix=[
        716886.56, 329709.00, 225.62,
        714097.21, 330912.07, -436.74,
        -0.22, 0.04, 0.98,
    ],
    orthographic=False,
)

# Static render of the scene as a matplotlib figure.
fig, ax = viewer.show_static(dpi=200, crop_y=(15, 15), figsize=(8, 4))
ax.legend()

# Copy of the data with a Flag column telling complete from incomplete rows,
# written to data.vtp in the current working directory.
data_vtk = data_export.copy()
incomplete_rows = data_vtk.isna().any(axis=1)
data_vtk['Flag'] = 'Homotopic'
data_vtk.loc[incomplete_rows, 'Flag'] = 'Heterotopic'
data_vtk.to_vtk('data.vtp', categorical_to_code=False)

save_figure_paper('3DviewHomHetero.png')

## Duplicate check

In [None]:
# Replace data_export with the result of rmsp.remove_duplicates and report
# whatever it returned as duplicates, if anything.
# Note: this rebinds data_export, so data_homotopic / data_heterotopic built in the
# location-map cell still refer to the data from before this step.
data_export, dups = rmsp.remove_duplicates(data_export, progressbar=True)

if len(dups):
    print(dups)

In [None]:
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

# One CDF per variable, response variables first, each on its own axes.
n_var = len(variables)
fig, axes = create_axes(2, n_var, (12, 6))

plot_order = response_variables + missing_variables
for position in range(len(plot_order)):
    data_export.cdfplot(plot_order[position], ax=axes[position], log=True)

In [None]:
# Scatter plots of the selected variables; assigning to `_` keeps the return value out of the cell output.
_ = data_export.scatplots(variables)

In [None]:
# Per-variable sample counts and missing percentage, exported as a LaTeX table.
all_vars = response_variables + missing_variables
summary_columns = ["Number of Samples", "Number Missing", "% Missing"]
table_summary = pd.DataFrame(index=all_vars, columns=summary_columns)

n_rows = len(data_export)
for var in all_vars:
    is_missing = data_export[var].isna()
    # Filled cell by cell so that counts and the percentage keep their own types.
    table_summary.loc[var, "Number of Samples"] = sum(~is_missing)
    table_summary.loc[var, "Number Missing"] = sum(is_missing)
    table_summary.loc[var, "% Missing"] = 100 * sum(is_missing) / n_rows

# table_summary is kept transposed (statistics as rows). Its index name 'Variable'
# becomes the columns-axis name of the styled table, which is transposed back so
# that each variable is a row.
table_summary = table_summary.T.rename_axis('Variable')
style = table_summary.T.style.format(precision=1, thousands=",")
save_table_paper(style, 'sample_stats.tex', enforce_escape=True, longtable=True, hrules=True)
style

# Export

In [None]:
# data_input gets a fresh 0..n-1 index before export; data_export keeps its current index.
# 'AllData.pkl' therefore holds the rock-type-4 subset selected earlier, not the raw file.
data_input = data_input.reset_index(drop=True)

# Written in this order: pooled data, full filtered data, then the two variable lists.
pickle_jobs = [
    (data_export, 'PooledData.pkl'),
    (data_input, 'AllData.pkl'),
    (response_variables, 'response_variables.pkl'),
    (missing_variables, 'missing_variables.pkl'),
]
for payload, filename in pickle_jobs:
    pickle_data(payload, filename)