In [1]:
import pickle
import torch
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
import os

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Load the data cube from its GeoPackage file. The bare expression on the last
# line makes the notebook display the (rows, columns) shape as the cell output.
gdf = gpd.read_file(
    "data/datacube_mpm.gpkg"
)
gdf.shape

(1820815, 36)

In [4]:
# Feature columns selected from the data cube, in the order used to build the
# raw feature frames. Entries tagged "categorical" are the ones that are also
# listed in `categorical`; every other entry is handled as numerical.
data_columns = [
    "Lawley_UltramaficMafic",  # categorical
    "Lawley_Period_max_minority",  # categorical
    "Lawley_Lithology_majority",  # categorical
    "McCafferty_Magnetic_HGM_mean",
    "Lawley_Period_max_majority",  # categorical
    "Ebbing_GOCE_MaxCurve_mean",
    "Lawley_Period_min_minority",  # categorical
    "Lawley_Lithology_minority",  # categorical
    "McCafferty_Gravity_Bouguer_HGM_mean",
    "Eglington_Terrane_Proximity_mean",
    "McCafferty_Moho_mean",
    "Lawley_Period_min_majority",  # categorical
    "Ebbing_GOCE_MeanCurve_mean",
    "Styron_Fault_mean",
    "Czarnota_LAB_mean",
    "Ebbing_GOCE_SI_mean",
    "McCafferty_Gravity_Bouguer_UpCont30km_HGM_mean",
    "Ebbing_GOCE_MinCurve_mean",
    "McCafferty_Magnetic_1VD_mean",
    "McCafferty_Magnetic_Deep_HGM_mean",
    "McCafferty_Gravity_Bouguer_mean",
    "Lawley_Sedimentary",  # categorical
    "Lawley_Igneous",  # categorical
    "McCafferty_Magnetic_HGM_Worms_Proximity_mean",
    "McCafferty_Magnetic_Deep_HGM_Worms_Proximity_mean",
    "McCafferty_Gravity_HGM_Worms_Proximity_mean",
    "McCafferty_Gravity_UpCont30km_HGM_Worms_Proximity_mean",
    "Graham_PassiveMargin_Proximity_mean",
    "Granitto_BlackShale_Proximity_mean",
]

In [5]:
# Columns treated as categorical: they are skipped by the outlier capping and
# mean imputation steps and are one-hot encoded later.
categorical = [
    "Lawley_UltramaficMafic",
    "Lawley_Period_max_minority",
    "Lawley_Lithology_majority",
    "Lawley_Period_max_majority",
    "Lawley_Period_min_minority",
    "Lawley_Lithology_minority",
    "Lawley_Period_min_majority",
    "Lawley_Sedimentary",
    "Lawley_Igneous",
]

In [6]:
# Columns to draw histograms for: every entry of `data_columns` that is not
# listed in `categorical`, kept in `data_columns` order. This yields the same
# 20 names that used to be spelled out by hand here; deriving them keeps the
# plot in step with the feature list instead of relying on a second copy.
cols_to_plot = [col for col in data_columns if col not in categorical]

In [None]:
# Histogram of every column in `cols_to_plot`, one panel per column.
# The grid is sized from the number of columns (4 panels per row) so that
# changing `cols_to_plot` cannot index past the end of `axs`; for the current
# 20 columns this is the same 5x4, 16x16-inch figure as before.
n_plot_cols = 4
n_plot_rows = max(1, -(-len(cols_to_plot) // n_plot_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_plot_rows,
    ncols=n_plot_cols,
    figsize=(16, 3.2 * n_plot_rows),
)
axs = axs.flatten()

for ax, col_name in zip(axs, cols_to_plot):
    ax.hist(gdf[col_name], bins=50)
    ax.set_title(col_name)

# Hide any panels left over when the column count does not fill the last row.
for ax in axs[len(cols_to_plot):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Boolean row masks for the two partitions, read from the "Train" column of
# the data cube: rows marked "train" and rows marked "test".
train_idx = gdf["Train"].eq("train")
test_idx = gdf["Train"].eq("test")

In [None]:
# Raw (unprocessed) feature frames for each split. Each one is an independent
# copy, so the in-place column edits in later cells do not write back to `gdf`.
X_train_raw, X_test_raw = (
    gdf.loc[row_mask, data_columns].copy()
    for row_mask in (train_idx, test_idx)
)

In [None]:
# Encode the MINERAL label as numbers (barren -> 0, mineral -> 1), keeping a
# one-column DataFrame per split.
#
# Replacing strings with ints used to yield a numeric column only because
# pandas silently downcast the result; that behaviour is deprecated, and
# FutureWarnings are filtered in the import cell so the notice is not visible.
# Without the downcast `.values` is an object array, which `torch.tensor`
# cannot convert. `infer_objects()` makes the conversion explicit.
label_codes = {"barren": 0, "mineral": 1}
y_train = gdf.loc[train_idx, ["MINERAL"]].replace(label_codes).infer_objects()
y_test = gdf.loc[test_idx, ["MINERAL"]].replace(label_codes).infer_objects()

In [None]:
# Capping outliers
# For every numerical column, learn IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
# from the training set only and clip both splits to them.
#
# The previous version replaced values on *either* side of the fences with the
# training 95th percentile, which turned unusually low values into high ones
# and could reorder values near the upper fence. Clipping maps low outliers to
# the lower fence and high outliers to the upper fence. NaNs are left as NaN
# here; they are filled by the imputation step that follows.
outlier_percentages = {}
for col in data_columns:
    if col in categorical:
        continue

    # Learn parameters from the training set
    Q1 = X_train_raw[col].quantile(0.25)
    Q3 = X_train_raw[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Percentage of training rows outside the fences, measured before capping
    # (rows with NaN count as "not outside").
    outside = (X_train_raw[col] < lower_bound) | (X_train_raw[col] > upper_bound)
    outlier_percentages[col] = 100 * outside.mean()

    # Apply the learned parameters to cap both train and test sets
    X_train_raw[col] = X_train_raw[col].clip(lower=lower_bound, upper=upper_bound)
    X_test_raw[col] = X_test_raw[col].clip(lower=lower_bound, upper=upper_bound)

print("Outliers capped successfully.")

In [None]:
# Missing value imputation
# Numerical columns are filled with the per-column mean of the training set
# (the same means are reused for the test set); categorical columns are
# filled with 0.
numerical_cols = [name for name in data_columns if name not in categorical]
train_means = X_train_raw[numerical_cols].mean()

for split_frame in (X_train_raw, X_test_raw):
    split_frame[numerical_cols] = split_frame[numerical_cols].fillna(train_means)
    split_frame[categorical] = split_frame[categorical].fillna(0)

In [None]:
# One hot encoding the categorical data
# Each split is encoded on its own, so the two frames can end up with
# different dummy columns.
X_train_encoded, X_test_encoded = (
    pd.get_dummies(raw_frame, columns=categorical, dtype=int)
    for raw_frame in (X_train_raw, X_test_raw)
)

# Align both frames on the sorted union of their columns; a dummy column that
# is missing from one split is added there filled with 0.
train_cols, test_cols = X_train_encoded.columns, X_test_encoded.columns
all_cols = sorted(set(train_cols).union(test_cols))

X_train_aligned, X_test_aligned = (
    encoded.reindex(columns=all_cols, fill_value=0)
    for encoded in (X_train_encoded, X_test_encoded)
)

In [None]:
# Scaling features
# The min-max scaler is fitted on the training matrix only and then applied
# to both splits.
scaler = MinMaxScaler().fit(X_train_aligned)

data_train, data_test = (
    scaler.transform(aligned)
    for aligned in (X_train_aligned, X_test_aligned)
)

print(f"Final training features shape: {data_train.shape}")
print(f"Final testing features shape: {data_test.shape}")

In [None]:
# Make sure the output directory exists before anything is written to it;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(
    './data',
    exist_ok=True,
)

In [None]:
# Write the scaled feature matrices and the label frames of each split to
# disk as float32 tensors, one file per array.
for out_path, values in (
    ('./data/dataset_train.pt', data_train),
    ('./data/dataset_test.pt', data_test),
    ('./data/mineral_train.pt', y_train.values),
    ('./data/mineral_test.pt', y_test.values),
):
    torch.save(torch.tensor(values).float(), out_path)

In [None]:
# Total dataset
# Stack the train and test rows back together and put them on the data cube's
# index so that row order matches `gdf`.
# NOTE(review): these are the aligned frames from *before* the MinMaxScaler is
# applied, unlike dataset_train.pt / dataset_test.pt - confirm that whoever
# loads total_train.pt applies the saved scaler first.
# NOTE(review): reindex fills any `gdf` row that is in neither split with NaN
# - confirm every row is marked "train" or "test".
total_features_unscaled = pd.concat([X_train_aligned, X_test_aligned]).reindex(gdf.index)
total_labels = pd.concat([y_train, y_test]).reindex(gdf.index)

torch.save(
    torch.tensor(total_features_unscaled.values).float(),
    './data/total_train.pt',
)
torch.save(
    torch.tensor(total_labels.values).float(),
    './data/total_mineral.pt',
)

In [None]:
# Serialize the fitted scaler to disk with pickle.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open('./data/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)