# Using Croissant in Machine Learning Pipelines 🥐

Croissant provides a single-file JSON-LD format for Machine Learning (ML) datasets that contains information about data sources, data structure, and relevant additional metadata. The standardized format aims to improve the discoverability, accessibility, and interoperability of ML datasets. In this notebook, we demonstrate how to use an example Croissant file (linked to a dataset from the UKCEH Environment Information Data Centre (EIDC)) in an ML pipeline.

In [1]:
# Installing necessary libraries
%%capture --no-display
# Install mlcroissant from the source
!apt-get install -y python3-dev graphviz libgraphviz-dev pkg-config
!pip install "git+https://github.com/${GITHUB_REPOSITORY:-mlcommons/croissant}.git@${GITHUB_HEAD_REF:-main}#subdirectory=python/mlcroissant&egg=mlcroissant[dev]"
!pip install array_record
!pip install tfds-nightly
!pip install tensorflow
!pip install torch
!apt-get install tree

UsageError: Line magic function `%%capture` not found.


In [None]:
# Importing necessary libraries
from mlcroissant import Dataset
import tensorflow_datasets as tfds
import torch
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

ImportError: DLL load failed while importing _multiarray_umath: The specified module could not be found.

ImportError: numpy._core.multiarray failed to import

## Loading the data

Currently, the underlying data described in the Croissant file can be loaded directly using either the [mlcroissant](https://github.com/mlcommons/croissant/tree/main/python/mlcroissant) Python library or the [TensorFlow Datasets Croissant builder](https://www.tensorflow.org/datasets/format_specific_dataset_builders#croissantbuilder). Here we'll demonstrate both.

In [None]:
# Read the Croissant file with mlcroissant and materialise one record set in pandas.
# Alternative relative location of the same file: "../../croissantSpikeZip.json"
croissant_file_path = "/tmp/croissantSpikeZip.json"

# Parse the Croissant JSON-LD, then export its metadata as a JSON object
dataset = Dataset(jsonld=croissant_file_path)
metadata = dataset.metadata.to_json()

# Request the records of the record set we want to work with
records = dataset.records(record_set="rs-abberfraw")

# Show the dataset description taken from the metadata
print(metadata['description'])

# Turn the records into a DataFrame and report the column dtypes
df = pd.DataFrame(records)
print(df.dtypes)

# Display the first 5 records as the cell output
df.head(5)

In [None]:
# Build the same record set through the TensorFlow Datasets CroissantBuilder.
builder = tfds.core.dataset_builders.CroissantBuilder(
    jsonld="/tmp/croissantSpikeZip.json",
    record_set_ids=["rs-abberfraw"],
    file_format='array_record',
    data_dir="/tmp/croissant_ukceh",
)

# Summarise what the builder knows about the dataset
print(f"Dataset's description:\n{builder.info.description}\n")
print(f"Dataset's citation:\n{builder.info.citation}\n")
print(f"Dataset's features:\n{builder.info.features}")

# Download and prepare the data, then split 'default' 75% / 25% into train and test
builder.download_and_prepare()
train, test = builder.as_data_source(
    split=['default[:75%]', 'default[75%:]'],
)

print(f"Train dataset size: {len(train)}")
print(f"Test dataset size: {len(test)}")

# Print the first five training examples
for row_index in range(5):
    print(train[row_index])

## scikit-learn Pipeline

In [None]:
# Fit and score a small MLP regressor on the pandas DataFrame.

# Predictor columns and the regression target
X = df[["WindSpeed", "Aspect", "Slope"]]
y = df["BareSand_it1"]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two hidden layers (64 and 32 units), ReLU activations, Adam solver
model = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict on both splits so train and test performance can be compared
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Score the training split
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Score the test split
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Training Set - Mean Squared Error: {mse_train}")
print(f"Training Set - R-squared Value: {r2_train}")
print(f"Test Set - Mean Squared Error: {mse_test}")
print(f"Test Set - R-squared Value: {r2_test}")

## PyTorch Pipeline

In [None]:
# Wrap the TFDS data sources in PyTorch DataLoaders.
batch_size = 128

# Training batches are drawn in random order, one draw per training example
train_sampler = torch.utils.data.RandomSampler(train, num_samples=len(train))
train_loader = torch.utils.data.DataLoader(train, sampler=train_sampler, batch_size=batch_size)

# Test batches use the loader's default ordering (no sampler)
test_loader = torch.utils.data.DataLoader(test, sampler=None, batch_size=batch_size)

# Peek at the first training batch only
for batch in train_loader:
    print(batch)
    break

In [None]:
class TabularRegressor(torch.nn.Module):
    """Single linear layer mapping ``input_dim`` features to one output value."""

    def __init__(self, input_dim):
        """Create the linear layer.

        Args:
            input_dim: Number of input features per example.
        """
        super().__init__()
        self.regressor = torch.nn.Linear(input_dim, 1)

    def forward(self, features):
        """Apply the linear layer to ``features`` and return its output."""
        output = self.regressor(features)
        return output

# Columns fed to the model as inputs, and the column it should predict
target_name = "BareSand_it1"
feature_names = [
    "Aspect",
    "Slope",
    "WindSpeed",
]

In [None]:
# Train a linear regressor for one pass over train_loader, then evaluate it on test_loader.
# The model takes one input per entry in feature_names.
input_dim = len(feature_names)
model = TabularRegressor(input_dim)
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.MSELoss()

print('Training...')
model.train()
for example in tqdm(train_loader):
    # Stack the per-column tensors along dim=1 to form the feature matrix
    # (assumes each batch maps column name -> 1-D tensor of batch length; confirm against the loader output)
    features = torch.stack([example[feature] for feature in feature_names], dim=1).float()
    # Add a trailing dimension so the target lines up with the model's single output column
    target = example[target_name].unsqueeze(dim=1).float()
    prediction = model(features)
    loss = loss_function(prediction, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print('Testing...')
model.eval()
total_loss = 0.0
num_examples = 0
all_targets = []
all_predictions = []
# No gradients are needed for evaluation, so skip building the autograd graph
with torch.no_grad():
    for example in tqdm(test_loader):
        features = torch.stack([example[feature] for feature in feature_names], dim=1).float()
        target = example[target_name].unsqueeze(dim=1).float()
        prediction = model(features)
        loss = loss_function(prediction, target)
        # MSELoss averages over the batch, so weight by batch size before summing
        batch_len = features.shape[0]
        total_loss += loss.item() * batch_len
        num_examples += batch_len
        # flatten() always yields a 1-D tensor, so tolist() returns a list even when the
        # batch holds a single example (squeeze() would yield a bare float there)
        all_targets.extend(target.flatten().tolist())
        all_predictions.extend(prediction.flatten().tolist())

if num_examples:
    # Named test_mse so the sklearn `mean_squared_error` function imported above is not
    # overwritten (re-running the scikit-learn cell would otherwise fail)
    test_mse = total_loss / num_examples
    r2 = r2_score(all_targets, all_predictions)
    print(f'\nMean Squared Error: {test_mse:.4f}')
    print(f'R-squared: {r2:.4f}')
else:
    # Avoid a ZeroDivisionError when the test loader is empty
    print('\nTest loader yielded no examples; skipping metrics.')