
<link rel="stylesheet" href="https://unpkg.com/thebe@latest/lib/index.css">
<script src="https://unpkg.com/thebe@latest/lib/index.js"></script>

<script type="text/javascript">
  // Configure and start Thebe once the page's DOM has been parsed.
  document.addEventListener("DOMContentLoaded", function() {
    thebelab.bootstrap({
      // presumably requests a kernel as soon as bootstrapping runs — confirm against the Thebe docs
      requestKernel: true,
      binderOptions: {
        // NOTE(review): "your-repo/your-project" looks like a template placeholder —
        // confirm the real repository name before publishing.
        repo: "your-repo/your-project",
        ref: "main",
      },
      codeMirrorConfig: {
        theme: "abcdef",
      },
    });
  });
</script>


# **Data Readiness For AI Checklist - Part 5**

 * Affiliation: UK Met Office
 * History: 1.0
 * Last update: 25-2-25
 * © British Crown Copyright 2017-2025, Met Office. Please see LICENSE.md for license details.

---

## **Tutorial Material**

* **Run this Jupyter notebook locally using Jupyter Lab**
* **Select 'Run All Cells' from the 'Run' menu to generate the checklist.**
* **Remember to save your notebook regularly as you work through it.**


## **Data section, optional**
Optional scripts for pulling the data into the notebook. *TODO: state the assumptions these scripts rely on — the original sentence was left unfinished.*

---

## **Setup Notebook**

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import json
import sys
import os
sys.path.append(os.path.abspath('..')) # Add the parent directory to the system path
from aidatareadiness import utils
from aidatareadiness.utils import WIDGET_WIDTH, DESCRIPTION_STYLE, PLACEHOLDER  

In [None]:
# Load checklist from JSON file.
# Later cells index the result as checklist[<section>][<field>] to pre-fill the widgets.
checklist = utils.load_checklist()

#### Reset stored answers to start again:

In [None]:
# Reset all checklist answers back to original blank answers for all sections.
# Any completed information will be lost. 

# To reset the stored answers, uncomment and run the two lines of code below, then re-comment them so they do not run again.
# utils.reset_checklist()
# checklist = utils.load_checklist()

# You can then re-run each section to reload it on the reset data. 

In [None]:

# Echo the general information recorded for this assessment, one field per line.
general_information = checklist["GeneralInformation"]
for label, field_name in (
    ("Dataset:", "DatasetName"),
    ("Dataset link:", "DatasetLink"),
    ("Assessor:", "AssessorName"),
    ("Assessor email:", "AssessorEmailAddress"),
):
    print(label, general_information[field_name])

---

## **5. Data Preparation**

### Null values

In [None]:

# Question 5.1 - pre-filled with the answer currently held in the checklist.
dataset_null_values = widgets.Combobox(
    value=checklist['DataPreparation']['NullValuesFilled'],
    description='5.1 Have null values/gaps been filled?',
    options=['Yes', 'No', 'N/A'],
    placeholder=PLACEHOLDER,
    layout=widgets.Layout(width=WIDGET_WIDTH),
    style=DESCRIPTION_STYLE,
)

display(dataset_null_values)

### Outliers

In [None]:

# Question 5.2 - pre-filled with the answer currently held in the checklist.
dataset_outliers = widgets.Combobox(
    value=checklist['DataPreparation']['OutliersIdentified'],
    # The 'Yes, tagged' option used to carry a trailing space, which would be
    # stored verbatim in the saved answer; it is now consistent with the others.
    options=['Yes, tagged', 'Yes, removed', 'No', 'N/A'],
    description='5.2 Have outliers been identified?',
    placeholder=PLACEHOLDER,
    layout=widgets.Layout(width=WIDGET_WIDTH),
    style=DESCRIPTION_STYLE,
)

display(dataset_outliers)


### Gridded data


In [None]:

# Question 5.3 and its two follow-ups. Each box is pre-filled with the answer
# currently held in the checklist.
dataset_gridded = widgets.Combobox(
    value=checklist['DataPreparation']['Gridded'],
    description='5.3 Is the data gridded (regularly sampled in time and space)?',
    options=[
        'Regularly gridded in space',
        'Constant time-frequency',
        'Regularly gridded in space and constant time-frequency',
        'Not gridded',
        'N/A',
    ],
    placeholder=PLACEHOLDER,
    layout=widgets.Layout(width=WIDGET_WIDTH),
    style=DESCRIPTION_STYLE,
)

# Follow-up: was the gridded data derived from a different sampling?
dataset_gridded_transformed = widgets.Combobox(
    value=checklist['DataPreparation']['TransformedFromOriginal'],
    description='If the data is gridded, was it transformed from a different original sampling?',
    options=[
        'Yes, from irregular sampling',
        'Yes, from a different regular sampling',
        'No, this is the original sampling',
        'N/A',
    ],
    placeholder=PLACEHOLDER,
    layout=widgets.Layout(width=WIDGET_WIDTH),
    style=DESCRIPTION_STYLE,
)

# Follow-up: is the data also available at its original sampling?
dataset_gridded_original_sample = widgets.Combobox(
    value=checklist['DataPreparation']['OriginalSamplingAvailable'],
    description='If the data is resampled from the original sampling, is the data also available at the original sampling?',
    options=['Yes', 'No', 'Only by request', 'N/A'],
    placeholder=PLACEHOLDER,
    layout=widgets.Layout(width=WIDGET_WIDTH),
    style=DESCRIPTION_STYLE,
)

display(
    dataset_gridded,
    dataset_gridded_transformed,
    dataset_gridded_original_sample,
)

### Targets / labels for supervised learning

In [None]:

# Question 5.4 - pre-filled with the answer currently held in the checklist.
dataset_targets_or_labels = widgets.Combobox(
    value=checklist['DataPreparation']['SupervisedLearningLabels'],
    description='5.4 Are there associated targets or labels for supervised learning techniques?',
    options=['Yes', 'No', 'N/A'],
    placeholder='Click to select option - (Can this be used as a training dataset)?',
    layout=widgets.Layout(width=WIDGET_WIDTH),
    style=DESCRIPTION_STYLE,
)

# The follow-up question text is shown as a separate Label above a free-text box.
dataset_targets_or_labels_standards_label = widgets.Label(
    value="If there are associated targets/labels, are community labeling standards implemented?",
)

dataset_targets_or_labels_standards = widgets.Text(
    value=checklist['DataPreparation']['SupervisedLearningLabelStandards'],
    placeholder='e.g., STAC label extension, ESA AIREO specification, etc.',
    layout=widgets.Layout(width=WIDGET_WIDTH),
)

display(
    dataset_targets_or_labels,
    dataset_targets_or_labels_standards_label,
    dataset_targets_or_labels_standards,
)

In [None]:

# Save button for this section. The label now names "Data Preparation": it
# previously said "Data Access", which does not match the section being saved.
save_button = widgets.Button(
    description="Save Data Preparation Answers to json file",
    button_style="primary",
    layout=widgets.Layout(flex='1 1 auto', width='auto'),
)


def generate_updates_preparation():
    """Collect the current widget values for the Data Preparation section.

    Returns:
        dict: a single-key mapping ``{"DataPreparation": {...}}`` whose inner
        dict maps each checklist field name to the value currently held by
        the corresponding widget.
    """
    return {
        "DataPreparation": {
            "NullValuesFilled": dataset_null_values.value,
            "OutliersIdentified": dataset_outliers.value,
            "Gridded": dataset_gridded.value,
            "TransformedFromOriginal": dataset_gridded_transformed.value,
            "OriginalSamplingAvailable": dataset_gridded_original_sample.value,
            "SupervisedLearningLabels": dataset_targets_or_labels.value,
            "SupervisedLearningLabelStandards": dataset_targets_or_labels_standards.value,
        }
    }


# The updates are built inside the callback so the widget values are read at
# click time, not when this cell is executed.
save_button.on_click(lambda b: utils.update_checklist(b, generate_updates_preparation()))

display(save_button)

## Finished

In [None]:

def print_json_info(b):
    """
    Reload the checklist with utils.load_checklist() and print its contents
    into the `output` widget, replacing whatever was shown there before.

    Arguments: b - represents the button calling the function.
    """
    latest_checklist = utils.load_checklist()
    with output:
        clear_output()
        for section_name, section in latest_checklist.items():
            print(f"{section_name}:")
            if not isinstance(section, dict):
                # Non-dict entries are printed as a single indented value.
                print(f"  {section}")
                continue
            for field_name, field_value in section.items():
                print(f"  {field_name}: {field_value}")


button_print_json = widgets.Button(
    description="Print json results",
    button_style='info',
    layout=widgets.Layout(flex='1 1 auto', width='auto'),
)
output = widgets.Output()

display(button_print_json, output)

button_print_json.on_click(print_json_info)


---

## **Appendix** - Definition of terms used in the checklist.