## Crucible Python Client Tutorial

This notebook demonstrates how to use the Crucible Python Client to interact with the Molecular Foundry data lakehouse.

### Setup

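First, import the client and initialize it with your API credentials. The cell below reads the API URL from the `crucible_api_url` environment variable and the API key from `user_apikey` (for example, from a `.env` file loaded with `load_dotenv()`).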

In [None]:
import os

import mfid  # For generating unique IDs
from dotenv import load_dotenv
from pycrucible import CrucibleClient

# Load environment variables (e.g. from a local .env file) into os.environ
load_dotenv()

# Names of the environment variables that hold the connection settings.
API_URL_VAR = "crucible_api_url"
API_KEY_VAR = "user_apikey"  # or "admin_apikey" for admin access

# .get() yields None when a variable is unset, so validate before use.
api_url = os.environ.get(API_URL_VAR)
api_key = os.environ.get(API_KEY_VAR)

# Fail fast with a readable message instead of handing None to the client.
missing_vars = [
    var_name
    for var_name, var_value in ((API_URL_VAR, api_url), (API_KEY_VAR, api_key))
    if not var_value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

# Initialize the client
client = CrucibleClient(api_url, api_key)

### 1. Searching for Datasets

Use `list_datasets()` to search for datasets with optional filters.

In [None]:
# List all datasets (limited to 10 results)
datasets = client.list_datasets(limit=10)
print(f"Found {len(datasets)} datasets")

# Guard the preview: indexing datasets[0] on an empty result raises IndexError.
if datasets:
    print(f"\nFirst dataset: {datasets[0]['dataset_name']}")
else:
    print("\nNo datasets returned; nothing to preview.")

#### Available Filters

You can filter datasets using various parameters:

In [None]:
# Each query below applies a different filter; every query is capped at 5 results.

# Keyword filter
keyword_datasets = client.list_datasets(limit=5, keyword='tem')
keyword_count = len(keyword_datasets)
print(f"Datasets with keyword 'tem': {keyword_count}")

# Instrument filter
instrument_datasets = client.list_datasets(limit=5, instrument='titanx')
instrument_count = len(instrument_datasets)
print(f"Datasets from 'titanx' instrument: {instrument_count}")

# Owner filter (by ORCID)
owner_datasets = client.list_datasets(limit=5, owner_orcid='0009-0001-9493-2006')
owner_count = len(owner_datasets)
print(f"Datasets by owner: {owner_count}")

# Several filters can be passed in the same call
filtered = client.list_datasets(limit=5, keyword='stem', instrument='titanx')
filtered_count = len(filtered)
print(f"Datasets matching multiple filters: {filtered_count}")

In [None]:
# Look up the datasets associated with one sample ID
sample_id = '0t3q9zq7enrhf0004dvevszkmm'  # Example sample ID

sample_datasets = client.list_datasets(sample_id=sample_id)
sample_dataset_count = len(sample_datasets)
print(f"Datasets for sample {sample_id}: {sample_dataset_count}")

### 2. Adding Datasets

There are two main ways to add datasets: from JSON metadata only, or with a file upload.

#### Option A: Add Dataset from JSON (metadata only)

In [None]:
# Create dataset with metadata only (no file is uploaded).
# `mfid` is imported alongside the other dependencies in the setup cell, so
# later cells that also generate IDs do not depend on this cell having run.
new_dataset_id = mfid.mfid()[0]  # fresh unique ID for the new record

result = client.build_new_dataset_from_json(
    dataset_name='My New Dataset',
    unique_id=new_dataset_id,
    owner_orcid='0009-0001-9493-2006',
    project_id='MFP08540',
    instrument_name='titanx',
    measurement='haadf',
    public=False,
    scientific_metadata={'voltage': '200kV', 'magnification': '50000x'},
    keywords=['tem', 'nanoparticles']
)

# The ID of the stored record is read back from the response.
dsid = result['created_record']['unique_id']
print(f"Created dataset: {dsid}")

#### Option B: Add Dataset with File Upload

In [None]:
# Create dataset with file upload and ingestion
file_path = '/path/to/your/data.dm4'  # placeholder: point this at a real file

# Stop before any API call if the placeholder path was not replaced, so the
# failure is a clear local error rather than a problem partway through upload.
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"File to upload not found: {file_path}")

result = client.build_new_dataset_from_file(
    files_to_upload=[file_path],
    dataset_name='Dataset with File',
    unique_id=mfid.mfid()[0],
    owner_orcid='0009-0001-9493-2006',
    project_id='MFP08540',
    instrument_name='titanx',
    measurement='stem',
    scientific_metadata={'exposure_time': '1s', 'beam_current': '100pA'},
    keywords=['stem', 'eels'],
    ingestor='ImageIngestor',  # Optional: specify ingestion class
    wait_for_ingestion_response=True
)

# Read the new record's ID and the ingestion status back from the response.
dsid = result['created_record']['unique_id']
print(f"Created dataset with file: {dsid}")
print(f"Ingestion status: {result['ingestion_request']['status']}")

### 3. Updating Datasets

Update existing dataset fields or scientific metadata.

In [None]:
# Change a few top-level fields on an existing dataset
dsid = '04qed8jsxd3avcgk7d443rw7t4'  # Example dataset ID

# Collect the fields to change, then send them in a single update call.
field_changes = {
    'dataset_name': 'Updated Dataset Name',
    'public': True,
    'measurement': 'stem-eels',
}
updated = client.update_dataset(dsid, **field_changes)

print(f"Updated dataset: {updated['dataset_name']}")
print(f"Now public: {updated['public']}")

In [None]:
# First call: overwrite=False, to merge the new keys with the existing metadata
new_metadata = dict(new_parameter='new_value', analysis_date='2024-01-15')
client.update_scientific_metadata(dsid, new_metadata, overwrite=False)
print("Scientific metadata updated (merged)")

# Second call: overwrite=True, to replace all scientific metadata instead
complete_metadata = dict(voltage='300kV', magnification='100000x')
client.update_scientific_metadata(dsid, complete_metadata, overwrite=True)
print("Scientific metadata replaced")

In [None]:
# Attach extra keywords to the dataset, one API call per keyword
for keyword in ('nanomaterials', 'characterization'):
    client.add_dataset_keyword(dsid, keyword)
print("Keywords added")

### 4. Downloading Datasets

Download dataset files to your local machine.

In [None]:
# Download dataset (uses file_to_upload field from dataset)
dsid = '04qed8jsxd3avcgk7d443rw7t4'  # Example dataset ID

client.download_dataset(dsid)
# Plain string literal: the message has no placeholders, so no f-prefix is needed.
print("Dataset downloaded to: crucible-downloads/")

In [None]:
# Download one named file to a location of your choosing
download_options = {
    'file_name': 'data.dm4',
    'output_path': '/custom/path/data.dm4',
}
client.download_dataset(dsid, **download_options)
print("Downloaded to custom location")

### 5. Working with Samples

Create samples and link them to datasets.

#### Create a New Sample

In [None]:
# Register a new sample record under a freshly generated ID
new_sample_id = mfid.mfid()[0]

sample = client.add_sample(
    unique_id=new_sample_id,
    sample_name='Au Nanoparticles Batch 42',
    description='Gold nanoparticles synthesized via citrate reduction',
    owner_orcid='0009-0001-9493-2006',
    creation_date='2024-01-15',
)

# Keep the ID from the response for use with the sample afterwards.
sample_id = sample['unique_id']
print(f"Created sample: {sample_id}")

#### Link Sample to Dataset

In [None]:
# Associate an existing sample with an existing dataset (example IDs)
dataset_id, sample_id = '04qed8jsxd3avcgk7d443rw7t4', '0t3q9zq7enrhf0004dvevszkmm'

link = client.add_sample_to_dataset(dataset_id, sample_id)
print(f"Linked sample {sample_id} to dataset {dataset_id}")

#### Get Datasets for a Sample

In [None]:
# Query the datasets associated with the sample and report how many came back
sample_datasets = client.list_datasets(sample_id=sample_id)
print(f"Found {len(sample_datasets)} datasets for sample {sample_id}")

# Preview at most the first three entries
preview = sample_datasets[:3]
for ds in preview:
    ds_name = ds['dataset_name']
    ds_uid = ds['unique_id']
    print(f"  - {ds_name} ({ds_uid})")

#### Get Sample Information

In [None]:
# Fetch the sample record and print a few of its fields
sample = client.get_sample(sample_id)

sample_name = sample['sample_name']  # accessed directly: KeyError if absent
print(f"Sample name: {sample_name}")

# The remaining fields fall back to 'N/A' when missing.
sample_description = sample.get('description', 'N/A')
print(f"Description: {sample_description}")

sample_owner = sample.get('owner_orcid', 'N/A')
print(f"Owner ORCID: {sample_owner}")

### Additional Examples

In [None]:
# Fetch one dataset, requesting its metadata as well
dataset = client.get_dataset(dsid, include_metadata=True)

dataset_title = dataset['dataset_name']  # accessed directly: KeyError if absent
print(f"Dataset: {dataset_title}")

# The remaining fields use fallbacks when missing.
dataset_instrument = dataset.get('instrument_name', 'N/A')
print(f"Instrument: {dataset_instrument}")

dataset_sci_metadata = dataset.get('scientific_metadata', {})
print(f"Scientific metadata: {dataset_sci_metadata}")

In [None]:
# Show up to five projects as "<project_id>: <title>"
projects = client.list_projects(limit=5)
for proj in projects:
    project_id = proj['project_id']
    project_title = proj.get('title', 'N/A')  # title falls back to 'N/A'
    print(f"{project_id}: {project_title}")

In [None]:
# Show up to five instruments with their location
instruments = client.list_instruments(limit=5)
for inst in instruments:
    instrument_name = inst['instrument_name']
    instrument_location = inst.get('location', 'N/A')  # falls back to 'N/A'
    print(f"{instrument_name} (Location: {instrument_location})")