<a href="https://colab.research.google.com/github/sign-language-processing/datasets/blob/master/examples/load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
! pip install git+https://github.com/sign-language-processing/datasets.git

In [None]:
import tensorflow_datasets as tfds
import sign_language_datasets.datasets
from sign_language_datasets.datasets.config import SignDatasetConfig

import itertools

# RWTH Phoenix 2014 T

In [None]:
# Annotation-only configuration (videos are not included).
config = SignDatasetConfig(name="only-annotations", version="3.0.0", include_video=False)
rwth_phoenix2014_t = tfds.load(name="rwth_phoenix2014_t", builder_kwargs={"config": config})

# Print the gloss and the text of the first 10 training examples,
# followed by a blank separator line.
for datum in itertools.islice(rwth_phoenix2014_t["train"], 10):
    for key in ("gloss", "text"):
        print(datum[key].numpy().decode("utf-8"))
    print()

# Dicta Sign

First, we load the HamNoSys font in Google Colab so that HamNoSys symbols are rendered correctly.

In [None]:
from IPython.display import HTML

# URL of the HamNoSys Unicode TrueType font file.
font_url = 'https://ctan.math.illinois.edu/fonts/hamnosys/HamNoSysUnicode.ttf'

# CSS @font-face rule that registers the remote font under the
# family name 'HamNoSysUnicode'.
custom_css = (
    "\n"
    "@font-face {\n"
    "    font-family: 'HamNoSysUnicode';\n"
    f"    src: url('{font_url}') format('truetype');\n"
    "}\n"
)

# The HTML object is the last expression of the cell, so the notebook
# displays it, which injects the <style> element into the page.
style_markup = '<style>' + custom_css + '</style>'
HTML(style_markup)

In [None]:
from html import escape

# Annotation-only configuration (include_video=False, include_pose=None).
config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False, include_pose=None)
dicta_sign = tfds.load(name='dicta_sign', builder_kwargs={"config": config})

# Render the first 10 training examples: the HamNoSys transcription in the
# 'HamNoSysUnicode' font family, followed by the plain text.
for datum in itertools.islice(dicta_sign["train"], 0, 10):
    hamnosys_text = datum['hamnosys'].numpy().decode('utf-8')
    plain_text = datum['text'].numpy().decode('utf-8')
    # The strings come from the dataset and are inserted into HTML markup,
    # so escape &, <, > and quotes to keep them from being parsed as markup.
    display(HTML(f'<span style="font-family:HamNoSysUnicode;">{escape(hamnosys_text)}</span> {escape(plain_text)}'))

# ChicagoFSWild+

In [None]:
# Dataset versions: 2.0.0 selects ChicagoFSWild+, 1.0.0 selects ChicagoFSWild.
config = SignDatasetConfig(name="only-annotations", version="2.0.0", include_video=False)
chicagofswild = tfds.load(name="chicago_fs_wild", builder_kwargs={"config": config})

# Print the text of the first 10 training examples.
for datum in itertools.islice(chicagofswild["train"], 10):
    text_bytes = datum["text"].numpy()
    print(text_bytes.decode("utf-8"))

# AUTSL

In [None]:
# Annotation-only configuration (videos are not included).
config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False)
autsl = tfds.load(name="autsl", builder_kwargs=dict(config=config))

# Print the id and the gloss id of the first 10 training examples.
for datum in itertools.islice(autsl["train"], 10):
    sample_id = datum["id"].numpy().decode("utf-8")
    gloss_id = datum["gloss_id"].numpy()
    print(sample_id, gloss_id)

# SignBank

In [None]:
# SignBank is loaded with its default builder configuration.
signbank = tfds.load(name="sign_bank")

# Print id, SignWriting string and the decoded list of terms for the
# first 10 training examples.
for datum in itertools.islice(signbank["train"], 10):
    sample_id = datum["id"].numpy().decode("utf-8")
    sign_writing = datum["sign_writing"].numpy().decode("utf-8")
    terms = [term.decode("utf-8") for term in datum["terms"].numpy()]
    print(sample_id, sign_writing, terms)

# SignTyp (https://signtyp.uconn.edu/signpuddle/index.php?ui=1&sgn=9032)


In [None]:
# Session cookie value passed to the dataset builder through `extra`.
# NOTE(review): presumably this must be replaced with a PHPSESSID from your
# own session on the SignTyp site - confirm before relying on this value.
SIGNTYP_PHPSESSID = "hj9co07ct7f5noq529no9u09l4"

config = SignDatasetConfig(
    name="only-annotations",
    version="1.0.0",
    include_video=False,
    extra={"PHPSESSID": SIGNTYP_PHPSESSID},
)
signtyp = tfds.load(name="sign_typ", builder_kwargs={"config": config})

# Print the video field and the SignWriting string of the first 10 examples.
for datum in itertools.islice(signtyp["train"], 10):
    video = datum["video"].numpy().decode("utf-8")
    sign_writing = datum["sign_writing"].numpy().decode("utf-8")
    print(video, sign_writing)

# Sign2Mint

In [None]:
# Annotation-only configuration (videos are not included).
config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False)
sign2mint = tfds.load(name="sign2_mint", builder_kwargs=dict(config=config))

# Print the 'fachbegriff' field, the video field and the
# 'gebaerdenschrift' URL of the first 10 training examples.
for datum in itertools.islice(sign2mint["train"], 10):
    fachbegriff = datum["fachbegriff"].numpy().decode("utf-8")
    video = datum["video"].numpy().decode("utf-8")
    gebaerdenschrift_url = datum["gebaerdenschrift"]["url"].numpy().decode("utf-8")
    print(fachbegriff, video, gebaerdenschrift_url)

# SWOJS Glossário

In [None]:
# Annotation-only configuration (videos are not included).
config = SignDatasetConfig(
    name="only-annotations",
    version="1.0.0",
    include_video=False,
)
swojs_glossario = tfds.load(name="swojs_glossario", builder_kwargs=dict(config=config))

def decode(tl):
    """Return the items of ``tl.numpy()`` decoded from UTF-8 bytes, as a list.

    ``tl`` must expose a ``numpy()`` method returning an iterable whose
    elements each have a ``decode`` method (e.g. ``bytes``).
    """
    return [raw.decode('utf-8') for raw in tl.numpy()]

# Print the decoded SignWriting entries and the video field of the first
# 10 training examples.
for datum in itertools.islice(swojs_glossario["train"], 10):
    sign_writing = decode(datum["sign_writing"])
    video = datum["video"].numpy().decode("utf-8")
    print(sign_writing, video)

# DGS Corpus

In [None]:
%%capture
! pip install pympi-ling

## Document Level example (Long videos)

In [None]:
from sign_language_datasets.datasets.dgs_corpus import DgsCorpusConfig
from sign_language_datasets.datasets.dgs_corpus.dgs_utils import get_elan_sentences

# Annotation-only configuration (include_video=False, include_pose=None).
config = DgsCorpusConfig(name="only-annotations", version="1.0.0", include_video=False, include_pose=None)
dgs_corpus = tfds.load("dgs_corpus", builder_kwargs={"config": config})

# For each of the first 10 training examples, read its ELAN (.eaf) file and
# print the glosses and German text of the first sentence, if there is one.
for datum in itertools.islice(dgs_corpus["train"], 10):
    elan_path = datum["paths"]["eaf"].numpy().decode("utf-8")
    sentences = get_elan_sentences(elan_path)

    try:
        sentence = next(sentences)
    except StopIteration:
        # No sentences in this file: nothing to print.
        continue

    glosses = [s["gloss"] for s in sentence["glosses"]]
    print(" ".join(glosses))
    print(sentence["german"])
    print()

## Sentence level example (Videos are broken down to sentences)

In [None]:
from sign_language_datasets.datasets.dgs_corpus import DgsCorpusConfig

# Same annotation-only settings, with data_type="sentence" so each example
# carries a "sentence" entry.
config = DgsCorpusConfig(
    name="only-annotations-sentence-level",
    version="1.0.0",
    include_video=False,
    include_pose=None,
    data_type="sentence",
)
dgs_corpus = tfds.load("dgs_corpus", builder_kwargs={"config": config})

# Print the German sentence and then the full example for the first 5
# training examples.
for datum in itertools.islice(dgs_corpus["train"], 5):
    german_bytes = datum["sentence"]["german"].numpy()
    sentence = german_bytes.decode("utf-8")
    print(sentence)
    print(datum)

# DGS Types

In [None]:
# Annotation-only configuration (include_video=False, include_pose=None,
# process_video=False).
config = SignDatasetConfig(
    name="only-annotations",
    version="1.0.0",
    include_video=False,
    include_pose=None,
    process_video=False,
)
dgs_types = tfds.load("dgs_types", builder_kwargs={"config": config})

# Print the first 10 training examples as-is.
for datum in itertools.islice(dgs_types["train"], 10):
    print(datum)

# Sign Suisse

In [None]:
# Annotation-only configuration (include_video=False, process_video=False).
config = SignDatasetConfig(
    name="only-annotations",
    version="1.0.0",
    include_video=False,
    process_video=False,
)
sign_suisse = tfds.load("sign_suisse", builder_kwargs={"config": config})

# Print the first 10 training examples as-is.
for datum in itertools.islice(sign_suisse["train"], 10):
    print(datum)

# NGT Corpus

In [None]:
%%capture
! pip install pympi-ling

In [None]:
from sign_language_datasets.datasets.ngt_corpus.ngt_corpus_utils import get_elan_sentences_ngt_corpus

# Annotation-only configuration (videos are not included).
config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False)
ngt = tfds.load(name="ngt_corpus", builder_kwargs=dict(config=config))

# For each of the first 10 training examples, print its id and then every
# sentence extracted from its ELAN (.eaf) file.
for datum in itertools.islice(ngt["train"], 10):
    sample_id = datum["id"].numpy().decode("utf-8")
    print(sample_id)

    elan_path = datum["paths"]["eaf"].numpy().decode("utf-8")
    sentences = get_elan_sentences_ngt_corpus(elan_path)
    for sentence in sentences:
        print(sentence)

# BSL Corpus

In [None]:
from sign_language_datasets.datasets.bsl_corpus.bsl_corpus_utils import get_elan_sentences_bsl_corpus

# This corpus requires a login; the credentials below are passed to the
# dataset builder.
BSLCP_USERNAME = ""
BSLCP_PASSWORD = ""

# Annotation-only configuration (include_video=False, include_pose=None).
config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False, include_pose=None)

builder_kwargs = dict(
    config=config,
    bslcp_username=BSLCP_USERNAME,
    bslcp_password=BSLCP_PASSWORD,
)
bslcp = tfds.load(name="bsl_corpus", builder_kwargs=builder_kwargs)

# For each of the first 10 training examples, print its id and then every
# sentence extracted from the first ELAN (.eaf) path of the example.
for datum in itertools.islice(bslcp["train"], 10):
    sample_id = datum["id"].numpy().decode("utf-8")
    print(sample_id)

    first_eaf = datum["paths"]["eaf"][0]
    elan_path = first_eaf.numpy().decode("utf-8")

    sentences = get_elan_sentences_bsl_corpus(elan_path)
    for sentence in sentences:
        print(sentence)

# WMT-SLT

Instructions and example code are here: https://github.com/sign-language-processing/datasets/blob/master/sign_language_datasets/datasets/wmt_slt/README.md