This notebook:
- Loads the project file from GitHub
- Loads the assets from the GitHub repo
- Installs the custom language object
- Converts the training data to spaCy's binary format
- Configures the project.yml file
- Trains the model
- Assesses performance
- Packages the model (or pushes it to Hugging Face)


In [3]:
# Clone the `newlang_project` template from the main branch of New-Languages-for-NLP/repo-template
!python -m spacy project clone newlang_project --repo https://github.com/New-Languages-for-NLP/repo-template --branch main

[38;5;2m✔ Cloned 'newlang_project' from New-Languages-for-NLP/repo-template[0m
/home/ds/projects/course-materials/w2/using-inception-data/newlang_project
[38;5;2m✔ Your project is now ready![0m
To fetch the assets, run:
python -m spacy project assets /home/ds/projects/course-materials/w2/using-inception-data/newlang_project


In [None]:
# %load /home/ds/projects/course-materials/w2/using-inception-data/newlang_project/project.yml
title: "Train new language model from cadet and inception data"
description: "This project template lets you train a part-of-speech tagger, morphologizer and dependency parser from your cadet and inception data.  Adapted from https://github.com/explosion/projects/blob/v3/pipelines/tagger_parser_ud/project.yml"

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  # Stem of the training config file, resolved as configs/<config>.cfg
  config: "default"
  # Language code: passed to training as --nlp.lang and used as the package folder prefix
  lang: "yo"
  # Name of the asset repo under New-Languages-for-NLP and of the per-treebank subfolders
  treebank: "yoruba"
  # File stems (without .conllu) that preprocess expects directly under assets/<treebank>/.
  # NOTE(review): these look inherited from the English EWT template; confirm they
  # match the files actually present in the treebank repo before running preprocess.
  train_name: "en_ewt-ud-train"
  dev_name: "en_ewt-ud-dev"
  test_name: "en_ewt-ud-test"
  # Must not contain spaces: it is interpolated unquoted into the
  # `spacy package --name` command line and becomes part of the package name.
  package_name: "new_language"
  package_version: "0.0.0"
  # GPU id handed to train/evaluate via --gpu-id; -1 selects the CPU
  gpu: -1

# Folders the project depends on. The project CLI creates any that are
# missing, so the commands below can rely on them being present.
directories:
  - "assets"
  - "corpus"
  - "training"
  - "metrics"
  - "configs"
  - "packages"

# Remote data fetched by `spacy project assets`: the treebank repo's main
# branch, placed under assets/<treebank>.
assets:
  - dest: 'assets/${vars.treebank}'
    git:
      repo: 'https://github.com/New-Languages-for-NLP/${vars.treebank}'
      path: ''
      branch: 'main'

# `all` chains the commands in order, from data conversion through packaging.
workflows:
  all: [preprocess, train, evaluate, package]

commands:
  # Converts the three CoNLL-U splits to .spacy files and renames them to
  # train/dev/test so the later commands can use fixed file names.
  - name: preprocess
    help: "Convert the data to spaCy's format"
    script:
      # -p: do not fail when the corpus folder already exists
      - "mkdir -p corpus/${vars.treebank}"
      # One conversion per split, all with the same converter options
      - "python -m spacy convert assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens"
      - "python -m spacy convert assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens"
      - "python -m spacy convert assets/${vars.treebank}/${vars.test_name}.conllu corpus/${vars.treebank}/ --converter conllu --n-sents 10 --merge-subtokens"
      # Rename <stem>.spacy to the split-independent names declared under outputs
      - "mv corpus/${vars.treebank}/${vars.train_name}.spacy corpus/${vars.treebank}/train.spacy"
      - "mv corpus/${vars.treebank}/${vars.dev_name}.spacy corpus/${vars.treebank}/dev.spacy"
      - "mv corpus/${vars.treebank}/${vars.test_name}.spacy corpus/${vars.treebank}/test.spacy"
    # Input files the command reads
    deps:
      - "assets/${vars.treebank}/${vars.train_name}.conllu"
      - "assets/${vars.treebank}/${vars.dev_name}.conllu"
      - "assets/${vars.treebank}/${vars.test_name}.conllu"
    # Files the command is expected to produce
    outputs:
      - "corpus/${vars.treebank}/train.spacy"
      - "corpus/${vars.treebank}/dev.spacy"
      - "corpus/${vars.treebank}/test.spacy"

  # Trains a pipeline from configs/<config>.cfg on the converted train/dev
  # splits and writes the run to training/<treebank>.
  - name: train
    help: "Train ${vars.treebank}"
    script:
      # The --paths.* and --nlp.lang arguments override the corresponding config values
      - "python -m spacy train configs/${vars.config}.cfg --output training/${vars.treebank} --gpu-id ${vars.gpu} --paths.train corpus/${vars.treebank}/train.spacy --paths.dev corpus/${vars.treebank}/dev.spacy --nlp.lang=${vars.lang}"
    deps:
      - "corpus/${vars.treebank}/train.spacy"
      - "corpus/${vars.treebank}/dev.spacy"
      - "configs/${vars.config}.cfg"
    # Only the best checkpoint is declared; evaluate and package both consume it
    outputs:
      - "training/${vars.treebank}/model-best"

  # Scores the best checkpoint on the test split and stores the result as
  # metrics/<treebank>.json.
  - name: evaluate
    help: "Evaluate on the test data and save the metrics"
    script:
      - "python -m spacy evaluate ./training/${vars.treebank}/model-best ./corpus/${vars.treebank}/test.spacy --output ./metrics/${vars.treebank}.json --gpu-id ${vars.gpu}"
    deps:
      - "training/${vars.treebank}/model-best"
      - "corpus/${vars.treebank}/test.spacy"
    outputs:
      - "metrics/${vars.treebank}.json"

  # Builds an installable package from the best checkpoint into packages/.
  - name: package
    help: "Package the trained model so it can be installed"
    script:
      # --force overwrites an existing package directory
      - "python -m spacy package training/${vars.treebank}/model-best packages --name ${vars.package_name} --version ${vars.package_version} --force"
    deps:
      - "training/${vars.treebank}/model-best"
    outputs_no_cache:
      # The archive carries the same <lang>_<name>-<version> prefix as its
      # folder; a hard-coded "en_" would never match a non-English pipeline.
      - "packages/${vars.lang}_${vars.package_name}-${vars.package_version}/dist/${vars.lang}_${vars.package_name}-${vars.package_version}.tar.gz"

  # Clears generated artifacts under training/, metrics/ and corpus/;
  # assets/, configs/ and packages/ are not touched.
  # NOTE(review): relies on `*` being expanded when the command runs; confirm
  # that this happens under `spacy project run`.
  - name: clean
    help: "Remove intermediate files"
    script:
      - "rm -rf training/*"
      - "rm -rf metrics/*"
      - "rm -rf corpus/*"


In [4]:
# Fetch the assets declared in project.yml. The project path is relative to the
# notebook's working directory so the cell is not tied to one machine's layout.
!python -m spacy project assets newlang_project

[38;5;4mℹ Fetching 1 asset(s)[0m
[38;5;2m✔ Downloaded asset
/home/ds/projects/course-materials/w2/using-inception-data/newlang_project/assets/yoruba[0m


In [5]:
# Inspect what the asset download placed in the treebank folder
# (path relative to the notebook's working directory).
!ls newlang_project/assets/yoruba

0_original_texts  2_new_language_object  4_trained_models  README.md
1_lookups_data	  3_inception_export	 LICENSE


In [None]:
# Install the custom language object from Cadet 


In [None]:
# Read the data files, split them into train/test sets, and convert them to spaCy binary files

In [None]:
# Debug the data 

In [None]:
# train the model

In [None]:
# Evaluate the model 

In [2]:
# Temporary reset: deletes the cloned project folder (path relative to the
# notebook's working directory). Run it manually before re-cloning; because it
# sits last in the notebook, a "Run All" would remove everything produced above.
!rm -rf newlang_project