This notebook:
- Loads the project file from GitHub
- Loads assets from the GitHub repo
- Installs the custom language object
- Converts the training data to spaCy binary format
- Configures the project.yml file
- Trains the model
- Assesses performance
- Packages the model (or pushes it to Hugging Face)


In [4]:
# Temporary cleanup: delete any existing project folder so the project can be
# cloned again from scratch.
# NOTE(review): this is a hard-coded absolute path that only exists on the
# original host; adjust it (or use a path relative to the notebook) elsewhere.
!rm -rf /srv/projects/course-materials/w2/using-inception-data/newlang_project


In [5]:
private_repo = False #@param {type:"boolean"}
repo_name = "repo-template" #@param {type:"string"}

!rm -rf /content/newlang_project
!rm -rf $repo_name
if private_repo:
    git_access_token = "" #@param {type:"string"}
    git_url = f"https://{git_access_token}@github.com/New-Languages-for-NLP/{repo_name}/"
    !git clone $git_url  -b main
    !cp -r ./$repo_name/newlang_project .  
    !mkdir newlang_project/assets/
    !mkdir newlang_project/configs/
    !mkdir newlang_project/corpus/
    !mkdir newlang_project/metrics/
    !mkdir newlang_project/packages/
    !mkdir newlang_project/training/
    !mkdir newlang_project/assets/$repo_name
    !cp -r ./$repo_name/* newlang_project/assets/$repo_name/
    !rm -rf ./$repo_name
else:
    !python -m spacy project clone newlang_project --repo https://github.com/New-Languages-for-NLP/$repo_name --branch main
    !python -m spacy project assets /srv/projects/course-materials/w2/using-inception-data/newlang_project

[38;5;2m✔ Cloned 'newlang_project' from New-Languages-for-NLP/repo-template[0m
/srv/projects/course-materials/w2/using-inception-data/newlang_project
[38;5;2m✔ Your project is now ready![0m
To fetch the assets, run:
python -m spacy project assets /srv/projects/course-materials/w2/using-inception-data/newlang_project
[38;5;4mℹ Fetching 1 asset(s)[0m
[38;5;2m✔ Downloaded asset
/srv/projects/course-materials/w2/using-inception-data/newlang_project/assets/urban-giggle[0m


In [6]:
# Install the custom language object from Cadet.
# Runs the project's `install` command; per the logged output it copies the
# language object from the fetched assets into lang/<code>/ and pip-installs
# it in editable mode (`pip install -e`).
# NOTE(review): hard-coded absolute project path — only valid on the original host.
!python -m spacy project run install /srv/projects/course-materials/w2/using-inception-data/newlang_project

[1m
Running command: rm -rf lang
Running command: mkdir lang
Running command: mkdir lang/yi
Running command: cp -r assets/urban-giggle/2_new_language_object/ lang/yi/yi
Running command: mv lang/yi/yi/setup.py lang/yi/
Running command: /srv/projects/course-materials/w2/venv/bin/python -m pip install -e lang/yi
Obtaining file:///srv/projects/course-materials/w2/using-inception-data/newlang_project/lang/yi
Installing collected packages: yi
  Attempting uninstall: yi
    Found existing installation: yi 0.0.0
    Uninstalling yi-0.0.0:
      Successfully uninstalled yi-0.0.0
  Running setup.py develop for yi
Successfully installed yi


In [2]:
# Create the training config.
# Runs the project's `config` command; per the logged output it calls
# `spacy init config config.cfg` for the project's language and then
# post-processes the result with scripts/update_config.py.
# NOTE(review): hard-coded absolute project path — only valid on the original host.
!python -m spacy project run config /srv/projects/course-materials/w2/using-inception-data/newlang_project

[1m
Running command: /srv/projects/course-materials/w2/venv/bin/python -m spacy init config config.cfg --lang yi -F
[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: yi
- Pipeline: tagger, parser, ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
Running command: /srv/projects/course-materials/w2/venv/bin/python scripts/update_config.py urban-giggle false


In [8]:
# Convert the CoNLL-U files exported from INCEpTION to spaCy's binary format.
# Currently requires a manual edit to spacy/training/converters/conllu_to_docs.py
# (around line 194) so that a placeholder POS value is treated as empty:
# if pos == "_":
#     pos = ""
# NOTE(review): the saved output of this cell ends in a (truncated) traceback,
# so the conversion did not complete for every input file — investigate.
# NOTE(review): -F is presumably spaCy's "force" flag (re-run even if nothing
# changed) — confirm against the spaCy CLI docs.

!python -m spacy project run convert /srv/projects/course-materials/w2/using-inception-data/newlang_project -F

[1m
Running command: mkdir -p ./corpus/conllu
Running command: mkdir -p ./corpus/conll
Running command: mkdir -p ./corpus/converted
Running command: /srv/projects/course-materials/w2/venv/bin/python scripts/convert.py assets/urban-giggle/3_inception_export 10 yi
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (108 documents):
corpus/converted/cu_proiel-ud-dev.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (115 documents):
corpus/converted/cu_proiel-ud-test.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (413 documents):
corpus/converted/cu_proiel-ud-train.spacy[0m
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_cod

In [3]:
# Read the converted data files and write the spaCy training, validation and
# test sets (the logged output reports the number of docs in each).
# Runs the project's `split` command, which calls scripts/split.py.
# NOTE(review): the script is invoked with `0.2 11 <lang>`; presumably a split
# ratio and a random seed — confirm in scripts/split.py.
!python -m spacy project run split /srv/projects/course-materials/w2/using-inception-data/newlang_project -F

[1m
Running command: /srv/projects/course-materials/w2/venv/bin/python scripts/split.py 0.2 11 yi
🚂 Created 1017 training docs
😊 Created 204 validation docs
🧪  Created 51 test docs


In [4]:
# Debug the data.
# Runs the project's `debug` command, i.e. `spacy debug data ./config.cfg`
# (see the logged output), to sanity-check the corpus before training.
# NOTE(review): the saved output warns that some training examples also occur
# in the evaluation data — scores may be inflated until the split is fixed.
!python -m spacy project run debug  /srv/projects/course-materials/w2/using-inception-data/newlang_project

[1m
Running command: /srv/projects/course-materials/w2/venv/bin/python -m spacy debug data ./config.cfg
[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: yi
Training pipeline: tok2vec, tagger, parser, ner
1017 training docs
204 evaluation docs
[38;5;3m⚠ 166 training examples also in evaluation data[0m
[38;5;3m⚠ Low number of examples to train a new pipeline (1017)[0m
[1m
[38;5;4mℹ 92587 total word(s) in the data (9854 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 0 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[1m
[38;5;4mℹ 23 label(s) in train data[0m
[1m
[38;5;4mℹ Found 10138 sentence(s) with an average length of 9.1 words.[0m
[38;5;4mℹ Found 824 nonprojective train sentenc

In [5]:
# Train the model.
# Runs the project's `train` command; per the logged output this is
# `spacy train config.cfg --output training/<asset-name> --gpu-id -1`,
# i.e. training on CPU and saving under the project's training/ folder.
# NOTE(review): hard-coded absolute project path — only valid on the original host.
!python -m spacy project run train /srv/projects/course-materials/w2/using-inception-data/newlang_project

[1m
Running command: /srv/projects/course-materials/w2/venv/bin/python -m spacy train config.cfg --output training/urban-giggle --gpu-id -1 --nlp.lang=yi
[38;5;4mℹ Saving to output directory: training/urban-giggle[0m
[38;5;4mℹ Using CPU[0m
[1m
[2021-12-28 15:40:27,039] [INFO] Set up nlp object from config
[2021-12-28 15:40:27,047] [INFO] Pipeline: ['tok2vec', 'tagger', 'parser', 'ner']
[2021-12-28 15:40:27,051] [INFO] Created vocabulary
[2021-12-28 15:40:27,051] [INFO] Finished initializing nlp object
[2021-12-28 15:40:31,912] [INFO] Initialized pipeline components: ['tok2vec', 'tagger', 'parser', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  TAG_ACC  DEP_UAS  DEP_LAS  SENTS_F  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  -----------  -----------  --------  -------  -------  -------  -------  ------ 

In [None]:
# Evaluate the model.
# Runs the project's `evaluate` command (defined in the project's project.yml).
# NOTE(review): no output is saved for this cell — re-run it after training.
# NOTE(review): hard-coded absolute project path — only valid on the original host.
!python -m spacy project run evaluate /srv/projects/course-materials/w2/using-inception-data/newlang_project

In [1]:
# Package the model 
!mkdir ./export 
!python -m spacy package ./newlang_project/training/urban-giggle/model-last ./export 

/bin/bash: python: command not found


python: can't open file 'prodigy': [Errno 2] No such file or directory
