Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ repos:
- id: black
name: Format code
args: [--skip-string-normalization, --line-length=119]
additional_dependencies: ['click==8.0.2']
additional_dependencies: ['click>=8.0.2']
132 changes: 74 additions & 58 deletions Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
pipeline {
agent {
docker {
image 'tnitn_ci:py310'
args '--user 0:128 -v /home/jenkinsci:/home/jenkinsci -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""'
image 'tnitn_ci_py310:24.07'
args '-v /mnt/jenkins/jenkinsci:/home/jenkins -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""'
}
}
options {
Expand All @@ -11,33 +11,27 @@ pipeline {
}
environment {

AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-23-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-04-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-25-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-07-25-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0'
IT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0'
EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-04-24-0'
ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0'
ES_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-30-24-0'
FR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-07-25-0'
HU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
SV_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/11-13-24-0'
IT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-22-24-0'
HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-25-0'
DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {

stage('Add git safe directory'){
steps{
sh 'git config --global --add safe.directory /var/lib/jenkins/workspace/NTP_$GIT_BRANCH'
sh 'git config --global --add safe.directory /home/jenkinsci/workspace/NTP_$GIT_BRANCH'
}
}

stage('PyTorch version') {
steps {
Expand All @@ -46,14 +40,6 @@ pipeline {
}
}

stage('Install test requirements') {
steps {
sh 'apt-get update && apt-get install -y bc'
}
}



stage('NeMo Installation') {
steps {
sh './reinstall.sh release'
Expand All @@ -65,7 +51,10 @@ pipeline {
when {
anyOf {
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'

}
}
failFast true
Expand Down Expand Up @@ -97,6 +86,8 @@ pipeline {
when {
anyOf {
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand All @@ -120,6 +111,8 @@ pipeline {
when {
anyOf {
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand Down Expand Up @@ -156,7 +149,9 @@ pipeline {
stage('L0: Create AR TN/ITN Grammars') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand All @@ -176,10 +171,12 @@ pipeline {
}
}

stage('L0: Create FR TN/ITN & VI ITN & HU TN & IT TN') {
stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand All @@ -200,6 +197,11 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}'
}
}
stage('L0: VI TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}'
}
}
stage('L0: HU TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}'
Expand All @@ -216,7 +218,9 @@ pipeline {
stage('L0: Create RU TN/ITN Grammars & SV & PT') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand Down Expand Up @@ -258,7 +262,9 @@ pipeline {
stage('L0: Create HY TN/ITN Grammars & MR') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand All @@ -284,7 +290,9 @@ pipeline {
stage('L0: Create ZH TN/ITN Grammar') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand All @@ -305,7 +313,9 @@ pipeline {
stage('L0: Create JA ITN Grammars') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand All @@ -325,7 +335,9 @@ pipeline {
stage('L1: TN/ITN Tests CPU') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand Down Expand Up @@ -409,10 +421,12 @@ pipeline {
}
}

stage('L2: Sparrowhawk Tests') {
stage('L2: EN Sparrowhawk Tests') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
Expand Down Expand Up @@ -441,54 +455,56 @@ pipeline {

}
}

stage('L2: NeMo text processing') {
when {
anyOf {
branch 'main'
branch 'main'
branch 'staging/**'
branch 'staging_*'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('L2: Eng TN') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && \
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && \
cd tools/text_processing_deployment && python pynini_export.py --output=$NORM_OUTPUT_DIR --grammars=tn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $NORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkinsci/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/jenkins/TestData/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=$NORM_OUTPUT_DIR/test.pynini.txt --verbose && \
cat $NORM_OUTPUT_DIR/test.pynini.txt && \
cmp --silent $NORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_norm/ci/test_goal_py.txt || exit 1 && \
cmp --silent $NORM_OUTPUT_DIR/test.pynini.txt /home/jenkins/TestData/text_norm/ci/test_goal_py.txt || exit 1 && \
rm -rf $NORM_OUTPUT_DIR'
}
}

stage('L2: Eng ITN export') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && \
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && \
cd tools/text_processing_deployment && python pynini_export.py --output=$DENORM_OUTPUT_DIR --grammars=itn_grammars --cache_dir ${EN_TN_CACHE} --language=en && ls -R $DENORM_OUTPUT_DIR && echo ".far files created "|| exit 1'
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkinsci/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkinsci/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/jenkins/TestData/text_denorm/ci/test.txt --language=en --output_file=$DENORM_OUTPUT_DIR/test.pynini.txt --verbose && \
cmp --silent $DENORM_OUTPUT_DIR/test.pynini.txt /home/jenkins/TestData/text_denorm/ci/test_goal_py.txt || exit 1 && \
rm -rf $DENORM_OUTPUT_DIR'
}
}


stage('L2: Eng alignment TN') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \
cd nemo_text_processing/fst_alignment && python alignment.py --text="2615 Forest Av, 90501 CA, Santa Clara. 10kg, 12/16/2018" --grammar=tn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_tn_True_deterministic_cased__tokenize.far 2>&1 | tee $NORM_OUTPUT_DIR/pred.txt && \
cmp --silent $NORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_norm/ci/alignment_gold.txt || exit 1 && \
cmp --silent $NORM_OUTPUT_DIR/pred.txt /home/jenkins/TestData/text_norm/ci/alignment_gold.txt || exit 1 && \
rm -rf $NORM_OUTPUT_DIR'
}
}

stage('L2: Eng alignment ITN') {
steps {
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkinsci/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \
cd nemo_text_processing/fst_alignment && python alignment.py --text="one million twenty three thousand two hundred eleven ten kilograms one hundred twenty three dollars and twenty five cents" --grammar=itn --rule=tokenize_and_classify --fst=${EN_TN_CACHE}/en_itn_lower_cased.far 2>&1 | tee $DENORM_OUTPUT_DIR/pred.txt && \
cmp --silent $DENORM_OUTPUT_DIR/pred.txt /home/jenkinsci/TestData/text_denorm/ci/alignment_gold.txt || exit 1 && \
cmp --silent $DENORM_OUTPUT_DIR/pred.txt /home/jenkins/TestData/text_denorm/ci/alignment_gold.txt || exit 1 && \
rm -rf $DENORM_OUTPUT_DIR'
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ $ đô la mỹ
₩ won
₩ uôn
RM ringgit
₫ đồng
£ bảng anh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
- gạch
_ gạch dưới
_ shift gạch
_ shift trừ
_ síp gạch
! chấm than
# thăng
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
. chấm
- gạch
- gạch ngang
_ gạch dưới
_ shift gạch
_ shift trừ
_ síp gạch
/ sẹc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, '"').optimize()

NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
Expand Down
Loading