# <h1> A Colab notebook for the Italian DeepSpeech model </h1>

## Install all the needed dependencies

In [1]:
# Shortcut: keep the value of $HOME (the /root folder on Colab) in the Python
# variable H, then make it the current working directory
H = %env HOME
%cd $H

/root


In [2]:
# DeepSpeech release number, exported as an environment variable so that later
# cells can read it (shell lines via $DS_VERSION, Python via os.environ)
%env DS_VERSION 0.8.0
# Get DeepSpeech

%cd $H
!git clone https://github.com/mozilla/DeepSpeech.git ./ds
%cd ds
# Pin the working tree to a fixed commit (leaves the repo in detached-HEAD state)
!git checkout f56b07dab4542eecfb72e059079db6c2603cc0ee

/root
Cloning into './ds'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 18619 (delta 8), reused 8 (delta 2), pack-reused 18594[K
Receiving objects: 100% (18619/18619), 47.71 MiB | 26.37 MiB/s, done.
Resolving deltas: 100% (12675/12675), done.
/root/ds
Note: checking out '88584941bc2ff5b91d6b11ad0a6b85da391d626b'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at 88584941 Merge pull request #3036 from lissyx/doc-fix


In [0]:
# Git LFS is needed to fetch kenlm.scorer (e.g. for bin/run-ldc93s1.sh).
# Left commented out to save disk space.

# !apt-get install git-lfs
# !git-lfs pull

In [None]:
# Tell colab to use TF 1.x version and then install DS dependencies
%tensorflow_version 1.x
!pip3 install --no-cache-dir --upgrade pip==20.0.2 wheel==0.34.2 setuptools==46.1.3
!pip3 install --no-cache-dir --upgrade -e .
!apt update
!apt-get install sox libsox-fmt-mp3 pixz

In [None]:
# Quick sanity check before going on
!./bin/run-tc-ldc93s1_new.sh 2 16000

# This one needs kenlm.scorer, so you first have to install git-lfs and run
# `git-lfs pull` inside the DeepSpeech repo
# !./bin/run-ldc93s1.sh 

In [None]:
# Extract the tiny CV sample dataset
!mkdir -p /mnt/extracted/data/cv-it_tiny
%env CV_TINY_PATH /mnt/extracted/data/cv-it_tiny
# The archive is streamed from wget straight into tar, so the .tar.gz itself is
# never written to disk
!wget -O - https://github.com/MozillaItalia/DeepSpeech-Italian-Model/files/4610711/cv-it_tiny.tar.gz | tar -zxv -C $CV_TINY_PATH

# ...and the Italian alphabet
!mkdir -p /mnt/models
!wget -O "/mnt/models/alphabet.txt" https://github.com/MozillaItalia/DeepSpeech-Italian-Model/raw/master/DeepSpeech/italian_alphabet.txt

## CV-IT and MAILABS complete dataset

**WARNING: the remaining disk space will not be enough to save training checkpoints**

In [None]:
# Enable the sections below if you want the complete CV-IT and M-AILABS datasets.
# Each section is disabled by being wrapped in a ''' string literal: remove the
# two ''' lines around a section to run its commands.
# Keep in mind that all this data takes around 30GB once decompressed
# Note: English compatibility is not handled right now

# M-AILABS
'''

%cd $H/ds
# Download and prepare M-AILABS
!python bin/import_m-ailabs.py ${IMPORT_AS_ENGLISH} \
  --filter_alphabet /mnt/models/alphabet.txt \
  --language it_IT                           \
  /mnt/extracted/data/M-AILABS/
# free some space removing the MAILABS tgz
!rm /mnt/extracted/data/M-AILABS/it_IT.tgz

'''

# CV-IT (Common Voice corpus, Italian)
'''
%cd $H/ds
# Download CV
!mkdir -p /mnt/sources
!wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/it.tar.gz -O /mnt/sources/it.tar.gz
# Prepare CV
!mkdir -p /mnt/extracted/data/cv-it/
!tar -C /mnt/extracted/data/cv-it/ -xf /mnt/sources/it.tar.gz
!python bin/import_cv2.py --filter_alphabet=/mnt/models/alphabet.txt /mnt/extracted/data/cv-it/
# free some space again
!rm /mnt/sources/it.tar.gz

'''

In [0]:
# Run this if you need to free some disk space
!rm -rf /swift/*
!pip uninstall -y torch
# Drop the pip download cache
!rm -rf $H/.cache/pip/*
# Empty the .git directories of the cloned repositories: the working trees stay,
# but their git history is gone afterwards
!rm -rf $H/DeepSpeech-Italian-Model/.git/*
!rm -rf $H/ds/.git/*
!rm -rf $H/kenlm/.git/*
!rm -rf /content/sample_data/*

Found existing installation: torch 1.5.0+cu101
Uninstalling torch-1.5.0+cu101:
  Successfully uninstalled torch-1.5.0+cu101


## **Setup your google drive now!**

Before running any other cell, mount your Google Drive and export the path where your model checkpoints will be stored.

You will probably need more disk space than Colab offers.


In [0]:
from google.colab import drive

# Folder inside the Colab VM under which Google Drive becomes visible
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Save your Google Drive path: replace the placeholder below with your own folder.
# The value is exported as the GDRIVE_PATH environment variable, which later
# cells read to locate checkpoints, logs and exported models.
%env GDRIVE_PATH /your/path/to/drive

# Training

### Download DeepSpeech checkpoints


In [None]:
!wget -O - "https://github.com/mozilla/DeepSpeech/releases/download/v$DS_VERSION/deepspeech-$DS_VERSION-checkpoint.tar.gz" | tar -zxv -C "$GDRIVE_PATH/$DS_VERSION/transfer_ckpts/eng"

### Setup all needed params and paths


In [39]:
import os
# lets put some model params

BATCH_SIZE=2 #128 from 0.7 release
N_HIDDEN=2048
EPOCHS=30
LEARNING_RATE=0.0001
DROPOUT=0.4 # from 0.7 release
LM_ALPHA = None # 0.931289039105002 # from 0.7 release
LM_BETA= None # 1.1834137581510284 # from 0.7 release
# BEAM_WIDTH=500
do_early_stop= True
use_amp= False # DS checkpoint are not compatible with AMP

#transfer params
DROP_SOURCE_LAYERS = 1

#other flags
MAX_TO_KEEP = 3 # max nr of checkpoints to keep

EARLY_STOP_FLAG="--noearly_stop"
if do_early_stop:
  print("do early stop")
  EARLY_STOP_FLAG="--early_stop"

AMP_FLAG=""
if use_amp:
  print("use automatic mixed precision")
  AMP_FLAG="--automatic_mixed_precision true"

!mkdir -p /mnt/sources/feature_cache || true

do early stop


In [None]:
# Some paths
#
# Customize them based on your needs!
#
# Everything is rooted at "<GDRIVE_PATH>/<DS_VERSION>", i.e. the same version
# folder the checkpoint-download cell extracts the release checkpoints into.
# (A hard-coded "0.7" root would not match that folder once DS_VERSION changes.)

# Indexing os.environ (instead of .get) fails with a KeyError naming the missing
# variable when one of the %env cells has not been run yet.
_gdrive_root = os.environ["GDRIVE_PATH"]
_ds_version = os.environ["DS_VERSION"]
_version_root = os.path.join(_gdrive_root, _ds_version)

ALPHABET_CONFIG_PATH = "/mnt/models/alphabet.txt"

# Where the Italian (transfer-learned) checkpoints are written
SAVE_CHECKPOINT_DIR = os.path.join(_version_root, "transfer_ckpts/ita")
# Where the downloaded English release checkpoints are read from
LOAD_CHECKPOINT_DIR = os.path.join(
  _version_root, "transfer_ckpts/eng", "deepspeech-" + _ds_version + "-checkpoint"
)

# When you set 2 different dirs during training, DeepSpeech will warn you that it
# will be impossible to evaluate the model with the test dataset.
CHECKPOINT_DIR = SAVE_CHECKPOINT_DIR

SCORER = os.path.join(_version_root, "kenlm.scorer")

EXPORT_FOLDER = _version_root

# dir for tensorboard and logs
SUMMARY_DIR = os.path.join(_version_root, "logs")

paths2print = [
  ALPHABET_CONFIG_PATH,
  SAVE_CHECKPOINT_DIR,
  LOAD_CHECKPOINT_DIR,
  CHECKPOINT_DIR,
  SCORER,
  EXPORT_FOLDER,
  SUMMARY_DIR,
]

# just for a quick check
for p in paths2print:
  print(p)

In [0]:
# Train, dev and test as list of path to .csv files
# Note: each dataset needs to be located under /mnt/extracted/data

all_train_csv=!(find /mnt/extracted/data/ -type f -name '*train.csv' -printf '%p,' | sed -e 's/,$//g')
all_dev_csv=!(find /mnt/extracted/data/ -type f -name '*dev.csv' -printf '%p,' | sed -e 's/,$//g')
all_test_csv=!(find /mnt/extracted/data/ -type f -name '*test.csv' -printf '%p,' | sed -e 's/,$//g')
ALL_TRAIN_CSV=all_train_csv[0]
ALL_DEV_CSV=all_dev_csv[0]
ALL_TEST_CSV=all_test_csv[0]


In [None]:
# Build the params string for DeepSpeech.py
#
# Every value goes through shlex.quote(), so paths containing spaces (or other
# shell-special characters) and the bracketed augmentation specs reach
# DeepSpeech.py as single, literal arguments -- no manual " " wrapping needed.
import shlex


def _to_cli(args):
  """Return the items of `args` as one shell-safe, space-separated string."""
  return " ".join(shlex.quote(str(arg)) for arg in args)


# If you don't want to use data augmentation, set this to False
use_augmentation = True
# transfer learning on/off
do_transfer_learning = True

# Using default lm_alpha and lm_beta
train_params = _to_cli(
  [
    "--summary_dir", SUMMARY_DIR,
    "--log_dir", SUMMARY_DIR,
    "--alphabet_config_path", ALPHABET_CONFIG_PATH,
    "--checkpoint_dir", CHECKPOINT_DIR,
    "--show_progressbar", "true",
    "--train_cudnn", "True",
  ]
  + AMP_FLAG.split()  # contributes nothing when AMP is switched off
  + [
    "--scorer", SCORER,
    "--train_files", ALL_TRAIN_CSV,
    "--dev_files", ALL_DEV_CSV,
    "--train_batch_size", BATCH_SIZE,
    "--dev_batch_size", BATCH_SIZE,
    "--n_hidden", N_HIDDEN,
    "--epochs", EPOCHS,
    "--learning_rate", LEARNING_RATE,
    "--dropout_rate", DROPOUT,
    "--max_to_keep", MAX_TO_KEEP,
    EARLY_STOP_FLAG,
  ]
)

transfer_params = ""
if do_transfer_learning:
  transfer_params = _to_cli([
    "--drop_source_layers", DROP_SOURCE_LAYERS,
    "--save_checkpoint_dir", SAVE_CHECKPOINT_DIR,
    "--load_checkpoint_dir", LOAD_CHECKPOINT_DIR,
  ])

augm = ""
if use_augmentation:
  augmentation_specs = [
    "reverb[p=0.1,delay=50.0~30.0,decay=10.0:2.0~1.0]",
    "resample[p=0.1,rate=12000:8000~4000]",
    "codec[p=0.1,bitrate=48000:16000]",
    "volume[p=0.1,dbfs=-10:-40]",
    "pitch[p=0.1,pitch=1~0.2]",
    "tempo[p=0.1,factor=1~0.5]",
    "frequency_mask[p=0.1,n=1:3,size=1:5]",
    "time_mask[p=0.1,domain=signal,n=3:10~2,size=50:100~40]",
    "dropout[p=0.1,rate=0.05]",
    "add[p=0.1,domain=signal,stddev=0~0.5]",
    "multiply[p=0.1,domain=features,stddev=0~0.5]",
  ]
  augm_args = [
    "--feature_cache", "/mnt/sources/feature_cache",
    "--cache_for_epochs", 10,
  ]
  for spec in augmentation_specs:
    augm_args += ["--augment", spec]
  augm = _to_cli(augm_args)

# Final argument string: training flags, then the optional groups that are enabled
params = " ".join(part for part in (train_params, transfer_params, augm) if part)

print(params)

### Start the training phase!


In [None]:
%cd $H/ds/
!set -xe
!python DeepSpeech.py $params

### Launch Tensorboard

In [None]:
# Load the TensorBoard notebook extension and point it at SUMMARY_DIR, the
# folder passed to DeepSpeech.py as --summary_dir
%load_ext tensorboard
%tensorboard --logdir "$SUMMARY_DIR"

### Evaluate


In [None]:
%cd $H/ds/
# After trainining/transfer/finetuning do some test here
# NOTE: SOO SLOW ..or maybe not? 
test_params = ' \
--alphabet_config_path '+ALPHABET_CONFIG_PATH+' \
--checkpoint_dir "'+CHECKPOINT_DIR+'" \
--show_progressbar true \
--load_evaluate best \
--scorer "'+SCORER+'" \
--train_cudnn True \
--test_files '+ALL_TEST_CSV+' \
--test_batch_size '+str(BATCH_SIZE)+' \
--verbosity 2'

!python DeepSpeech.py $test_params

### Export the .pb file


In [0]:
%cd $H/ds/
#export .pb file
exp_params = ' \
--alphabet_config_path '+ALPHABET_CONFIG_PATH+' \
--checkpoint_dir "'+CHECKPOINT_DIR+'" \
--show_progressbar true \
--load_evaluate "best" \
--scorer "'+SCORER+'" \
--lm_alpha '+str(LM_ALPHA)+' \
--lm_beta '+str(LM_BETA)+' \
--export_dir "'+EXPORT_FOLDER+'" \
--export_language "it" \
--verbosity 1'

!python -u DeepSpeech.py $exp_params

### Create the pbmm file


In [0]:
# Let's create the pbmm format.
# First download the convert_graphdef_memmapped_format tool (TensorFlow r1.15
# artifact) into the current directory, then use it to convert the exported
# output_graph.pb into output_graph.pbmm inside EXPORT_FOLDER.
!python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .
!./convert_graphdef_memmapped_format --in_graph="$EXPORT_FOLDER"/output_graph.pb --out_graph="$EXPORT_FOLDER"/output_graph.pbmm

Downloading https://community-tc.services.mozilla.com/api/index/v1/task/project.deepspeech.tensorflow.pip.r1.15.cpu/artifacts/public/convert_graphdef_memmapped_format ...
Downloading: 100%



In [0]:
# Print the full list of command-line flags accepted by DeepSpeech.py
!python DeepSpeech.py --helpfull