# A Colab notebook for the Italian DeepSpeech model

## Install all the needed dependencies

In [None]:
# Shortcut for the home folder (/root in the original note):
# read HOME from the environment into the Python variable H
H = %env HOME
# Make the home folder the working directory
%cd $H

In [None]:
# DeepSpeech version, exported as an environment variable
%env DS_VERSION 0.8.0
# Get DeepSpeech

%cd $H
# Clone the repository into ./ds, relative to the folder entered just above
!git clone https://github.com/mozilla/DeepSpeech.git ./ds
%cd ds
# Pin the checkout to one fixed commit
# NOTE(review): presumably the commit that matches DS_VERSION - confirm
!git checkout f56b07dab4542eecfb72e059079db6c2603cc0ee

In [None]:
# Tell Colab to use the TF 1.x runtime, then install the DeepSpeech dependencies
%tensorflow_version 1.x
# Pin the packaging tools to fixed versions before installing the package itself
!pip3 install --no-cache-dir --upgrade pip==20.0.2 wheel==0.34.2 setuptools==46.1.3
# Editable install of the checkout in the current folder
# NOTE(review): DS_NOTENSORFLOW=y presumably tells setup.py not to pull in
# TensorFlow (already provided by the runtime) - confirm against setup.py
!DS_NOTENSORFLOW=y pip3 install --no-cache-dir --upgrade -e .
# System packages for audio handling. `autoremove` also gets -y: without it apt
# asks for confirmation as soon as there is something to remove, which stalls or
# aborts the chain and skips the final `apt clean`.
!apt update && apt install -y --no-install-recommends sox libsox-fmt-mp3 pixz && apt autoremove -y && apt clean

In [None]:
# Simple sanity check before going on (run from inside the DeepSpeech checkout,
# since the script path is relative)
# NOTE(review): the two arguments look like an epoch count and a sample rate -
# confirm against the script
!./bin/run-tc-ldc93s1_new.sh 2 16000

# This one needs kenlm.scorer, so you first have to install git-lfs and run
# `git lfs pull` inside the DeepSpeech repo
# !./bin/run-ldc93s1.sh 

In [None]:
# The Italian alphabet: create the models folder and download the alphabet
# file into it as /mnt/models/alphabet.txt
!mkdir -p /mnt/models
!wget -O "/mnt/models/alphabet.txt" https://github.com/MozillaItalia/DeepSpeech-Italian-Model/raw/master/DeepSpeech/italian_alphabet.txt

## tinyCV-IT, CV-IT and M-AILABS datasets: uncomment and choose your fighter(s)!

**WARNING: always keep an eye on the available disk space**

In [None]:
# Uncomment the dataset(s) you need. Don't use tinyCV-IT and CV-IT together :)
# Each recipe below is wrapped in a triple-quoted string, so as written nothing
# in this cell is executed: remove the two quote lines around a recipe to enable it.
# Keep in mind that all of this data, once decompressed, takes around 30GB
# Note: English compatibility is not handled right now

# tinyCV-IT (just for testing)
'''

!mkdir -p /mnt/extracted/data/cv-it_tiny
%env CV_TINY_PATH /mnt/extracted/data/cv-it_tiny
!wget -O - https://github.com/MozillaItalia/DeepSpeech-Italian-Model/files/4610711/cv-it_tiny.tar.gz | tar -zxv -C $CV_TINY_PATH

'''


# M-AILABS
'''

%cd $H/ds
# Download and prepare M-AILABS
!python bin/import_m-ailabs.py ${IMPORT_AS_ENGLISH} \
  --filter_alphabet /mnt/models/alphabet.txt \
  --language it_IT                           \
  /mnt/extracted/data/M-AILABS/
# free some space removing the MAILABS tgz
!rm /mnt/extracted/data/M-AILABS/it_IT.tgz

'''

# CV-IT (Common Voice corpus 5.1, Italian)
'''
%cd $H/ds
# Download CV
!mkdir -p /mnt/sources
!wget https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-5.1-2020-06-22/it.tar.gz -O /mnt/sources/it.tar.gz
# Prepare CV
!mkdir -p /mnt/extracted/data/cv-it/
!tar -C /mnt/extracted/data/cv-it/ --strip-components=2 -xf /mnt/sources/it.tar.gz
# FIX STEREO FILES
%cd /mnt/extracted/data/cv-it/clips
!mv common_voice_it_21431109.mp3 common_voice_it_21431109_.mp3
!mv common_voice_it_21431655.mp3 common_voice_it_21431655_.mp3
!sox common_voice_it_21431109_.mp3 common_voice_it_21431109.mp3 remix 1,2
!sox common_voice_it_21431655_.mp3 common_voice_it_21431655.mp3 remix 1,2
!rm common_voice_it_21431109_.mp3 common_voice_it_21431655_.mp3
# START MP3->WAV CONVERSION AND CSV PREPARATION
%cd $H/ds
!python bin/import_cv2.py --filter_alphabet=/mnt/models/alphabet.txt /mnt/extracted/data/cv-it/

# after mp3->wav conversion we can remove all mp3 files
%cd /mnt/extracted/data/cv-it/clips
!find . -name "*.mp3" -type f|xargs rm -f
# remove the CV tar
!rm /mnt/sources/it.tar.gz
'''

In [None]:
# Run this if you need to free some disk space
# Optional, more aggressive clean-ups (disabled):
# !rm -rf /swift/*
# !pip uninstall -y torch
# Drop the pip download cache
!rm -rf $H/.cache/pip/*
# Empty the .git folders of the cloned repositories: the working trees stay,
# but their git metadata is deleted, so do this only after the checkouts are final
!rm -rf $H/DeepSpeech-Italian-Model/.git/*
!rm -rf $H/ds/.git/*
!rm -rf $H/kenlm/.git/*
# Remove everything inside /content/sample_data
!rm -rf /content/sample_data/*

## **Setup your google drive now!**

Before running the remaining cells, export your Google Drive path: it is where your model checkpoints will be stored.

You will probably need more space than Colab offers.


In [None]:
# Mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:


# Edit your base Google Drive path here. Pay attention to spaces in the path
# (e.g. "My Drive"): the value below contains one.
%env GDRIVE_PATH /content/drive/My Drive/DeepSpeech



In [None]:
# Some paths
import os
# Customize them based on your needs!


def _require_env(name):
  """Return the value of the environment variable `name`.

  Raises:
    RuntimeError: if the variable is unset or empty. The message names the
      variable, so it is obvious which setup cell has not been run yet
      (instead of an opaque TypeError raised later by os.path.join).
  """
  value = os.environ.get(name)
  if not value:
    raise RuntimeError(
        "Environment variable " + name + " is not set: run the cell that "
        "defines it (%env " + name + " ...) before this one")
  return value


# Read both settings once; private lowercase names so they do not shadow the
# environment variables of the same name in later `!` shell lines
_gdrive_path = _require_env("GDRIVE_PATH")
_ds_version = _require_env("DS_VERSION")

paths2print = []

ALPHABET_CONFIG_PATH="/mnt/models/alphabet.txt"
paths2print.append(ALPHABET_CONFIG_PATH)

# Italian checkpoints written during training
SAVE_CHECKPOINT_DIR = os.path.join(_gdrive_path,"ckpts/ita/deepspeech-"+_ds_version+"-checkpoint")
paths2print.append(SAVE_CHECKPOINT_DIR)
# English checkpoints read when doing transfer learning
LOAD_CHECKPOINT_DIR = os.path.join(_gdrive_path,"ckpts/eng/deepspeech-"+_ds_version+"-checkpoint")
paths2print.append(LOAD_CHECKPOINT_DIR)

# When you set 2 different folders during training, DeepSpeech will warn you that it will
# be impossible to evaluate the model with the test dataset.
CHECKPOINT_DIR = SAVE_CHECKPOINT_DIR
paths2print.append(CHECKPOINT_DIR)

# IMPORTANT! Copy your own scorer to the SCORER path!
SCORER = os.path.join(_gdrive_path,"0.8/kenlm.scorer")
paths2print.append(SCORER)

# Folder receiving the exported model files
EXPORT_FOLDER = os.path.join(_gdrive_path,"0.8")
paths2print.append(EXPORT_FOLDER)

# Dir for tensorboard and logs
SUMMARY_DIR = os.path.join(_gdrive_path,"0.8/logs")
paths2print.append(SUMMARY_DIR)

# Just for a quick check
for p in paths2print:
  print(p)

In [None]:
# Put the DeepSpeech English checkpoints here to load the model weights during transfer learning
!mkdir -p "$LOAD_CHECKPOINT_DIR"
# The Italian checkpoints are saved in this directory during the training phase
!mkdir -p "$SAVE_CHECKPOINT_DIR"
# Here you can find all the log files for tensorboard
!mkdir -p "$SUMMARY_DIR"
# Print everything found under the base Drive folder (including the directories just created)
!du -a "$GDRIVE_PATH" 

# Example layout with the default settings:
#
# base dir:        /content/drive/My Drive/DeepSpeech
# eng checkpoints: /content/drive/My Drive/DeepSpeech/ckpts/eng/deepspeech-0.8.0-checkpoint
# ita checkpoints: /content/drive/My Drive/DeepSpeech/ckpts/ita/deepspeech-0.8.0-checkpoint
# scorer:          /content/drive/My Drive/DeepSpeech/0.8/kenlm.scorer
# logs:            /content/drive/My Drive/DeepSpeech/0.8/logs

# Training

### Setup all needed params and paths


In [None]:
'''
PARAM VALUES ARE SET FOR THE TINY DATASET!
'''

# --- core hyper-parameters (values sized for the tiny dataset) ---
# For a real run: batch size 128 in the latest release (64 to avoid
# out-of-memory errors) and 30 epochs.
BATCH_SIZE = 2
EPOCHS = 2
N_HIDDEN = 2048
LEARNING_RATE = 0.0001
DROPOUT = 0.4  # from latest release

# Decoder settings, currently not set in this cell:
# LM_ALPHA = None # 0.931289039105002 # from latest release
# LM_BETA= None # 1.1834137581510284 # from latest release
# BEAM_WIDTH=500

# --- early stopping ---
do_early_stop = True
if do_early_stop:
  print("do early stop")
  EARLY_STOP_FLAG = "--early_stop --es_epochs 10"
else:
  EARLY_STOP_FLAG = "--noearly_stop"

# --- transfer learning ---
DROP_SOURCE_LAYERS = 1

# --- other flags ---
MAX_TO_KEEP = 3  # max number of checkpoints to keep
AMP_FLAG = "--automatic_mixed_precision true"

# Folder for the feature cache; `|| true` keeps the line from failing
!mkdir -p /mnt/sources/feature_cache || true

In [None]:
# Train, dev and test sets, each as a comma-separated list of paths to .csv files
# Note: each dataset needs to be located under /mnt/extracted/data

# `find` prints every match followed by a comma and `sed` strips the last comma,
# so each capture holds a single line. stderr is discarded because the `!` capture
# merges it with stdout: an error message would otherwise be taken for a file path.
all_train_csv=!(find /mnt/extracted/data/ -type f -name '*train.csv' -printf '%p,' 2>/dev/null | sed -e 's/,$//g')
all_dev_csv=!(find /mnt/extracted/data/ -type f -name '*dev.csv' -printf '%p,' 2>/dev/null | sed -e 's/,$//g')
all_test_csv=!(find /mnt/extracted/data/ -type f -name '*test.csv' -printf '%p,' 2>/dev/null | sed -e 's/,$//g')


def _csv_list(captured, pattern):
  """Return the comma-separated path list held by a `!find` capture.

  Args:
    captured: list of output lines produced by one of the captures above.
    pattern: the file-name pattern that was searched, used in the error message.

  Raises:
    FileNotFoundError: if nothing matched. An empty capture has no first line,
      so indexing it directly would raise a bare IndexError instead.
  """
  joined = captured[0] if captured else ""
  if not joined:
    raise FileNotFoundError(
        "No " + pattern + " file found under /mnt/extracted/data/: "
        "import at least one dataset before running this cell")
  return joined


ALL_TRAIN_CSV=_csv_list(all_train_csv, "*train.csv")
ALL_DEV_CSV=_csv_list(all_dev_csv, "*dev.csv")
ALL_TEST_CSV=_csv_list(all_test_csv, "*test.csv")


In [None]:
# Build the params string for DeepSpeech.py
# The result (`params`) is a single line of command-line flags, printed at the
# end of the cell for inspection.

'''
Note if your paths contain spaces just wrap them inside " "
eg:

--scorer "'+SCORER+'" \

'''

# Placeholder; replaced by train_params a few lines below
params = ""

# Using default lm_alpha and lm_beta
# Every line of the literal ends with a backslash, so the string continues on
# the next line without containing a newline; the space in front of each
# backslash is what separates one flag from the next.
train_params = ' \
--summary_dir "'+SUMMARY_DIR+'" \
--log_dir "'+SUMMARY_DIR+'" \
--alphabet_config_path '+ALPHABET_CONFIG_PATH+' \
--checkpoint_dir "'+CHECKPOINT_DIR+'" \
--show_progressbar true \
--train_cudnn True \
--scorer "'+SCORER+'" \
--train_files '+ALL_TRAIN_CSV+' \
--dev_files '+ALL_DEV_CSV+' \
--train_batch_size '+str(BATCH_SIZE)+' \
--dev_batch_size '+str(BATCH_SIZE)+' \
--n_hidden '+str(N_HIDDEN)+' \
--epochs '+str(EPOCHS)+' \
--learning_rate '+str(LEARNING_RATE)+' \
--dropout_rate '+str(DROPOUT)+' \
--max_to_keep '+str(MAX_TO_KEEP)+' \
'+EARLY_STOP_FLAG

params = train_params

# Set this to False if you don't want to use data augmentation
use_augmentation = False
# transfer learning on/off
do_transfer_learning = False


if do_transfer_learning:
  # Some pieces below end with a backslash that has no space before it: there the
  # leading indentation of the following line is what separates the flags, so
  # keep these lines indented.
  transfer_params = '\
  --drop_source_layers '+str(DROP_SOURCE_LAYERS)+'\
  --save_checkpoint_dir "'+SAVE_CHECKPOINT_DIR+'" \
  --load_checkpoint_dir "'+LOAD_CHECKPOINT_DIR+'"'

  params+=transfer_params
  # The mixed-precision flag is dropped when transfer learning is on. This
  # overwrites AMP_FLAG for the rest of the session: after switching
  # do_transfer_learning back to False, re-run the cell that defines AMP_FLAG.
  AMP_FLAG=""
  # Download the English release checkpoints for DS_VERSION, unpack them into
  # LOAD_CHECKPOINT_DIR and delete the archive
  !wget -O eng_checkpoints.tar.gz "https://github.com/mozilla/DeepSpeech/releases/download/v$DS_VERSION/deepspeech-$DS_VERSION-checkpoint.tar.gz"
  !tar -zxv -f eng_checkpoints.tar.gz --strip 1 -C "$LOAD_CHECKPOINT_DIR"
  !rm eng_checkpoints.tar.gz

if use_augmentation:
  # Feature cache settings plus one --augment flag per augmentation; as above,
  # the leading indentation of each line separates the flags
  augm = '\
  --feature_cache /mnt/sources/feature_cache \
  --cache_for_epochs 10 \
  --augment reverb[p=0.1,delay=50.0~30.0,decay=10.0:2.0~1.0] \
  --augment resample[p=0.1,rate=12000:8000~4000] \
  --augment codec[p=0.1,bitrate=48000:16000] \
  --augment volume[p=0.1,dbfs=-10:-40] \
  --augment pitch[p=0.1,pitch=1~0.2] \
  --augment tempo[p=0.1,factor=1~0.5] \
  --augment frequency_mask[p=0.1,n=1:3,size=1:5] \
  --augment time_mask[p=0.1,domain=signal,n=3:10~2,size=50:100~40] \
  --augment dropout[p=0.1,rate=0.05] \
  --augment add[p=0.1,domain=signal,stddev=0~0.5] \
  --augment multiply[p=0.1,domain=features,stddev=0~0.5]'
  params+=augm

# Append the mixed-precision flag last (empty when transfer learning is on)
params+=" "+AMP_FLAG
print(params)

### Start the training phase!


### WHILE TRAINING, REMEMBER TO EMPTY YOUR DRIVE TRASH FOLDER!
#### If you are on Linux, you can combine google-drive-ocamlfuse with the `watch` command to empty the .Trash/ folder periodically.


In [None]:
# Run the training from inside the DeepSpeech checkout
%cd $H/ds/
# Every `!` line is executed by its own shell, so a separate `!set -xe` line
# cannot change how the command below runs; it was a no-op and has been removed.
# $params is expanded from the Python variable of the same name.
!python DeepSpeech.py $params

### Launch Tensorboard

In [None]:
# Load the TensorBoard notebook extension and point it at the log folder
# (quoted because the path may contain spaces)
%load_ext tensorboard
%tensorboard --logdir "$SUMMARY_DIR"

### LM OPTIMIZER


In [None]:
# Set to True to actually launch lm_optimizer.py at the end of this cell.
# (The flag used to be defined as RUN_OPTMIZER while the `if` below tested
# RUN_OPTIMIZER, so the cell always stopped with a NameError.)
RUN_OPTIMIZER = False

%cd $H/ds/
# From DS, default values: ALPHA and BETA MAX=5, N_TRIALS=2400
LM_ALPHA_MAX=5
LM_BETA_MAX=5
LM_N_TRIALS=600
# USING DEV SET FOR TUNING! TEST MUST BE THE LAST THING TO BE EVALUATED
# The literal is one line of flags: each trailing backslash continues the string
# without a newline, and the space before it separates the flags.
opt_params=' \
--alphabet_config_path '+ALPHABET_CONFIG_PATH+' \
--checkpoint_dir "'+CHECKPOINT_DIR+'" \
--show_progressbar true \
--train_cudnn True \
'+AMP_FLAG+' \
--scorer "'+SCORER+'" \
--test_files '+ALL_DEV_CSV+' \
--test_batch_size '+str(BATCH_SIZE)+' \
--n_hidden '+str(N_HIDDEN)+' \
--n_trials '+str(LM_N_TRIALS)+' \
--lm_alpha_max '+str(LM_ALPHA_MAX)+' \
--lm_beta_max '+str(LM_BETA_MAX)+' \
--feature_cache /mnt/sources/feature_cache'

if RUN_OPTIMIZER:
    !python lm_optimizer.py $opt_params

### Evaluate

#### If you have run lm_optimizer, note the best ALPHA and BETA values and run generate_scorer_package again with them, using the --default_alpha and --default_beta flags

In [None]:
%cd $H/ds/
# After training/transfer/finetuning, evaluate on the test set here.
# --n_hidden is forwarded like in the training and optimizer cells, so the
# evaluation graph has the same layer width as the checkpoint when N_HIDDEN is
# changed from its default.
# The literal is one line of flags: each trailing backslash continues the string
# without a newline, and the space before it separates the flags.
test_params = ' \
--alphabet_config_path '+ALPHABET_CONFIG_PATH+' \
--checkpoint_dir "'+CHECKPOINT_DIR+'" \
--show_progressbar true \
--load_evaluate best \
--scorer "'+SCORER+'" \
--train_cudnn True \
--n_hidden '+str(N_HIDDEN)+' \
--test_files '+ALL_TEST_CSV+' \
--test_batch_size '+str(BATCH_SIZE)

!python DeepSpeech.py $test_params

### Export the .pb file


In [None]:
%cd $H/ds/
# Export the .pb file into EXPORT_FOLDER.
# --n_hidden is forwarded like in the training and optimizer cells, so the
# exported graph has the same layer width as the checkpoint when N_HIDDEN is
# changed from its default.
# The literal is one line of flags: each trailing backslash continues the string
# without a newline, and the space before it separates the flags.
exp_params = ' \
--alphabet_config_path '+ALPHABET_CONFIG_PATH+' \
--checkpoint_dir "'+CHECKPOINT_DIR+'" \
--show_progressbar true \
--load_evaluate "best" \
--scorer "'+SCORER+'" \
--n_hidden '+str(N_HIDDEN)+' \
--export_dir "'+EXPORT_FOLDER+'" \
--export_language "it" \
--verbosity 1'

# -u: unbuffered Python output, so the log lines show up as they are produced
!python -u DeepSpeech.py $exp_params

### Create the pbmm file


In [None]:
# Let's create the pbmm format.
# Enter the DeepSpeech checkout explicitly, as the other cells of this section
# do: both commands below use relative paths and would otherwise depend on
# whichever cell happened to run last.
%cd $H/ds/
# Download the convert_graphdef_memmapped_format tool into the current folder
!python util/taskcluster.py --source tensorflow --artifact convert_graphdef_memmapped_format --branch r1.15 --target .
# Convert the exported graph; only the variable part of each path is quoted
# because EXPORT_FOLDER may contain spaces
!./convert_graphdef_memmapped_format --in_graph="$EXPORT_FOLDER"/output_graph.pb --out_graph="$EXPORT_FOLDER"/output_graph.pbmm

In [None]:
# Reference: print every command-line flag DeepSpeech.py accepts.
# The script path is relative, so enter the DeepSpeech checkout first.
%cd $H/ds/
!python DeepSpeech.py --helpfull