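# Create a Turkish Language Model for STT
## Step #2: Language Model Generation Phase
This notebook builds a KenLM language model for Coqui STT from a text corpus stored on Google Drive and saves the result back to Drive.

Based on: https://github.com/ftyers/commonvoice-docker/blob/main/lm.sh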

## Mount Google Drive

In [None]:
# Mount the private Google Drive at /content/drive; later cells read the corpus
# from it and move the results back (shutil is used in the "Save Results" cell).
import shutil

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Basic Setup

In [None]:
# Install Coqui STT 
!git clone --depth 1 --branch v1.0.0 https://github.com/coqui-ai/STT.git
!cd STT; pip install -U pip wheel setuptools; pip install .

Cloning into 'STT'...
remote: Enumerating objects: 2162, done.[K
remote: Counting objects: 100% (2162/2162), done.[K
remote: Compressing objects: 100% (1358/1358), done.[K
remote: Total 2162 (delta 847), reused 1695 (delta 709), pack-reused 0[K
Receiving objects: 100% (2162/2162), 12.49 MiB | 21.31 MiB/s, done.
Resolving deltas: 100% (847/847), done.
Note: checking out '27584037f879442fb45f9064dc772dbcb6ba6372'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

Collecting pip
  Downloading pip-21.3.1-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 7.2 MB/s 
Collecting setuptools
  Downloading setuptools-59.

In [None]:
# Get KenLM
!git clone https://github.com/kpu/kenlm.git && cd kenlm && mkdir build && cd build/ && cmake .. && make -j 4

Cloning into 'kenlm'...
remote: Enumerating objects: 14051, done.[K
remote: Counting objects: 100% (364/364), done.[K
remote: Compressing objects: 100% (296/296), done.[K
remote: Total 14051 (delta 109), reused 121 (delta 55), pack-reused 13687[K
Receiving objects: 100% (14051/14051), 5.76 MiB | 16.24 MiB/s, done.
Resolving deltas: 100% (7989/7989), done.
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Could NOT find Eigen3 (missing: Eigen3_DI

## Directory Structure

In [None]:
# Create the local output directory for the language-model files.
# No corpus data is copied here; the corpus is read directly from Google Drive.
!mkdir -p /content/data/tr/lm

## Generate Language Model

In [None]:
# Build the language model with Coqui STT's generate_lm.py, using the KenLM
# binaries compiled in /content/kenlm/build/bin/.
# Per the recorded log, the script lowercases the corpus, keeps the 500000 most
# frequent words (--top_k) and then creates the ARPA file; lm.binary and
# vocab-500000.txt end up in --output_dir.
# This is a long-running cell: the word-counting pass alone took about 31 minutes
# in the recorded run.
# NOTE(review): --input_txt points at a .tar.gz. generate_lm.py appears to treat
# *.gz input as gzip-compressed plain text - confirm the file is not a real tar
# archive, otherwise tar header blocks would be read as corpus text.
!python3 /content/STT/data/lm/generate_lm.py \
  --input_txt /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/corpus.tar.gz \
  --output_dir /content/data/tr/lm/ \
  --top_k 500000 \
  --discount_fallback \
  --kenlm_bins /content/kenlm/build/bin/ \
  --arpa_order 5 \
  --max_arpa_memory "85%" \
  --arpa_prune "0|0|1" \
  --binary_a_bits 255 \
  --binary_q_bits 8 \
  --binary_type trie


Converting to lowercase and counting word occurrences ...
| |      #                                     | 169045510 Elapsed Time: 0:30:57

Saving top 500000 words ...

Calculating word statistics ...
  Your text file has 686554726 words in total
  It has 2229955 unique words
  Your top-500000 words are 99.0424 percent of all words
  Your most common word "bir" occurred 18607643 times
  The least common word in your top-k is "arkadaşımlayken" with 17 times
  The first word with 18 occurrences is "isirika" at place 486792

Creating ARPA file ...
=== 1/5 Counting and sorting n-grams ===
Reading /content/data/tr/lm/lower.txt.gz
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
tcmalloc: large alloc 2038890496 bytes == 0x55b91bb62000 @  0x7fea04e931e7 0x55b91ac577a2 0x55b91abf251e 0x55b91abd12eb 0x55b91abbd066 0x7fea0302cbf7 0x55b91abbebaa
tcmalloc: large alloc 9514811392 bytes == 0x55b9953d2000 @  0x7fea04e931e7 0x55b91ac577a2 0x55b91ac4

## Save Results

In [None]:
# List the generated files; lm.binary and vocab-500000.txt are expected here.
!ls -al /content/data/tr/lm/

total 1228736
drwxr-xr-x 2 root root       4096 Dec  4 12:22 .
drwxr-xr-x 3 root root       4096 Dec  4 09:40 ..
-rw-r--r-- 1 root root 1252410121 Dec  4 12:22 lm.binary
-rw-r--r-- 1 root root    5796451 Dec  4 11:38 vocab-500000.txt


In [None]:
# Move the generated language model and vocabulary to Google Drive.
import os

LM_DRIVE_DIR = "/content/drive/MyDrive/cv-datasets/tr/language_model/lm"

# Make sure the target directory exists first. Without it, shutil.move() would
# treat the path as a file name: lm.binary would be written to a file called
# "lm" and the vocabulary would then be moved onto the same path, replacing it.
os.makedirs(LM_DRIVE_DIR, exist_ok=True)

# Moving into an existing directory keeps the original file names.
# shutil.move() raises shutil.Error if a file of that name is already present,
# so results from an earlier run are never overwritten silently.
shutil.move("/content/data/tr/lm/lm.binary", LM_DRIVE_DIR)
shutil.move("/content/data/tr/lm/vocab-500000.txt", LM_DRIVE_DIR)

tar: Removing leading `/' from member names


'/content/drive/MyDrive/cv-datasets/tr/lm.tar.gz'

In [None]:
# Flush pending writes to Google Drive and unmount the drive.
# Run this last, after the result files have been moved to Drive.
drive.flush_and_unmount()