# Create a Turkish Language Model for STT (Speech-to-Text)
## Step #1: Corpus Preparation Phase
Based on: https://github.com/ftyers/commonvoice-docker/blob/main/lm.sh

## Mount Google Drive

In [None]:
# Mount the private Google Drive under /content/drive.
import shutil  # needed further down to move the finished archive onto the drive

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Basic Setup

## Get Data From Drive

In [None]:
# Common Voice Utils
!pip uninstall commonvoice-utils -y
!pip install git+https://github.com/ftyers/commonvoice-utils.git

Collecting git+https://github.com/ftyers/commonvoice-utils.git
  Cloning https://github.com/ftyers/commonvoice-utils.git to /tmp/pip-req-build-7fvt5280
  Running command git clone -q https://github.com/ftyers/commonvoice-utils.git /tmp/pip-req-build-7fvt5280
Building wheels for collected packages: commonvoice-utils
  Building wheel for commonvoice-utils (setup.py) ... [?25l[?25hdone
  Created wheel for commonvoice-utils: filename=commonvoice_utils-0.2.12-py3-none-any.whl size=169852 sha256=9f8d06c7e0ac17eabf568935a0a45ebf5ea9824422b04bc4a872ad5d4ee8166e
  Stored in directory: /tmp/pip-ephem-wheel-cache-csawsdmu/wheels/07/a4/99/bc54e7a34fd9ad46d6138ca5957074dc965cdf5b0ba3487d91
Successfully built commonvoice-utils
Installing collected packages: commonvoice-utils
Successfully installed commonvoice-utils-0.2.12


In [None]:
# Create directory and List Corpus data on the drive
!mkdir -p /content/data/tr/lm
!ls /content/drive/MyDrive/cv-datasets/tr/language_model/corpus

alphabet.txt	     list.txt		   ted2013.txt.gz
dev.csv		     opensubtitles.txt.gz  ted2020.txt.gz
globalvoices.txt.gz  tatoeba.txt.gz	   train.csv


## Process / combine the data

In [None]:
# Get from CV data
!covo text tr /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/dev.csv /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/train.csv > /content/data/tr/lm/corpus.txt
!echo -ne 'Transcripts:';
!wc /content/data/tr/lm/corpus.txt

Transcripts:  6855  36710 278741 /content/data/tr/lm/corpus.txt


In [None]:
# DISABLED: the following takes too long to download from the internet sources,
# so the cells below read pre-downloaded copies from Google Drive instead.
# Get OPUS URLS
#!covo opus tr | grep -e Tatoeba -e OpenSubtitles -e TED -e GlobalVoices | cut -f2 > /content/data/tr/lm/urls.txt
#!cat /content/data/tr/lm/urls.txt
# Append OPUS
#!cat /content/data/tr/lm/urls.txt | xargs wget -O - | zcat | covo norm tr >> /content/data/tr/lm/corpus.txt

In [None]:
# Append Global Voices
!zcat /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/globalvoices.txt.gz | covo norm tr >> /content/data/tr/lm/corpus.txt
!echo -ne 'Global Voices:';
!wc /content/data/tr/lm/corpus.txt

6118/8796 (69.55%)


In [None]:
# Append Tatoeba
!zcat /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/tatoeba.txt.gz | covo norm tr >> /content/data/tr/lm/corpus.txt
!echo -ne 'Tatoeba:';
!wc /content/data/tr/lm/corpus.txt

730933/739384 (98.86%)
Tatobea: 168553241  681296126 4967667729 /content/data/tr/lm/corpus.txt


In [None]:
# Append TED2013
!zcat /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/ted2013.txt.gz | covo norm tr >> /content/data/tr/lm/corpus.txt
!echo -ne 'TED2013:';
!wc /content/data/tr/lm/corpus.txt

121044/137077 (88.30%)
TED2013: 168674285  682703383 4978666303 /content/data/tr/lm/corpus.txt


In [None]:
# Append TED2020
!zcat /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/ted2020.txt.gz | covo norm tr >> /content/data/tr/lm/corpus.txt
!echo -ne 'TED2020:';
!wc /content/data/tr/lm/corpus.txt

371225/428661 (86.60%)
TED2020: 169045510  686554723 5009117653 /content/data/tr/lm/corpus.txt


In [None]:
# Append OpenSubTitles
!zcat /content/drive/MyDrive/cv-datasets/tr/language_model/corpus/opensubtitles.txt.gz | covo norm tr >> /content/data/tr/lm/corpus.txt
!echo -ne 'OpenSubTitles:';
!wc /content/data/tr/lm/corpus.txt

167809335/173215557 (96.88%)
OpenSubTitles: 167822308  677689999 4941309186 /content/data/tr/lm/corpus.txt


In [None]:
# Show result
!echo -ne 'FINAL:';
!wc /content/data/tr/lm/corpus.txt

FINAL: 169045510  686554723 5009117653 /content/data/tr/lm/corpus.txt


In [None]:
!ls -al /content/data/tr/lm/corpus.txt

-rw-r--r-- 1 root root 5009117653 Dec  4 06:49 /content/data/tr/lm/corpus.txt


## Compress file and put back to drive

In [None]:
# Compress
!tar czf /content/data/corpus.tar.gz /content/data/tr/lm/corpus.txt
!ls -al /content/data/corpus.tar.gz

tar: Removing leading `/' from member names
-rw-r--r-- 1 root root 1344320482 Dec  4 06:59 /content/data/corpus.tar.gz


In [None]:
# Move (not copy) the archive from the local disk to Google Drive.
# The call is the last expression, so the cell displays the destination path it returns.
archive_local = "/content/data/corpus.tar.gz"
archive_on_drive = "/content/drive/MyDrive/cv-datasets/tr/language_model/corpus/corpus.tar.gz"
shutil.move(archive_local, archive_on_drive)

'/content/drive/MyDrive/cv-datasets/tr/corpus/corpus.tar.gz'

In [None]:
# Flush pending writes to Google Drive and unmount it.
# Run this last: after it, paths under /content/drive are presumably no longer
# available until the drive is mounted again.
drive.flush_and_unmount()