# Segmentation

## Using BPE (Byte Pair Encoding)

In [None]:
# Install the BPE tooling and the tokenizer. `%pip` (rather than `! pip`)
# installs into the environment of the running kernel.
%pip install subword-nmt nltk

### Tokenization

In [None]:
import nltk

# word_tokenize needs the Punkt models. Recent NLTK releases (3.9+) look them
# up as 'punkt_tab', older ones as 'punkt'; fetch both so the cell runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize the corpus line by line and write one space-joined line per input line.
# Streaming keeps memory flat on the full corpus and splits only on real
# newlines; str.splitlines() also breaks on \x0b, \x0c, \x1c-\x1e, \x85,
# \u2028 and \u2029, which could change the number of lines.
# NOTE(review): word_tokenize treats the tab between the TSV columns as plain
# whitespace, so the column boundary is not preserved in the output - confirm
# that this is intended.
with open("data/europarl-v9.fi-en.tsv", "r", encoding="utf-8") as source:
    with open("data/tokenized-europarl-v9.fi-en", "w", encoding="utf-8") as target:
        for line in source:
            tokens = nltk.word_tokenize(line.rstrip("\n"))
            target.write(" ".join(tokens) + "\n")


### Full dataset

In [None]:
# Step 1: learn 10000 BPE merge operations from the corpus and store them as a codes file.
# NOTE(review): this cell reads the raw .tsv, whereas the 10000-line cell reads
# the tokenized file - confirm which input is intended here.
! subword-nmt learn-bpe --symbols 10000 < data/europarl-v9.fi-en.tsv > data/europarl-v9.fi-en.codes
# Step 2: apply the learned codes to the same corpus to produce the subword-segmented text.
! subword-nmt apply-bpe --codes data/europarl-v9.fi-en.codes < data/europarl-v9.fi-en.tsv > data/europarl-v9.fi-en.subword

### Only first 10000 lines

In [None]:
# Step 1: keep only the first 10000 lines of the tokenized corpus.
! head -n 10000 data/tokenized-europarl-v9.fi-en > data/tokenized-europarl-v9.fi-en-10000
# Step 2: learn 10000 BPE merge operations from that subset.
! subword-nmt learn-bpe --symbols 10000 < data/tokenized-europarl-v9.fi-en-10000 > data/tokenized-europarl-v9.fi-en-10000.codes
# Step 3: segment the subset with the codes learned from it.
! subword-nmt apply-bpe --codes data/tokenized-europarl-v9.fi-en-10000.codes < data/tokenized-europarl-v9.fi-en-10000 > data/tokenized-europarl-v9.fi-en-10000.subword


## Using Morfessor

In [None]:
# Install Morfessor and the tokenizer. `%pip` (rather than `! pip`)
# installs into the environment of the running kernel.
%pip install morfessor nltk

### Tokenization

In [None]:
import nltk

# word_tokenize needs the Punkt models. Recent NLTK releases (3.9+) look them
# up as 'punkt_tab', older ones as 'punkt'; fetch both so the cell runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize the Finnish corpus line by line and write one space-joined line per
# input line. Streaming keeps memory flat on the full corpus and splits only on
# real newlines; str.splitlines() also breaks on \x0b, \x0c, \x1c-\x1e, \x85,
# \u2028 and \u2029, which could change the number of lines.
# NOTE(review): word_tokenize defaults to language="english"; consider
# language="finnish" for this corpus - confirm before changing, since it
# alters the output.
with open("data/europarl-v9.fi", "r", encoding="utf-8") as source:
    with open("data/tokenized-europarl-v9.fi", "w", encoding="utf-8") as target:
        for line in source:
            tokens = nltk.word_tokenize(line.rstrip("\n"))
            target.write(" ".join(tokens) + "\n")

### Full dataset

In [None]:
# Train a Morfessor model on the tokenized Finnish corpus and save it to data/model.bin.
! morfessor -t data/tokenized-europarl-v9.fi -s data/model.bin
# Load the saved model and segment the corpus read from stdin, keeping the
# input's line breaks and joining the morphs of each word with "@@ ".
# An earlier default-format segmentation pass wrote to this same output path and
# was overwritten straight away, so only the formatted pass is run.
# NOTE(review): "{analysis}" is meant to reach morfessor literally; IPython
# appears to leave it untouched only while no Python variable named `analysis`
# exists - confirm.
! morfessor -l data/model.bin -T - --output-newlines --output-format "{analysis}  " --output-format-separator "@@ " < data/tokenized-europarl-v9.fi > data/tokenized-europarl-v9.fi.segmented

### Only first 10000 lines

In [None]:
# Keep only the first 10000 lines of the tokenized Finnish corpus.
! head -n 10000 data/tokenized-europarl-v9.fi > data/tokenized-europarl-v9.fi-10000
# Train a Morfessor model on the subset and save it.
# NOTE(review): this writes to data/model.bin, the same path the full-dataset
# cell saves to, so the full-dataset model is replaced - confirm this is intended.
! morfessor -t data/tokenized-europarl-v9.fi-10000 -s data/model.bin
# Load the saved model and segment the subset read from stdin, keeping the
# input's line breaks and joining the morphs of each word with "@@ ".
# An earlier default-format segmentation pass wrote to this same output path and
# was overwritten straight away, so only the formatted pass is run.
! morfessor -l data/model.bin -T - --output-newlines --output-format "{analysis}  " --output-format-separator "@@ " < data/tokenized-europarl-v9.fi-10000 > data/tokenized-europarl-v9.fi-10000.segmented