Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 331fdb8
Showing
6 changed files
with
1,345 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
|
||
# OpenTrans | ||
|
||
Tools for open translation services | ||
|
||
* based on MarianNMT | ||
* trained on OPUS data | ||
|
||
|
||
|
||
## Setup | ||
|
||
|
||
* Start a marian-server with the model that you want to support | ||
|
||
``` | ||
~/marian/build/marian-server -p 11111 -b2 -n1 -m /media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.bpe32k-bpe32k.enfi.transformer.model1.npz.best-perplexity.npz -v /media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.bpe32k-bpe32k.enfi.vocab.yml /media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.bpe32k-bpe32k.enfi.vocab.yml | ||
``` | ||
|
||
* Edit the settings at the top of the translation server script (port, marian-server host and port, supported languages, BPE model) and start it as well | ||
# access from a client | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function, unicode_literals, division | ||
|
||
import sys | ||
import time | ||
import argparse | ||
|
||
from websocket import create_connection | ||
|
||
|
||
if __name__ == "__main__":
    # Command-line client: reads sentences from stdin, sends them to the
    # translation server in batches of --batch-size lines and prints each
    # reply to stdout.
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--batch-size", type=int, default=1)
    parser.add_argument("-p", "--port", type=int, default=8080)
    # The server address used to be hard-coded; the default keeps the old
    # behaviour while allowing other servers to be addressed.
    parser.add_argument("-H", "--host", default="86.50.168.81")
    args = parser.parse_args()

    # open connection
    ws = create_connection(
        "ws://{}:{}/translate".format(args.host, args.port))

    def _translate_batch(lines):
        """Send the collected lines as one message and print the reply."""
        ws.send("".join(lines))
        print(ws.recv().rstrip())

    try:
        pending = []
        for line in sys.stdin:
            # Python 2 yields byte strings from stdin; decode them once here.
            if sys.version_info < (3, 0):
                line = line.decode('utf-8')
            pending.append(line)
            if len(pending) == args.batch_size:
                # translate the batch
                _translate_batch(pending)
                pending = []

        if pending:
            # translate the remaining sentences
            _translate_batch(pending)
    finally:
        # close connection, also when sending or receiving failed
        ws.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#!/usr/bin/env python3 | ||
#-*-python-*- | ||
# | ||
# | ||
|
||
import codecs | ||
import pycld2 as cld2 | ||
from mosestokenizer import * | ||
from websocket import create_connection | ||
|
||
from apply_bpe import BPE | ||
|
||
|
||
##################################################################### | ||
#### TODO: all of this should be handled with command-line options! | ||
|
||
## port on which this translation server listens for websocket clients
port = 8080

## specify the server that runs marian-decoder
marian_server = 'localhost'
marian_port = 11111

## languages that can be translated from and translated into
srclangs = ['de','fr','sv','en']
trglangs = ['et','hu','fi']
default_trg = 'fi'

## BPE model for pre-processing
BPEmodel = '/media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.src.bpe32k-model'
# Close the codes file once the BPE object has been built instead of
# keeping the handle open for the lifetime of the server.
# NOTE(review): assumes BPE() reads all merge operations in its constructor
# (as subword-nmt's apply_bpe does) -- confirm for the bundled apply_bpe.
with codecs.open(BPEmodel, encoding='utf-8') as BPEcodes:
    bpe = BPE(BPEcodes)
|
||
################################################################ | ||
|
||
|
||
## pre- and post-processing tools | ||
## pre-processing tools: one instance per supported source language
sentence_splitter = {lang: MosesSentenceSplitter(lang) for lang in srclangs}
normalizer = {lang: MosesPunctuationNormalizer(lang) for lang in srclangs}
tokenizer = {lang: MosesTokenizer(lang) for lang in srclangs}

## post-processing tools: one detokenizer per supported target language
detokenizer = {lang: MosesDetokenizer(lang) for lang in trglangs}

# open connection to the marian-server that does the actual translation
ws = create_connection(
    "ws://{}:{}/translate".format(marian_server, marian_port)
)
|
||
|
||
from SimpleWebSocketServer import SimpleWebSocketServer, WebSocket | ||
|
||
class Translate(WebSocket):
    """WebSocket handler that translates incoming text via the Marian server.

    A message may start with a ``SRC-TRG`` token (for example ``de-fi``).
    ``DL-TRG`` selects the target language but leaves the source language to
    automatic detection.  Without such a token the source language is
    detected with cld2 and ``default_trg`` is used as the target language.
    """

    def handleMessage(self):
        """Translate ``self.data`` sentence by sentence and send back the result.

        Replies with a message starting with ``ERROR:`` when the message is
        empty or when the source or target language is not supported.
        """
        fromLang = None
        toLang = default_trg
        prefix = ''

        tokens = self.data.split()
        if not tokens:
            # An empty or whitespace-only message would make pop(0) below
            # raise IndexError; answer with an explicit error instead.
            print('empty message')
            self.sendMessage('ERROR: empty message')
            return

        ## check whether the first token specifies the language pair
        langs = tokens.pop(0).split('-')
        if len(langs) == 2:
            toLang = langs[1]
            if langs[0] != 'DL':
                fromLang = langs[0]
            # strip the language-pair token from the text to translate
            self.data = ' '.join(tokens)

        # With several target languages the input is prefixed with a
        # '>>xx<<' token naming the requested one.
        # NOTE(review): presumably the multilingual model expects this
        # token -- confirm against the model's training setup.
        if len(trglangs) > 1:
            prefix = '>>' + toLang + '<< '

        if not fromLang:
            isReliable, textBytesFound, details = cld2.detect(self.data, bestEffort=True)
            fromLang = details[0][1]
            print("language detected = " + fromLang)

        if fromLang not in srclangs:
            print('unsupported source language ' + fromLang)
            self.sendMessage('ERROR: unsupported source language ' + fromLang)
            return

        if toLang not in trglangs:
            print('unsupported target language ' + toLang)
            self.sendMessage('ERROR: unsupported target language ' + toLang)
            return

        message = []
        for s in sentence_splitter[fromLang]([normalizer[fromLang](self.data)]):
            tokenized = ' '.join(tokenizer[fromLang](s))
            segmented = bpe.process_line(tokenized)
            ws.send(prefix + segmented)
            # merge the BPE pieces again by dropping the '@@ ' joiners
            translated = ws.recv().replace('@@ ','')
            detokenized = detokenizer[toLang](translated.split())
            print('TRANSLATION: ' + detokenized)
            message.append(detokenized)
        self.sendMessage(' '.join(message))

    def handleConnected(self):
        """Log the address of a newly connected client."""
        print(self.address, 'connected')

    def handleClose(self):
        """Log the address of a client whose connection was closed."""
        print(self.address, 'closed')
|
||
# Serve translation requests until the process is stopped.  On the way out
# (including Ctrl-C or an unexpected error) release the listening socket and
# the websocket connection to the marian-server.
server = SimpleWebSocketServer('', port, Translate)
try:
    server.serveforever()
finally:
    server.close()
    ws.close()
Oops, something went wrong.