Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgtied committed Aug 21, 2019
0 parents commit 331fdb8
Show file tree
Hide file tree
Showing 6 changed files with 1,345 additions and 0 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -0,0 +1,22 @@

# OpenTrans

Tools for open translation services

* based on MarianNMT
* trained on OPUS data



## Setup


* Start a marian-server with the model that you want to support

```
~/marian/build/marian-server -p 11111 -b2 -n1 -m /media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.bpe32k-bpe32k.enfi.transformer.model1.npz.best-perplexity.npz -v /media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.bpe32k-bpe32k.enfi.vocab.yml /media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.bpe32k-bpe32k.enfi.vocab.yml
```

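* Edit the translation server script (`opentrans-server.py`) to match your setup and start it as well

## Access from a client

Pipe text into `opentrans-client.py`; an optional leading `src-trg` token (e.g. `en-fi`, or `DL-fi` to auto-detect the source language) selects the language pair.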

43 changes: 43 additions & 0 deletions opentrans-client.py
@@ -0,0 +1,43 @@
#!/usr/bin/env python

from __future__ import print_function, unicode_literals, division

import sys
import time
import argparse

from websocket import create_connection


if __name__ == "__main__":
    # handle command-line options
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--batch-size", type=int, default=1)
    parser.add_argument("-p", "--port", type=int, default=8080)
    # The server address used to be hard-coded in the URL; the default keeps
    # the previous behaviour while allowing other servers to be used.
    parser.add_argument("--host", default="86.50.168.81")
    args = parser.parse_args()

    # open connection
    ws = create_connection(
        "ws://{}:{}/translate".format(args.host, args.port))

    def translate(text):
        """Send one batch to the server and print the reply."""
        ws.send(text)
        print(ws.recv().rstrip())

    try:
        count = 0
        batch = ""
        for line in sys.stdin:
            count += 1
            # Python 2 yields byte strings from stdin; decode them to text
            batch += line.decode('utf-8') if sys.version_info < (3, 0) else line
            if count == args.batch_size:
                # translate the batch
                translate(batch)
                count = 0
                batch = ""

        if count:
            # translate the remaining sentences
            translate(batch)
    finally:
        # close connection, also when sending/receiving failed
        ws.close()
114 changes: 114 additions & 0 deletions opentrans-server.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python3
#-*-python-*-
#
#

import codecs
import pycld2 as cld2
from mosestokenizer import *
from websocket import create_connection

from apply_bpe import BPE


#####################################################################
#### TODO: all of this should be handled with command-line options!

## port on which this translation server accepts websocket clients
port = 8080

## specify the server that runs marian-decoder
marian_server = 'localhost'
marian_port = 11111

## languages that can be translated from and translated into
srclangs = ['de','fr','sv','en']
trglangs = ['et','hu','fi']
## target language used when a request does not name a language pair
default_trg = 'fi'

## BPE model for pre-processing
BPEmodel = '/media/letsmt/nmt/models/de+fr+sv+en-et+hu+fi/opus-wmt.src.bpe32k-model'
## Open the codes file in a with-block so the handle is closed once the
## BPE object has been built instead of staying open for the server's lifetime.
## NOTE(review): assumes BPE() reads all merge codes inside its constructor
## and does not touch the file afterwards -- confirm against apply_bpe.py
with codecs.open(BPEmodel, encoding='utf-8') as BPEcodes:
    bpe = BPE(BPEcodes)

################################################################


## pre- and post-processing tools, each table keyed by language code:
## every source language gets a sentence splitter, a punctuation
## normalizer and a tokenizer; every target language gets a detokenizer
sentence_splitter, normalizer, tokenizer = {}, {}, {}

for lang in srclangs:
    sentence_splitter[lang] = MosesSentenceSplitter(lang)
    normalizer[lang] = MosesPunctuationNormalizer(lang)
    tokenizer[lang] = MosesTokenizer(lang)

detokenizer = {lang: MosesDetokenizer(lang) for lang in trglangs}


# open the websocket connection to the marian server; the Translate
# handler sends every segmented sentence through this single connection
ws = create_connection("ws://%s:%s/translate" % (marian_server, marian_port))


from SimpleWebSocketServer import SimpleWebSocketServer, WebSocket

class Translate(WebSocket):
    """Websocket handler that translates each incoming text message.

    Every message is normalized, split into sentences, tokenized and
    BPE-segmented, sent sentence by sentence over the module-level ``ws``
    connection, and the detokenized results are returned to the client as
    one space-joined string.
    """

    def handleMessage(self):
        """Translate ``self.data`` and send the result back to the client.

        The first whitespace-separated token may name the language pair as
        ``src-trg``; the source ``DL`` requests automatic detection of the
        source language. Without such a token the target language is
        ``default_trg`` and the source language is detected with cld2.
        If the source or target language is not in ``srclangs`` /
        ``trglangs`` an ``ERROR: ...`` message is sent instead of a
        translation.
        """
        tokens = self.data.split()
        if not tokens:
            # An empty or whitespace-only message (e.g. a blank input line)
            # used to raise IndexError on tokens.pop(0). Reply with an empty
            # translation so the client still receives one answer per request.
            self.sendMessage('')
            return

        fromLang = None
        toLang = default_trg

        ## check whether the first token specifies the language pair
        # NOTE(review): any first token containing exactly one '-' is taken
        # as a language pair, so a text that starts with a hyphenated word
        # is answered with an "unsupported language" error.
        langs = tokens[0].split('-')
        if len(langs) == 2:
            toLang = langs[1]
            if langs[0] != 'DL':
                fromLang = langs[0]
            tokens = tokens[1:]

        # Join on single spaces in both cases, so tagged and untagged
        # requests hand the same whitespace-normalized, single-line text
        # to language detection and pre-processing.
        text = ' '.join(tokens)

        # with several target languages the input is prefixed with a
        # '>>xx<< ' token that names the requested target language
        prefix = '>>' + toLang + '<< ' if len(trglangs) > 1 else ''

        if not fromLang:
            _, _, details = cld2.detect(text, bestEffort=True)
            fromLang = details[0][1]
            print("language detected = " + fromLang)

        if fromLang not in srclangs:
            print('unsupported source language ' + fromLang)
            self.sendMessage('ERROR: unsupported source language ' + fromLang)
            return

        if toLang not in trglangs:
            print('unsupported target language ' + toLang)
            self.sendMessage('ERROR: unsupported target language ' + toLang)
            return

        message = []
        for s in sentence_splitter[fromLang]([normalizer[fromLang](text)]):
            tokenized = ' '.join(tokenizer[fromLang](s))
            segmented = bpe.process_line(tokenized)
            ws.send(prefix + segmented)
            # strip the '@@ ' markers from the returned text
            # (presumably the BPE subword joiners -- confirm)
            translated = ws.recv().replace('@@ ', '')
            detokenized = detokenizer[toLang](translated.split())
            print('TRANSLATION: ' + detokenized)
            message.append(detokenized)
        self.sendMessage(' '.join(message))

    def handleConnected(self):
        """Log the address of a newly connected client."""
        print(self.address, 'connected')

    def handleClose(self):
        """Log the address of a client whose connection was closed."""
        print(self.address, 'closed')

# Serve translation requests until the process is interrupted or fails.
server = SimpleWebSocketServer('', port, Translate)
try:
    server.serveforever()
finally:
    # Runs on Ctrl-C and on any fatal error (the exception still propagates):
    # release the listening socket with its client connections, and the
    # connection to the marian server.
    server.close()
    ws.close()

0 comments on commit 331fdb8

Please sign in to comment.