Skip to content

Commit

Permalink
Merge pull request #9 from NeiH4207/translate-docx-file
Browse files Browse the repository at this point in the history
update translator for docx file
  • Loading branch information
NeiH4207 committed Jun 3, 2024
2 parents f09db1d + a797cc2 commit 8913a96
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 31 deletions.
45 changes: 22 additions & 23 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
@@ -1,34 +1,33 @@
name: Python Package using Conda
name: transipy

on: [push]
on: [push, pull_request]

jobs:
build-linux:
build:
runs-on: ubuntu-latest
strategy:
max-parallel: 5

steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v3
- name: Checkout repository
uses: actions/checkout@v2

- name: Set up Conda environment
uses: conda-incubator/setup-miniconda@v2
with:
python-version: '3.10'
- name: Add conda to system path
run: |
# $CONDA is an environment variable pointing to the root of the miniconda directory
echo $CONDA/bin >> $GITHUB_PATH
- name: Install dependencies
environment-file: environment.yml
python-version: 3.9
auto-update-conda: true

- name: Set Conda solver to classic and disable plugins
run: |
conda env update --file environment.yml --name base
- name: Lint with flake8
conda config --set solver classic
CONDA_NO_PLUGINS=true conda env update --file environment.yml --name base
- name: Install additional pip dependencies
run: |
conda install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
source activate base
pip install -r requirements.txt
- name: Run tests
run: |
conda install pytest
source activate base
pytest
8 changes: 8 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: transipy
channels:
- defaults

dependencies:
- python=3.9
- pytest
- pip
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
loguru==0.7.2
pandas==2.2.2
bottleneck==1.3.6
googletrans==4.0.0rc1
openpyxl==3.1.2
requests==2.31.0
h2==3.2.0
httpx==0.13.3
httpx==0.13.3
python-docx==1.1.2
17 changes: 13 additions & 4 deletions transipy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import os
from loguru import logger
import pandas as pd
from transipy.trans_helper import translate_df, translate_excel
from transipy.trans_helper import translate_df, translate_docx, translate_excel
from transipy.utils import *
from googletrans import LANGUAGES

def get_parser():
parser = argparse.ArgumentParser(
description='Translate text in a file (.csv/.txt) from source language to target language.',
description='Translate text in a file (.csv/.txt/.tsv/.docx/.xlsx) from source language to target language.',
)
parser.add_argument('-f', '--file-path', type=str, required=True, help='The source file path')
parser.add_argument('-l', '--sep', type=str, default=None, help='The separator of the file [comma, tab, space,...]')
Expand Down Expand Up @@ -60,7 +60,7 @@ def main():
file_extension = get_file_extension(input_file)

if not is_supported_file(input_file):
logger.error("Unsupported file format. Please use .csv/.txt or .xlsx files.")
logger.error("Unsupported file format. Please use .csv/.tsv/.txt/.docx or .xlsx files.")
return

if not args.output_file:
Expand Down Expand Up @@ -133,6 +133,15 @@ def main():
)
with open(output_file, 'w') as f:
f.write('\n'.join(df['text'].tolist()))
elif is_docx(file_extension):
translate_docx(
file_path=input_file,
src=source_language,
dest=target_language,
chunk_size=chunk_size,
dictionary=dictionary,
output_file=output_file
)
else:
logger.error("Unsupported file format. Please use .csv/.tsv/.txt or .xlsx files.")
return
return
33 changes: 31 additions & 2 deletions transipy/trans_helper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from time import sleep
import time
import docx
import requests
import ast
import requests
Expand Down Expand Up @@ -147,4 +147,33 @@ def translate_excel(dfs, sheets=None, src='en', dest='vi', chunk_size=4, diction
with pd.ExcelWriter(output_file) as writer:
for sheet_name in sheets:
dfs[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
return dfs
return dfs

def translate_docx(file_path, src='en', dest='vi', chunk_size=4, dictionary=None, output_file=None):
    """Translate the paragraph text of a .docx file into a new document.

    Only the text of each paragraph exposed by ``doc.paragraphs`` is read.
    The result is a freshly created document holding one paragraph per
    translated text, so nothing else from the source document is copied.

    Args:
        file_path: Path of the .docx file to read.
        src: Source language, forwarded to ``translate_df``.
        dest: Target language, forwarded to ``translate_df``.
        chunk_size: Forwarded unchanged to ``translate_df``.
        dictionary: Forwarded to ``translate_df``. ``None`` (the default) is
            treated as an empty dict so no dict object is shared between calls.
        output_file: Path to save the translated document to. When falsy,
            the document is built but not saved.

    Returns:
        The newly built ``docx.Document`` containing the translated paragraphs.
    """
    # Avoid a mutable default argument: create a fresh dict per call.
    if dictionary is None:
        dictionary = {}

    doc = docx.Document(file_path)
    translated_doc = docx.Document()

    # One row per source paragraph, in document order.
    paragraph_texts = [paragraph.text for paragraph in doc.paragraphs]
    df = pd.DataFrame(paragraph_texts, columns=['text'])

    # NOTE(review): output_file is also forwarded to translate_df; whether that
    # helper writes to it is not visible here. The save below targets the same
    # path -- confirm the helper's write (if any) is intended.
    df = translate_df(
        df=df,
        columns=['text'],
        src=src,
        dest=dest,
        chunk_size=chunk_size,
        dictionary=dictionary,
        output_file=output_file
    )

    for text in df['text']:
        translated_doc.add_paragraph(text)

    if output_file:
        translated_doc.save(output_file)

    return translated_doc
5 changes: 4 additions & 1 deletion transipy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ def is_excel(file_path):
def is_text(file_path):
    """Return True when the extension of *file_path* equals '.txt'."""
    extension = get_file_extension(file_path)
    return extension == '.txt'

def is_docx(file_path):
    """Return True when the extension of *file_path* equals '.docx'."""
    extension = get_file_extension(file_path)
    return extension == '.docx'

def get_separetor(sep):
if sep == 'comma':
return ','
Expand All @@ -34,7 +37,7 @@ def get_all_excel_sheet_names(file_path, skip_list: list = []):
return [sheet for sheet in sheets if sheet not in skip_list]

def is_supported_file(file_path):
    """Return True when *file_path* matches one of the handled file formats.

    Delegates to the per-format predicates for CSV, TSV, Excel, plain text
    and DOCX; the checks short-circuit on the first match.
    """
    return (
        is_csv(file_path)
        or is_tsv(file_path)
        or is_excel(file_path)
        or is_text(file_path)
        or is_docx(file_path)
    )


def split_df_by_group(df, chunks):
Expand Down

0 comments on commit 8913a96

Please sign in to comment.