In [1]:
import os
import shutil
import tempfile
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed, parallel_backend
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [2]:
# Load the corpus to be translated from the parquet file next to the notebook.
data = pd.read_parquet("full_qanun_text.parquet")

# Peek at both ends of the frame, then print its schema and memory footprint.
display(data.head(), data.tail())
data.info()

# Count the distinct values of 'text' to gauge how much of the corpus is duplicated.
display(data['text'].nunique())

Unnamed: 0,text
0,azərbaycan respublikası mülki məcəlləsinin təs...
1,azərbaycan respublikasının mülki məcəlləsi təs...
2,azərbaycan respublikasının mülki məcəlləsi qüv...
3,azərbaycan respublikasının mülki məcəlləsinə d...
4,azərbaycan ssr mülki mülki prosessual məcəlləl...


Unnamed: 0,text
1501968,maddədə istehsal məişət tullantıları sözləri t...
1501969,maddədə istehsal məişət tullantıları sözləri t...
1501970,maddədə sənaye məişət tullantılarının sözləri ...
1501971,qiymətli metallar qiymətli daşlar azərbaycan r...
1501972,yaşıllıqların mühafizəsi azərbaycan respublika...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1501973 entries, 0 to 1501972
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1501973 non-null  object
dtypes: object(1)
memory usage: 11.5+ MB


970817

In [3]:
# Spot-check one already-written batch file (batch 55) to compare source and translation.
batch_55_path = "E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/Translation in Batches/batch_55_with_translation.parquet"
batch = pd.read_parquet(batch_55_path)
batch

Unnamed: 0,text,translated_text
0,azərbaycan respublikası prezidentinin fərmanı,Decree of the President of the Republic of Aze...
1,ölkəmizdə demokratik islahatların davam etdiri...,Continuation of democratic reforms in our coun...
2,azadlıqdan məhrumetmə cəzasına məhkum olunmuş ...,The following women sentenced to imprisonment ...
3,cavadova sarıtel misir qızı anadan olmuş azərb...,Javadova Saritel was sentenced by January-date...
4,klimakova tatyana ivanovna anadan olmuş azərba...,Klimakova Tatiana Ivanovna was born the citize...
5,tağıyeva tamilla qulam qızı anadan olmuş azərb...,Tagiyeva Tamilla Gülam's daughter was born the...
6,ii cinayəti yetkinlik yaşına çatmamışdan əvvəl...,The following persons sentenced to imprisonmen...
7,jitkov vitali vladimiroviç anadan olmuş azərba...,Jitkov Vitali Vladimirovich was born the citiz...
8,ocaqov asəf məbud oğlu anadan olmuş azərbaycan...,"The citizen of the Republic of Azerbaijan, who..."
9,yeqoyev ramin ismayıl oğlu anadan olmuş azərba...,The citizen of the Republic of Azerbaijan was ...


In [6]:
# Positional row to inspect: show the full source text, then its full translation.
inx = 2

display(
    batch['text'].iloc[inx],
    batch['translated_text'].iloc[inx],
)

'azadlıqdan məhrumetmə cəzasına məhkum olunmuş aşağıdakı qadınlar cəzanın çəkilməmiş hissəsindən azad edilsinlər'

'The following women sentenced to imprisonment shall be exempt from the unbelieved part of the sentence'

In [3]:
def translate_with_selenium(text, src_language='az', target_language='en', timeout=10):
    """Translate ``text`` by driving the Google Translate page in headless Chrome.

    A new browser is started on every call and is always shut down before the
    function returns.

    Args:
        text: Source string typed into the page's "Source text" box.
        src_language: Language code placed in the URL's ``sl`` parameter.
        target_language: Language code placed in the URL's ``tl`` parameter.
        timeout: Seconds to wait for each page element before giving up.

    Returns:
        The text of the translation element, or ``""`` if loading the page,
        locating an element or typing the text fails (the error is printed).

    Raises:
        Any exception raised by ``webdriver.Chrome`` when the browser itself
        cannot be started.
    """
    # Setup WebDriver options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(options=chrome_options)

    translated_text = ""
    try:
        # Navigation happens inside the try block so that a failed page load
        # still reaches `finally` and the browser process is not leaked.
        driver.get(f"https://translate.google.com/?sl={src_language}&tl={target_language}&op=translate")

        # Wait for the input text box to load
        input_box = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'textarea[aria-label="Source text"]'))
        )

        # Clear the text field and type the text to translate
        input_box.clear()
        input_box.send_keys(text)

        # Wait for the translation element to appear in the page.
        # NOTE(review): the jsname value is page-internal markup and may change — confirm it still matches.
        output_box = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'span[jsname="W297wb"]'))
        )

        translated_text = output_box.text

    except Exception as e:
        # Best effort: report the failure and fall through to return "".
        print(f"An error occurred: {e}")
    finally:
        driver.quit()

    return translated_text

In [None]:
# Define batch size
batch_size = 10
# Ceiling division, so a row count that is an exact multiple of batch_size
# does not produce a trailing empty batch (and an empty parquet file).
num_batches = (len(data) + batch_size - 1) // batch_size

# Ensure the output directory exists
output_dir = 'E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/Translation in Batches'
os.makedirs(output_dir, exist_ok=True)

# Loop over batches
for i in range(num_batches):
    batch_output_path = os.path.join(output_dir, f'batch_{i}_with_translation.parquet')

    # A batch file on disk marks the batch as done, which makes this loop resumable.
    if os.path.exists(batch_output_path):
        print(f'Batch {i} already processed. Skipping...')
        continue

    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(data))

    # Work on a copy so adding the new column does not touch `data`.
    batch = data.iloc[start_idx:end_idx].copy()

    # Translate row by row, collecting results in row order.
    translated_texts = []

    for text in batch['text']:
        translated_text = translate_with_selenium(text)
        translated_texts.append(translated_text)
        time.sleep(3)  # Delay of 3 seconds between each translation to avoid rate limiting

    batch['translated_text'] = translated_texts

    # NOTE(review): a failed translation comes back as "" and is saved as-is; because the
    # file then exists, this loop will never retry it.
    batch.to_parquet(batch_output_path, index=False)
    print(f'Saved batch {i}')

Batch 0 already processed. Skipping...
Batch 1 already processed. Skipping...
Batch 2 already processed. Skipping...
Batch 3 already processed. Skipping...
Batch 4 already processed. Skipping...
Batch 5 already processed. Skipping...
Batch 6 already processed. Skipping...
Batch 7 already processed. Skipping...
Batch 8 already processed. Skipping...
Batch 9 already processed. Skipping...
Batch 10 already processed. Skipping...
Batch 11 already processed. Skipping...
Batch 12 already processed. Skipping...
Batch 13 already processed. Skipping...
Batch 14 already processed. Skipping...
Batch 15 already processed. Skipping...
Batch 16 already processed. Skipping...
Batch 17 already processed. Skipping...
Batch 18 already processed. Skipping...
Batch 19 already processed. Skipping...
Batch 20 already processed. Skipping...
Batch 21 already processed. Skipping...
Batch 22 already processed. Skipping...
Batch 23 already processed. Skipping...
Batch 24 already processed. Skipping...
Batch 25 a

- 1_501_973 rows / 10 rows in a batch = 150_197.3 batches.
- 150_197 batches * 3 min. / 10 rows in a batch = 45_059.1 minutes.
- 45_059.1 minutes / 60 minutes per hour ≈ 751 hours
- 751 hours / 24 hours per day ≈ 31.3 days

## Concurrent Processing

In [None]:
# Function to translate text using Selenium and Google Translate
def translate_with_selenium(text, src_language='az', target_language='en', timeout=15):
    """Translate ``text`` by driving the Google Translate page in headless Chrome.

    A new browser is started on every call and is always shut down before the
    function returns.

    Args:
        text: Source string typed into the page's "Source text" box.
        src_language: Language code placed in the URL's ``sl`` parameter.
        target_language: Language code placed in the URL's ``tl`` parameter.
        timeout: Seconds to wait for each page element before giving up.

    Returns:
        The text of the translation element, or ``""`` if loading the page,
        locating an element or typing the text fails (the error is printed).

    Raises:
        Any exception raised by ``webdriver.Chrome`` when the browser itself
        cannot be started.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(options=chrome_options)

    translated_text = ""
    try:
        # Navigation happens inside the try block so that a failed page load
        # still reaches `finally` and the browser process is not leaked.
        driver.get(f"https://translate.google.com/?sl={src_language}&tl={target_language}&op=translate")

        input_box = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'textarea[aria-label="Source text"]'))
        )

        input_box.clear()
        input_box.send_keys(text)

        # NOTE(review): the jsname value is page-internal markup and may change — confirm it still matches.
        output_box = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'span[jsname="W297wb"]'))
        )

        translated_text = output_box.text

    except Exception as e:
        # Best effort: report the failure (flushed so it shows up promptly) and return "".
        print(f"An error occurred: {e}", flush=True)
    finally:
        driver.quit()

    return translated_text

# Function to process a batch and save it to a file
def process_batch(batch, batch_output_path):
    """Translate every row of ``batch`` and write the result to parquet.

    Args:
        batch: DataFrame with a 'text' column; a 'translated_text' column is
            added to it in place.
        batch_output_path: Destination path of the parquet file.

    Returns:
        ``batch_output_path``, unchanged.
    """
    def _translate_then_pause(source_text):
        # One translation followed by the 3-second pause used to avoid rate limiting.
        result = translate_with_selenium(source_text)
        time.sleep(3)
        return result

    batch['translated_text'] = [_translate_then_pause(source_text) for source_text in batch['text']]
    batch.to_parquet(batch_output_path, index=False)
    print(f'Saved {batch_output_path}', flush=True)
    return batch_output_path

# Define batch size
batch_size = 10
# Ceiling division, so a row count that is an exact multiple of batch_size
# does not produce a trailing empty batch (and an empty parquet file).
num_batches = (len(data) + batch_size - 1) // batch_size

# Ensure the output directory exists
output_dir = 'E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/Translation in Batches'
os.makedirs(output_dir, exist_ok=True)

# Function to process and save each batch in parallel
def process_and_save_batch(i):
    """Translate batch number ``i`` unless its output file already exists.

    Reads the notebook-level ``data``, ``batch_size`` and ``output_dir``.

    Args:
        i: Zero-based batch number; selects rows ``i * batch_size`` up to (not
            including) ``(i + 1) * batch_size``, clipped to ``len(data)``.

    Returns:
        Path of the batch's parquet file, whether it was just written or was
        already on disk.
    """
    destination = os.path.join(output_dir, f'batch_{i}_with_translation.parquet')

    if not os.path.exists(destination):
        first_row = i * batch_size
        stop_row = min(first_row + batch_size, len(data))
        # Hand over a copy so the new column is not added to `data` itself.
        rows = data.iloc[first_row:stop_row].copy()
        return process_batch(rows, destination)

    # The file is already on disk, so this batch is treated as done.
    print(f'Batch {i} already processed. Skipping...', flush=True)
    return destination

# Run every batch index through two loky workers; a failure is reported rather than raised.
try:
    runner = Parallel(n_jobs=2, backend="loky", verbose=10)
    runner(delayed(process_and_save_batch)(i) for i in range(num_batches))
except Exception as e:
    print(f"An error occurred during parallel processing: {e}", flush=True)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    4.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    7.1s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   12.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   16.9s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   23.7s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   30.4s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   38.9s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   47.9s


In [None]:
# Function to translate text using Selenium and Google Translate
def translate_with_selenium(text, src_language='az', target_language='en', timeout=30):
    """Translate ``text`` by driving the Google Translate page in headless Chrome.

    A new browser is started on every call and is always shut down before the
    function returns.

    Args:
        text: Source string typed into the page's "Source text" box.
        src_language: Language code placed in the URL's ``sl`` parameter.
        target_language: Language code placed in the URL's ``tl`` parameter.
        timeout: Seconds to wait for each page element before giving up.

    Returns:
        The text of the translation element, or ``""`` if loading the page,
        locating an element or typing the text fails (the error is printed).

    Raises:
        Any exception raised by ``webdriver.Chrome`` when the browser itself
        cannot be started.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(options=chrome_options)

    translated_text = ""
    try:
        # Navigation happens inside the try block so that a failed page load
        # still reaches `finally` and the browser process is not leaked.
        driver.get(f"https://translate.google.com/?sl={src_language}&tl={target_language}&op=translate")

        input_box = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'textarea[aria-label="Source text"]'))
        )

        input_box.clear()
        input_box.send_keys(text)

        # NOTE(review): the jsname value is page-internal markup and may change — confirm it still matches.
        output_box = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'span[jsname="W297wb"]'))
        )

        translated_text = output_box.text

    except Exception as e:
        # Best effort: report the failure (flushed so it shows up promptly) and return "".
        print(f"An error occurred: {e}", flush=True)
    finally:
        driver.quit()

    return translated_text

# Function to process a batch and save it to a file
def process_batch(batch, batch_output_path):
    """Add translations to ``batch`` and persist it as a parquet file.

    Args:
        batch: DataFrame with a 'text' column; it gains a 'translated_text'
            column in place.
        batch_output_path: Where the parquet file is written.

    Returns:
        The same ``batch_output_path`` that was passed in.
    """
    def _translations():
        # Yield one translation per row, pausing 3 seconds after each to avoid rate limiting.
        for source_text in batch['text']:
            yield translate_with_selenium(source_text)
            time.sleep(3)

    batch['translated_text'] = list(_translations())
    batch.to_parquet(batch_output_path, index=False)
    print(f'Saved {batch_output_path}', flush=True)
    return batch_output_path

# Load data
data = pd.read_parquet("full_qanun_text.parquet")

# Define batch size
batch_size = 10
# Ceiling division, so a row count that is an exact multiple of batch_size
# does not produce a trailing empty batch (and an empty parquet file).
num_batches = (len(data) + batch_size - 1) // batch_size

# Ensure the output directory exists
output_dir = 'E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/Translation in Batches'
os.makedirs(output_dir, exist_ok=True)

# Function to process and save each batch in parallel
def process_and_save_batch(i):
    """Produce the parquet file for batch ``i``, reusing it if it is already on disk.

    Depends on the notebook-level ``data``, ``batch_size`` and ``output_dir``.

    Args:
        i: Zero-based batch number. The batch covers positional rows
            ``[i * batch_size, (i + 1) * batch_size)``, clipped to ``len(data)``.

    Returns:
        Path of the batch's parquet file.
    """
    out_path = os.path.join(output_dir, f'batch_{i}_with_translation.parquet')
    already_done = os.path.exists(out_path)

    if already_done:
        print(f'Batch {i} already processed. Skipping...', flush=True)
    else:
        lower = i * batch_size
        upper = min(lower + batch_size, len(data))
        # Slice a copy so the translated column is not written back into `data`.
        out_path = process_batch(data.iloc[lower:upper].copy(), out_path)

    return out_path

# Create a temporary directory for joblib
temp_dir = tempfile.mkdtemp()

# Use joblib to process batches in parallel.
# `parallel_backend` must be imported from joblib alongside `Parallel` and `delayed`;
# without that import this cell fails with a NameError that the broad `except`
# below reports, and no batch is processed.
try:
    with parallel_backend('loky', inner_max_num_threads=1):
        Parallel(n_jobs=2, temp_folder=temp_dir, verbose=5)(delayed(process_and_save_batch)(i) for i in range(num_batches))
except Exception as e:
    print(f"An error occurred during parallel processing: {e}", flush=True)
finally:
    # Clean up temporary directory. Cleanup is best effort: a failure to delete
    # the folder should not replace the outcome of the run with a new exception.
    shutil.rmtree(temp_dir, ignore_errors=True)

## Check and Retry for Empty Translations

In [None]:
def translate_with_selenium(text, src_language='az', target_language='en', retries=3):
    """Translate ``text`` via the Google Translate page, retrying on errors.

    Each attempt starts its own headless Chrome instance and shuts it down
    again, whether the attempt succeeded or failed.

    Args:
        text: Source string typed into the page's "Source text" box.
        src_language: Language code placed in the URL's ``sl`` parameter.
        target_language: Language code placed in the URL's ``tl`` parameter.
        retries: Maximum number of attempts; with ``retries <= 0`` no attempt
            is made.

    Returns:
        The text of the translation element from the first attempt that
        completes without an exception, or ``""`` if every attempt fails.
    """
    # Setup WebDriver options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode

    translated_text = ""
    for attempt in range(1, retries + 1):
        # Reset per attempt: if Chrome fails to start there is no driver to quit,
        # and a driver from an earlier attempt must not be quit a second time.
        driver = None
        try:
            driver = webdriver.Chrome(options=chrome_options)
            driver.get(f"https://translate.google.com/?sl={src_language}&tl={target_language}&op=translate")

            # Wait for the input text box to load
            input_box = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'textarea[aria-label="Source text"]'))
            )

            # Clear the text field and type the text to translate
            input_box.clear()
            input_box.send_keys(text)

            # Wait for the translation element to appear in the page
            output_box = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'span[jsname="W297wb"]'))
            )

            translated_text = output_box.text
            break  # Exit the loop if translation is successful

        except Exception as e:
            print(f"An error occurred on attempt {attempt}: {e}")
            if attempt == retries:
                print("Max retries reached. Skipping translation for this text.")
        finally:
            # Runs on success (before the break takes effect) and on failure alike.
            if driver is not None:
                driver.quit()

        # Only reached after a failed attempt; skip the pause when no retry follows.
        if attempt < retries:
            time.sleep(2)  # Wait before retrying

    return translated_text

# Define batch size
batch_size = 10
# Ceiling division, so a row count that is an exact multiple of batch_size
# does not produce a trailing empty batch (and an empty parquet file).
num_batches = (len(data) + batch_size - 1) // batch_size

# Ensure the output directory exists
output_dir = 'E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/Translation in Batches'
os.makedirs(output_dir, exist_ok=True)

# Loop over batches
for i in range(num_batches):
    batch_output_path = os.path.join(output_dir, f'batch_{i}_with_translation.parquet')

    # A batch file on disk marks the batch as done, which makes this loop resumable.
    if os.path.exists(batch_output_path):
        print(f'Batch {i} already processed. Skipping...')
        continue

    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(data))

    # Work on a copy so adding the new column does not touch `data`.
    batch = data.iloc[start_idx:end_idx].copy()

    # Translate row by row, collecting results in row order.
    translated_texts = []

    for text in batch['text']:
        translated_text = translate_with_selenium(text)
        translated_texts.append(translated_text)
        time.sleep(5)  # Delay of 5 seconds between each translation to avoid rate limiting

    batch['translated_text'] = translated_texts

    # Give rows whose translation is missing or empty one more pass before saving.
    empty_translations = batch[batch['translated_text'].isna() | (batch['translated_text'] == '')]
    if not empty_translations.empty:
        print(f"Retrying empty translations in batch {i}...")
        for index, row in empty_translations.iterrows():
            translated_text = translate_with_selenium(row['text'])
            # `index` is the row label carried over from `data`, hence label-based `.at`.
            batch.at[index, 'translated_text'] = translated_text
            time.sleep(5)  # Delay of 5 seconds between each translation to avoid rate limiting

    # NOTE(review): rows that are still empty after this pass are saved as-is and,
    # because the file then exists, are not revisited by later runs.
    batch.to_parquet(batch_output_path, index=False)
    print(f'Saved batch {i}')