In [1]:
import pandas as pd


In [None]:
df = pd.read_csv('books_df_slim.csv', sep=";")
df

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
1,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
2,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...
3,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...
4,0771074670,Nights Below Station Street,David Adams Richards,1988,Emblem Editions,http://images.amazon.com/images/P/0771074670.0...,http://images.amazon.com/images/P/0771074670.0...,http://images.amazon.com/images/P/0771074670.0...
...,...,...,...,...,...,...,...,...
40494,0394842359,Bambi Grows Up,Walt Disney,0,Random House~childrens,http://images.amazon.com/images/P/0394842359.0...,http://images.amazon.com/images/P/0394842359.0...,http://images.amazon.com/images/P/0394842359.0...
40495,0394758250,The Lady in the Lake (Vintage Crime/ Black Liz...,Raymond Chandler,1992,Vintage Books USA,http://images.amazon.com/images/P/0394758250.0...,http://images.amazon.com/images/P/0394758250.0...,http://images.amazon.com/images/P/0394758250.0...
40496,0373117345,"Tug Of Love (Harlequin Presents, No 1734)",Penny Jordan,1995,Harlequin,http://images.amazon.com/images/P/0373117345.0...,http://images.amazon.com/images/P/0373117345.0...,http://images.amazon.com/images/P/0373117345.0...
40497,0553241583,Anne of the Island (Anne of Green Gables Novel...,Lucy Maud Montgomery,1983,Bantam,http://images.amazon.com/images/P/0553241583.0...,http://images.amazon.com/images/P/0553241583.0...,http://images.amazon.com/images/P/0553241583.0...


In [3]:
import pandas as pd
import time
import re
import requests
import json
from tqdm import tqdm
import logging
import random
from urllib.parse import quote
import os
import pickle
from deep_translator import GoogleTranslator, MyMemoryTranslator

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class WorkingBookTranslator:
    def __init__(self):
        # Initialize translators
        self.google_translator = GoogleTranslator(source='en', target='uk')
        self.mymemory_translator = MyMemoryTranslator(source='en-US', target='uk-UA')
        
        # User agents for requests
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        
        # Simple transliteration mapping for Ukrainian
        self.transliteration_map = {
            'a': 'а', 'b': 'б', 'c': 'к', 'd': 'д', 'e': 'е', 'f': 'ф', 'g': 'г',
            'h': 'х', 'i': 'і', 'j': 'дж', 'k': 'к', 'l': 'л', 'm': 'м', 'n': 'н',
            'o': 'о', 'p': 'п', 'q': 'кв', 'r': 'р', 's': 'с', 't': 'т', 'u': 'у',
            'v': 'в', 'w': 'в', 'x': 'кс', 'y': 'й', 'z': 'з',
            'A': 'А', 'B': 'Б', 'C': 'К', 'D': 'Д', 'E': 'Е', 'F': 'Ф', 'G': 'Г',
            'H': 'Х', 'I': 'І', 'J': 'Дж', 'K': 'К', 'L': 'Л', 'M': 'М', 'N': 'Н',
            'O': 'О', 'P': 'П', 'Q': 'Кв', 'R': 'Р', 'S': 'С', 'T': 'Т', 'U': 'У',
            'V': 'В', 'W': 'В', 'X': 'Кс', 'Y': 'Й', 'Z': 'З'
        }
    
    def simple_transliterate(self, text):
        """Simple English to Ukrainian transliteration"""
        if not text or pd.isna(text):
            return text
        
        result = ""
        for char in str(text):
            result += self.transliteration_map.get(char, char)
        
        return result
    
    def is_likely_name(self, text):
        """Check if text is likely a person's name"""
        if not text or pd.isna(text):
            return False
        
        text_str = str(text).strip()
        
        # Common patterns for names
        name_patterns = [
            r'^[A-Z][a-z]+ [A-Z][a-z]+$',  # First Last
            r'^[A-Z]\. [A-Z][a-z]+$',      # F. Last
            r'^[A-Z][a-z]+, [A-Z][a-z]+$', # Last, First
            r'^[A-Z][a-z]+ [A-Z]\.$',      # First L.
        ]
        
        for pattern in name_patterns:
            if re.match(pattern, text_str):
                return True
        
        # Check if mostly capitalized words (likely names)
        words = text_str.split()
        if len(words) <= 4 and len(words) >= 1:
            capitalized_words = sum(1 for word in words if word and len(word) > 1 and word[0].isupper())
            return capitalized_words / len(words) > 0.5
        
        return False
    
    def translate_with_google_deep(self, text, max_retries=3):
        """Translate using deep-translator's GoogleTranslator"""
        if not text or pd.isna(text):
            return text
        
        text_str = str(text).strip()
        if not text_str:
            return text
        
        for attempt in range(max_retries):
            try:
                # Handle long texts by splitting
                if len(text_str) > 4500:  # Google has ~5000 char limit
                    # Split at sentence endings
                    sentences = re.split(r'(?<=[.!?])\s+', text_str)
                    translated_parts = []
                    
                    current_chunk = ""
                    for sentence in sentences:
                        if len(current_chunk + sentence) < 4500:
                            current_chunk += sentence + " "
                        else:
                            if current_chunk:
                                chunk_translation = self.google_translator.translate(current_chunk.strip())
                                translated_parts.append(chunk_translation)
                                time.sleep(0.5)
                            current_chunk = sentence + " "
                    
                    if current_chunk:
                        chunk_translation = self.google_translator.translate(current_chunk.strip())
                        translated_parts.append(chunk_translation)
                    
                    return " ".join(translated_parts)
                
                else:
                    result = self.google_translator.translate(text_str)
                    if result and result != text_str:
                        return result
                    
            except Exception as e:
                logger.warning(f"Google translate attempt {attempt + 1} failed for '{text_str[:50]}...': {e}")
                if attempt < max_retries - 1:
                    time.sleep(random.uniform(1, 3))
                    # Reinitialize translator on error
                    try:
                        self.google_translator = GoogleTranslator(source='en', target='uk')
                    except:
                        pass
        
        return text
    
    def translate_with_mymemory_deep(self, text, max_retries=2):
        """Translate using deep-translator's MyMemoryTranslator"""
        if not text or pd.isna(text):
            return text
        
        text_str = str(text).strip()
        if not text_str:
            return text
        
        for attempt in range(max_retries):
            try:
                # MyMemory has shorter limits
                if len(text_str) > 500:
                    # Split into smaller chunks
                    words = text_str.split()
                    chunks = []
                    current_chunk = []
                    current_length = 0
                    
                    for word in words:
                        if current_length + len(word) + 1 < 500:
                            current_chunk.append(word)
                            current_length += len(word) + 1
                        else:
                            if current_chunk:
                                chunks.append(' '.join(current_chunk))
                            current_chunk = [word]
                            current_length = len(word)
                    
                    if current_chunk:
                        chunks.append(' '.join(current_chunk))
                    
                    translated_chunks = []
                    for chunk in chunks:
                        try:
                            chunk_result = self.mymemory_translator.translate(chunk)
                            translated_chunks.append(chunk_result)
                            time.sleep(0.3)
                        except:
                            translated_chunks.append(chunk)  # Keep original on failure
                    
                    return ' '.join(translated_chunks)
                
                else:
                    result = self.mymemory_translator.translate(text_str)
                    if result and result != text_str:
                        return result
                    
            except Exception as e:
                logger.warning(f"MyMemory attempt {attempt + 1} failed for '{text_str[:50]}...': {e}")
                if attempt < max_retries - 1:
                    time.sleep(random.uniform(0.5, 1.5))
        
        return text
    
    def translate_with_libre(self, text, max_retries=2):
        """Translate using LibreTranslate API directly"""
        if not text or pd.isna(text):
            return text
        
        text_str = str(text).strip()
        if not text_str:
            return text
        
        for attempt in range(max_retries):
            try:
                url = "https://libretranslate.com/translate"
                data = {
                    'q': text_str,
                    'source': 'en',
                    'target': 'uk',
                    'format': 'text'
                }
                
                headers = {
                    'User-Agent': random.choice(self.user_agents),
                    'Content-Type': 'application/x-www-form-urlencoded'
                }
                
                response = requests.post(url, data=data, headers=headers, timeout=10)
                
                if response.status_code == 200:
                    result = response.json()
                    if 'translatedText' in result:
                        translated = result['translatedText']
                        if translated and translated != text_str:
                            return translated
                
            except Exception as e:
                logger.warning(f"LibreTranslate attempt {attempt + 1} failed for '{text_str[:50]}...': {e}")
                if attempt < max_retries - 1:
                    time.sleep(random.uniform(1, 2))
        
        return text
    
    def translate_text_smart(self, text, column_type="general"):
        """Smart translation with multiple fallback strategies"""
        if not text or pd.isna(text):
            return text
        
        text_str = str(text).strip()
        if not text_str:
            return text
        
        try:
            # For author names, be more conservative
            if column_type == "author" and self.is_likely_name(text_str):
                # Try Google first for names
                translated = self.translate_with_google_deep(text_str)
                if translated != text_str:
                    return translated
                
                # If no good translation, use transliteration
                return self.simple_transliterate(text_str)
            
            # For other content, try multiple services
            
            # Strategy 1: Google Translate
            translated = self.translate_with_google_deep(text_str)
            if translated != text_str and translated:
                return translated
            
            # Strategy 2: MyMemory
            translated = self.translate_with_mymemory_deep(text_str)
            if translated != text_str and translated:
                return translated
            
            # Strategy 3: LibreTranslate
            translated = self.translate_with_libre(text_str)
            if translated != text_str and translated:
                return translated
            
            # Final fallback: if it looks like a name, transliterate
            if self.is_likely_name(text_str):
                return self.simple_transliterate(text_str)
            
            # If all else fails, return original
            return text_str
            
        except Exception as e:
            logger.error(f"All translation failed for '{text_str[:50]}...': {e}")
            return text_str
    
    def save_progress(self, df, filename, current_column, current_index):
        """Save progress to file and create checkpoint"""
        try:
            # Save the dataframe
            df.to_csv(filename, index=False, encoding='utf-8')
            
            # Save checkpoint
            checkpoint = {
                'current_column': current_column,
                'current_index': current_index,
                'timestamp': time.time()
            }
            
            checkpoint_file = filename.replace('.csv', '_checkpoint.pkl')
            with open(checkpoint_file, 'wb') as f:
                pickle.dump(checkpoint, f)
            
            logger.info(f"Progress saved at column '{current_column}', row {current_index}")
            
        except Exception as e:
            logger.error(f"Failed to save progress: {e}")
    
    def load_checkpoint(self, filename):
        """Load checkpoint if it exists"""
        checkpoint_file = filename.replace('.csv', '_checkpoint.pkl')
        if os.path.exists(checkpoint_file):
            try:
                with open(checkpoint_file, 'rb') as f:
                    checkpoint = pickle.load(f)
                logger.info(f"Loaded checkpoint: column '{checkpoint['current_column']}', row {checkpoint['current_index']}")
                return checkpoint
            except Exception as e:
                logger.warning(f"Failed to load checkpoint: {e}")
        return None
    
    def translate_dataframe(self, df, columns_to_translate=['bookTitle'], 
                          output_filename='translated_books_ukrainian.csv', start_from_row=0, resume=True):
        """Translate dataframe with incremental saving"""
        
        logger.info(f"Starting translation of {len(df)} rows")
        logger.info(f"Columns to translate: {columns_to_translate}")
        
        # Create working copy
        df_work = df.copy()
        
        # Add Ukrainian columns
        for column in columns_to_translate:
            ukr_col = f"{column}_ukrainian"
            if ukr_col not in df_work.columns:
                df_work[ukr_col] = ""
        
        # Column types for different translation strategies
        column_types = {
            'bookAuthor': 'author',
            'bookTitle': 'title', 
            'publisher': 'publisher'
        }
        
        # Resume logic
        start_column_idx = 0
        start_row_idx = 0
        
        if resume:
            checkpoint = self.load_checkpoint(output_filename)
            if checkpoint:
                try:
                    start_column_idx = columns_to_translate.index(checkpoint['current_column'])
                    start_row_idx = checkpoint['current_index'] + 1  # Start from next row
                    logger.info(f"Resuming from column '{checkpoint['current_column']}', row {start_row_idx}")
                except (ValueError, KeyError):
                    logger.warning("Invalid checkpoint, starting fresh")
        
        # Process each column
        for col_idx, column in enumerate(columns_to_translate):
            if col_idx < start_column_idx:
                continue
                
            if column not in df_work.columns:
                logger.warning(f"Column '{column}' not found, skipping")
                continue
            
            logger.info(f"Processing column: {column}")
            column_type = column_types.get(column, "general")
            ukr_col = f"{column}_ukrainian"
            
            # Determine starting row
            row_start = start_row_idx if col_idx == start_column_idx else 0
            total_rows = len(df_work)
            
            # Process with progress bar
            with tqdm(total=total_rows - row_start, desc=f"Translating {column}") as pbar:
                for idx in range(row_start, total_rows):
                    # Skip if already translated
                    existing_translation = df_work.iloc[idx][ukr_col]
                    if pd.notna(existing_translation) and str(existing_translation).strip():
                        pbar.update(1)
                        continue
                    
                    # Get original text
                    original_text = df_work.iloc[idx][column]
                    
                    if pd.notna(original_text) and str(original_text).strip():
                        # Translate
                        translated_text = self.translate_text_smart(original_text, column_type)
                        df_work.iloc[idx, df_work.columns.get_loc(ukr_col)] = translated_text
                        
                        # Log sample translations
                        if idx % 500 == 0:
                            logger.info(f"Sample: '{original_text}' -> '{translated_text}'")
                    
                    pbar.update(1)
                    
                    # Save every 50 rows
                    if (idx + 1) % 50 == 0:
                        self.save_progress(df_work, output_filename, column, idx)
                    
                    # Respectful delay
                    time.sleep(random.uniform(0.3, 0.8))
            
            # Save after completing column
            self.save_progress(df_work, output_filename, column, total_rows - 1)
            start_row_idx = 0  # Reset for next column
        
        # Final save and cleanup
        df_work.to_csv(output_filename, index=False, encoding='utf-8')
        
        # Remove checkpoint file when done
        checkpoint_file = output_filename.replace('.csv', '_checkpoint.pkl')
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
            logger.info("Checkpoint file removed - translation complete!")
        
        return df_work

def main(df):
    """Main execution function"""
    
    # Initialize translator
    translator = WorkingBookTranslator()
    
    logger.info(f"Dataframe shape: {df.shape}")
    logger.info(f"Columns: {df.columns.tolist()}")
    
    # Define columns to translate
    columns_to_translate = ['bookTitle']
    
    # Check which columns exist
    available_columns = [col for col in columns_to_translate if col in df.columns]
    missing_columns = [col for col in columns_to_translate if col not in df.columns]
    
    if missing_columns:
        logger.warning(f"Missing columns: {missing_columns}")
    
    if not available_columns:
        logger.error("No translatable columns found!")
        return
    
    logger.info(f"Will translate columns: {available_columns}")
    
    try:
        # Start translation
        output_file = 'books_translated_ukrainian.csv'
        logger.info(f"Starting translation process...")
        logger.info(f"Output will be saved to: {output_file}")
        logger.info("You can interrupt (Ctrl+C) and resume later")
        
        df_translated = translator.translate_dataframe(
            df, 
            columns_to_translate=available_columns,
            output_filename=output_file,
            start_from_row=400,
            resume=True
        )
        
        logger.info("🎉 Translation completed successfully!")
        logger.info(f"📁 Results saved to: {output_file}")
        
        # Show samples
        print("\n📖 Sample translations:")
        for col in available_columns[:2]:  # Show first 2 columns
            ukr_col = f"{col}_ukrainian"
            if ukr_col in df_translated.columns:
                print(f"\n{col} -> {ukr_col}:")
                for i in range(min(3, len(df_translated))):
                    orig = df_translated.iloc[i][col]
                    trans = df_translated.iloc[i][ukr_col]
                    print(f"  '{orig}' -> '{trans}'")
        
    except KeyboardInterrupt:
        logger.info("🛑 Translation paused. Run again to resume from where you left off.")
    except Exception as e:
        logger.error(f"❌ Translation failed: {e}")
        raise

# Required packages:
"""
pip install deep-translator pandas tqdm requests
"""

# if __name__ == "__main__":
#     main()

'\npip install deep-translator pandas tqdm requests\n'

In [19]:
df1 = pd.read_csv('models_l/books_translated_ukrainian_start.csv', encoding='utf-8', on_bad_lines="skip" )

In [20]:
df1

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,bookTitle_ukrainian
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,Клара Каллан
1,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,Грип: Історія про пандемію Великого грипу 1918...
2,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,Кухня Божа дружина
3,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,http://images.amazon.com/images/P/0425176428.0...,"Що робити, якщо?: Найголовніші військові істор..."
4,0771074670,Nights Below Station Street,David Adams Richards,1988,Emblem Editions,http://images.amazon.com/images/P/0771074670.0...,http://images.amazon.com/images/P/0771074670.0...,http://images.amazon.com/images/P/0771074670.0...,Ночі нижче вулиці станції
...,...,...,...,...,...,...,...,...,...
40568,0394842359,Bambi Grows Up,Walt Disney,0,Random House~childrens,http://images.amazon.com/images/P/0394842359.0...,http://images.amazon.com/images/P/0394842359.0...,http://images.amazon.com/images/P/0394842359.0...,
40569,0394758250,The Lady in the Lake (Vintage Crime/ Black Liz...,Raymond Chandler,1992,Vintage Books USA,http://images.amazon.com/images/P/0394758250.0...,http://images.amazon.com/images/P/0394758250.0...,http://images.amazon.com/images/P/0394758250.0...,
40570,0373117345,"Tug Of Love (Harlequin Presents, No 1734)",Penny Jordan,1995,Harlequin,http://images.amazon.com/images/P/0373117345.0...,http://images.amazon.com/images/P/0373117345.0...,http://images.amazon.com/images/P/0373117345.0...,
40571,0553241583,Anne of the Island (Anne of Green Gables Novel...,Lucy Maud Montgomery,1983,Bantam,http://images.amazon.com/images/P/0553241583.0...,http://images.amazon.com/images/P/0553241583.0...,http://images.amazon.com/images/P/0553241583.0...,


In [4]:
df1 = pd.read_csv('models_l/books_translated_ukrainian_start.csv',  encoding='utf-8', on_bad_lines="skip")
df2 = pd.read_csv('models_l/books_translated_ukrainian_mid.csv', encoding='utf-8', on_bad_lines="skip")
df3 = pd.read_csv('models_l/books_translated_ukrainian_end.csv',  encoding='utf-8', on_bad_lines="skip")

In [None]:
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
combined_df_sorted = combined_df.sort_values(
    by="bookTitle_ukrainian", 
    na_position='last'  
)
merged_df = combined_df_sorted.drop_duplicates(
    subset=["ISBN"], 
    keep='first'       
)
merged_df = merged_df.reset_index(drop=True)

In [None]:
merged_df

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,bookTitle_ukrainian
0,0373245327,Showdown! (Seven Devils),Laurie Paige,2003,Harlequin,http://images.amazon.com/images/P/0373245327.0...,http://images.amazon.com/images/P/0373245327.0...,http://images.amazon.com/images/P/0373245327.0...,! (Сім дияволів)
1,0452279186,!Yo!,Julia Alvarez,1997,Plume Books,http://images.amazon.com/images/P/0452279186.0...,http://images.amazon.com/images/P/0452279186.0...,http://images.amazon.com/images/P/0452279186.0...,! Йо!
2,0792277295,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2001,National Geographic,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,"""Пекло місця для втрати корови"": американська ..."
3,1588204030,$oft Money: The True Power in Our Nation's Cap...,E. L. Burton,2000,1stBooks Library,http://images.amazon.com/images/P/1588204030.0...,http://images.amazon.com/images/P/1588204030.0...,http://images.amazon.com/images/P/1588204030.0...,$ oft гроші: справжня влада в столиці нашої кр...
4,0805054464,'G' Is for Grafton : The World of Kinsey Millhone,Natalie Hevener Kaufman,1997,Henry Holt Company,http://images.amazon.com/images/P/0805054464.0...,http://images.amazon.com/images/P/0805054464.0...,http://images.amazon.com/images/P/0805054464.0...,'G' - це для Графтона: Світ Кінсі Міллхоне
...,...,...,...,...,...,...,...,...,...
40513,1401088945,Ground Zero and Beyond,J. P. McCarthy,2003,Xlibris Corporation,http://images.amazon.com/images/P/1401088945.0...,http://images.amazon.com/images/P/1401088945.0...,http://images.amazon.com/images/P/1401088945.0...,Ґрунт нуля і далі
40514,055325278X,The Edgar Cayce Primer,Herbert Puryear,1985,Bantam Books,http://images.amazon.com/images/P/055325278X.0...,http://images.amazon.com/images/P/055325278X.0...,http://images.amazon.com/images/P/055325278X.0...,Ґрунтовка Едгара Кейса
40515,006105223X,Ground Zero Files (X-Files),Kevin J. Anderson,1995,Harpercollins Publisher,http://images.amazon.com/images/P/006105223X.0...,http://images.amazon.com/images/P/006105223X.0...,http://images.amazon.com/images/P/006105223X.0...,Ґрунтові нульові файли (X-файли)
40516,0340739762,Number9dream,David Mitchell,2001,Hodder Stoughton General Division,http://images.amazon.com/images/P/0340739762.0...,http://images.amazon.com/images/P/0340739762.0...,http://images.amazon.com/images/P/0340739762.0...,№9Dream


In [19]:
# Count NaN values
nan_count = merged_df['bookTitle_ukrainian'].isna().sum()

# Count empty strings (if applicable)
empty_str_count = (merged_df['bookTitle_ukrainian'] == '').sum()

# Total missing (NaN + empty strings)
total_missing = nan_count + empty_str_count

print(f"Rows with NaN 'bookTitle_ukrainian': {nan_count}")
print(f"Rows with empty string 'bookTitle_ukrainian': {empty_str_count}")
print(f"Total missing Ukrainian titles: {total_missing}")
print(f"Total rows in DataFrame: {len(merged_df)}")

Rows with NaN 'bookTitle_ukrainian': 0
Rows with empty string 'bookTitle_ukrainian': 0
Total missing Ukrainian titles: 0
Total rows in DataFrame: 40518


In [10]:
# Check for NaN or empty strings in 'bookTitle_ukrainian'
missing_mask = merged_df['bookTitle_ukrainian'].isna() | (merged_df['bookTitle_ukrainian'] == '')

# Create a new DataFrame with only missing titles
missing_titles_df = merged_df[missing_mask].copy()

# Display the result
print(f"Found {len(missing_titles_df)} rows with missing 'bookTitle_ukrainian':")
print(missing_titles_df)

Found 7791 rows with missing 'bookTitle_ukrainian':
             ISBN                    bookTitle             bookAuthor  \
32727  0553582747   From the Corner of His Eye            Dean Koontz   
32728  0060914068  Love, Medicine and Miracles  M.D. Bernie S. Siegel   
32729  0156047624           All the King's Men     Robert Penn Warren   
32730  0380715899   A Soldier of the Great War           Mark Helprin   
32731  0671623249                LONESOME DOVE         Larry McMurtry   
...           ...                          ...                    ...   
40513  0345313364             White Dragon #03         Anne McCaffrey   
40514  0440236843                  Texas Woman          JOAN JOHNSTON   
40515  0440234719                    The Texan          JOAN JOHNSTON   
40516  0440223776                The Bodyguard          Joan Johnston   
40517  067173976X             Counterfeit Lady          Jude Deveraux   

      yearOfPublication                 publisher  \
32727             

In [11]:
missing_titles_df

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,bookTitle_ukrainian
32727,0553582747,From the Corner of His Eye,Dean Koontz,2001,Bantam Books,http://images.amazon.com/images/P/0553582747.0...,http://images.amazon.com/images/P/0553582747.0...,http://images.amazon.com/0060914068,
32728,0060914068,"Love, Medicine and Miracles",M.D. Bernie S. Siegel,1988,HarperCollins Publishers,http://images.amazon.com/images/P/0060914068.0...,http://images.amazon.com/images/P/0060914068.0...,http://images.amazon.com/images/P/0060914068.0...,
32729,0156047624,All the King's Men,Robert Penn Warren,1982,Harvest Books,http://images.amazon.com/images/P/0156047624.0...,http://images.amazon.com/images/P/0156047624.0...,http://images.amazon.com/images/P/0156047624.0...,
32730,0380715899,A Soldier of the Great War,Mark Helprin,1992,Avon Books,http://images.amazon.com/images/P/0380715899.0...,http://images.amazon.com/images/P/0380715899.0...,http://images.amazon.com/images/P/0380715899.0...,
32731,0671623249,LONESOME DOVE,Larry McMurtry,1986,Pocket,http://images.amazon.com/images/P/0671623249.0...,http://images.amazon.com/images/P/0671623249.0...,http://images.amazon.com/images/P/0671623249.0...,
...,...,...,...,...,...,...,...,...,...
40513,0345313364,White Dragon #03,Anne McCaffrey,1983,Del Rey Books,http://images.amazon.com/images/P/0345313364.0...,http://images.amazon.com/images/P/0345313364.0...,http://images.amazon.com/images/P/0345313364.0...,
40514,0440236843,Texas Woman,JOAN JOHNSTON,2003,Dell,http://images.amazon.com/images/P/0440236843.0...,http://images.amazon.com/images/P/0440236843.0...,http://images.amazon.com/images/P/0440236843.0...,
40515,0440234719,The Texan,JOAN JOHNSTON,2001,Dell,http://images.amazon.com/images/P/0440234719.0...,http://images.amazon.com/images/P/0440234719.0...,http://images.amazon.com/images/P/0440234719.0...,
40516,0440223776,The Bodyguard,Joan Johnston,1998,Island,http://images.amazon.com/images/P/0440223776.0...,http://images.amazon.com/images/P/0440223776.0...,http://images.amazon.com/images/P/0440223776.0...,


In [12]:
if __name__ == "__main__":
    main(missing_titles_df)

2025-05-30 19:47:01,779 - INFO - Dataframe shape: (7791, 9)
2025-05-30 19:47:01,784 - INFO - Columns: ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL', 'bookTitle_ukrainian']
2025-05-30 19:47:01,786 - INFO - Will translate columns: ['bookTitle']
2025-05-30 19:47:01,788 - INFO - Starting translation process...
2025-05-30 19:47:01,792 - INFO - Output will be saved to: books_translated_ukrainian.csv
2025-05-30 19:47:01,793 - INFO - You can interrupt (Ctrl+C) and resume later
2025-05-30 19:47:01,794 - INFO - Starting translation of 7791 rows
2025-05-30 19:47:01,795 - INFO - Columns to translate: ['bookTitle']
2025-05-30 19:47:01,828 - INFO - Loaded checkpoint: column 'bookTitle', row 5499
2025-05-30 19:47:01,828 - INFO - Resuming from column 'bookTitle', row 5500
2025-05-30 19:47:01,831 - INFO - Processing column: bookTitle
Translating bookTitle:   0%|          | 0/2291 [00:00<?, ?it/s]2025-05-30 19:47:02,297 - INFO - Sample: 'Othe


📖 Sample translations:

bookTitle -> bookTitle_ukrainian:
  'From the Corner of His Eye' -> 'nan'
  'Love, Medicine and Miracles' -> 'nan'
  'All the King's Men' -> 'nan'


In [None]:
df1 = pd.read_csv('models_l/books_translated_ukrainian_start.csv',  encoding='utf-8', on_bad_lines="skip")
df2 = pd.read_csv('models_l/books_translated_ukrainian_mid.csv', encoding='utf-8', on_bad_lines="skip")
df3 = pd.read_csv('models_l/books_translated_ukrainian_end.csv',  encoding='utf-8', on_bad_lines="skip")
df4 = pd.read_csv('models_l/books_translated_ukrainian_add.csv',  encoding='utf-8', on_bad_lines="skip")
df5 = pd.read_csv('models_l/books_translated_ukrainian_add2.csv', encoding='utf-8', on_bad_lines="skip")

In [None]:
combined_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [None]:
combined_df_sorted = combined_df.sort_values(
    by="bookTitle_ukrainian", 
    na_position='last'  
)
merged_df = combined_df_sorted.drop_duplicates(
    subset=["ISBN"], 
    keep='first'       
)
merged_df = merged_df.reset_index(drop=True)

In [21]:
merged_df

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL,bookTitle_ukrainian
0,0373245327,Showdown! (Seven Devils),Laurie Paige,2003,Harlequin,http://images.amazon.com/images/P/0373245327.0...,http://images.amazon.com/images/P/0373245327.0...,http://images.amazon.com/images/P/0373245327.0...,! (Сім дияволів)
1,0452279186,!Yo!,Julia Alvarez,1997,Plume Books,http://images.amazon.com/images/P/0452279186.0...,http://images.amazon.com/images/P/0452279186.0...,http://images.amazon.com/images/P/0452279186.0...,! Йо!
2,0792277295,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2001,National Geographic,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,"""Пекло місця для втрати корови"": американська ..."
3,1588204030,$oft Money: The True Power in Our Nation's Cap...,E. L. Burton,2000,1stBooks Library,http://images.amazon.com/images/P/1588204030.0...,http://images.amazon.com/images/P/1588204030.0...,http://images.amazon.com/images/P/1588204030.0...,$ oft гроші: справжня влада в столиці нашої кр...
4,0805054464,'G' Is for Grafton : The World of Kinsey Millhone,Natalie Hevener Kaufman,1997,Henry Holt Company,http://images.amazon.com/images/P/0805054464.0...,http://images.amazon.com/images/P/0805054464.0...,http://images.amazon.com/images/P/0805054464.0...,'G' - це для Графтона: Світ Кінсі Міллхоне
...,...,...,...,...,...,...,...,...,...
40513,1401088945,Ground Zero and Beyond,J. P. McCarthy,2003,Xlibris Corporation,http://images.amazon.com/images/P/1401088945.0...,http://images.amazon.com/images/P/1401088945.0...,http://images.amazon.com/images/P/1401088945.0...,Ґрунт нуля і далі
40514,055325278X,The Edgar Cayce Primer,Herbert Puryear,1985,Bantam Books,http://images.amazon.com/images/P/055325278X.0...,http://images.amazon.com/images/P/055325278X.0...,http://images.amazon.com/images/P/055325278X.0...,Ґрунтовка Едгара Кейса
40515,006105223X,Ground Zero Files (X-Files),Kevin J. Anderson,1995,Harpercollins Publisher,http://images.amazon.com/images/P/006105223X.0...,http://images.amazon.com/images/P/006105223X.0...,http://images.amazon.com/images/P/006105223X.0...,Ґрунтові нульові файли (X-файли)
40516,0340739762,Number9dream,David Mitchell,2001,Hodder Stoughton General Division,http://images.amazon.com/images/P/0340739762.0...,http://images.amazon.com/images/P/0340739762.0...,http://images.amazon.com/images/P/0340739762.0...,№9Dream


In [22]:
# ISBNs in df but not in merged_df
missing_in_merged = df[~df['ISBN'].isin(merged_df['ISBN'])]['ISBN']

# ISBNs in merged_df but not in df
missing_in_df = merged_df[~merged_df['ISBN'].isin(df['ISBN'])]['ISBN']

In [23]:
print(f"Unique ISBNs in df: {df['ISBN'].nunique()}")
print(f"Unique ISBNs in merged_df: {merged_df['ISBN'].nunique()}")

Unique ISBNs in df: 40499
Unique ISBNs in merged_df: 40518


In [24]:
# In df
df_duplicates = df[df['ISBN'].duplicated(keep=False)]

# In merged_df
merged_duplicates = merged_df[merged_df['ISBN'].duplicated(keep=False)]

In [26]:
df_duplicates

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL


In [27]:
comparison = df.merge(
    merged_df,
    on='ISBN',
    how='outer',
    indicator=True
)

# Breakdown of matches
print(comparison['_merge'].value_counts())

_merge
both          40499
right_only       19
left_only         0
Name: count, dtype: int64


In [28]:
print(f"Rows only in df: {len(missing_in_merged)}")
print(f"Rows only in merged_df: {len(missing_in_df)}")
print(f"Common ISBNs: {len(df.merge(merged_df, on='ISBN'))}")

Rows only in df: 0
Rows only in merged_df: 19
Common ISBNs: 40499


In [29]:
# Keep only rows in merged_df that have ISBNs present in df
merged_df_filtered = merged_df[merged_df['ISBN'].isin(df['ISBN'])]

# Verify the count matches df (40499 rows)
print(len(merged_df_filtered))  # Should output 40499

40499


In [31]:
merged_df_filtered['bookTitle'] = merged_df_filtered['bookTitle_ukrainian']
final_df = merged_df_filtered.drop('bookTitle_ukrainian', axis=1)  # Remove old column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_filtered['bookTitle'] = merged_df_filtered['bookTitle_ukrainian']


In [33]:
final_df.to_csv('translated_books.csv')