In [1]:
!pip install -U deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [2]:
# Data handling and manipulation
import os,textwrap
import shutil
import json
import io as input_output
import pandas as pd
import numpy as np
import torch
import random
import datasets
from deep_translator import GoogleTranslator
from PIL import Image, UnidentifiedImageError
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split

In [3]:
is_apply_enhancement = False
is_translate = True
str_filter = None #'chest x-ray'
is_verify_image = False
data_dir = f'/kaggle/input/flickr30k/'
IMG_DIR = f'{data_dir}flickr30k_images'

In [4]:
# Pertama, pisahkan menjadi train dan test
df = pd.read_csv('/kaggle/input/flickr30k/captions.txt')
df = df[['image_name', 'comment']]
df.columns = ['Images', 'Caption']
df['Caption'] = df['Caption'].apply(lambda x: x.rstrip('.') if x.endswith('.') else x)
df_train, df_temp = train_test_split(df, test_size=0.2, random_state=42)

# Kemudian, pisahkan temp_df menjadi validation dan testing
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
print(len(df_train),len(df_val),len(df_test))
df

127132 15891 15892


Unnamed: 0,Images,Caption
0,1000092795.jpg,Two young guys with shaggy hair look at their ...
1,1000092795.jpg,Two young White males are outside near many b...
2,1000092795.jpg,Two men in green shirts are standing in a yard
3,1000092795.jpg,A man in a blue shirt standing in a garden
4,1000092795.jpg,Two friends enjoy time spent together
...,...,...
158910,998845445.jpg,A man in shorts and a Hawaiian shirt leans ove...
158911,998845445.jpg,A young man hanging over the side of a boat w...
158912,998845445.jpg,A man is leaning off of the side of a blue and...
158913,998845445.jpg,A man riding a small boat in a harbor with fo...


In [5]:
def verify_image(image_path):
    try:
        Image.open(image_path).verify()
        return True
    except UnidentifiedImageError:
        print(f"Error: The file at {image_path} is not a valid image.")
        return False
    except FileNotFoundError:
        print(f"Error: The file at {image_path} was not found.")
        return False
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False

# Fungsi untuk menerjemahkan caption
def translate_caption(caption):
    try:
        translated = GoogleTranslator(source='auto', target='id').translate(caption)
        return translated
    except Exception as e:
        print(f"Error translating caption: {e}")
        return caption

# Fungsi untuk menerjemahkan caption secara paralel
def translate_caption_parallel(captions):
    with ThreadPoolExecutor(max_workers=4) as executor:  # Menyesuaikan jumlah worker sesuai kebutuhan
        translated = list(executor.map(translate_caption, captions))
    return translated

def preprocess_dataset(df,dataset_name,is_translate=False, str_filter=None):
#     df = df.sample(n=5, random_state=42).reset_index(drop=True)
    # df = df.sample(frac=0.0003, random_state=42).reset_index(drop=True)
    if str_filter is None:
        df = df.copy()
    else:
        mask = df['Caption'].str.contains(str_filter, case=False)
        df = df[mask].copy()
    if is_translate:
        df['Caption'] = translate_caption_parallel(df['Caption'])
        # df['Caption'] = df['Caption'].apply(translate_caption)
    df['Path'] = df['Images'].apply(lambda x: f"{IMG_DIR}/{x}")
    if is_verify_image:
        df = df[df['Path'].apply(lambda x: os.path.exists(x) and verify_image(x))]
    df.to_csv(f'{dataset_name}_data.csv', index=False)
    return df

df_train = preprocess_dataset(df_train, 'train', is_translate, str_filter)
df_val = preprocess_dataset(df_val, 'val', is_translate, str_filter)
df_test = preprocess_dataset(df_test, 'test', is_translate, str_filter)

Error translating caption: Response ended prematurely


In [6]:
print(len(df_train),len(df_val),len(df_test))
df_train.head()

127132 15891 15892


Unnamed: 0,Images,Caption,Path
0,3773310720.jpg,Seorang pria dengan warna coklat membangun rakit,/kaggle/input/flickr30k/flickr30k_images/37733...
1,4524418308.jpg,Sekelompok orang berkumpul di sekitar meja bun...,/kaggle/input/flickr30k/flickr30k_images/45244...
2,2904997007.jpg,Seorang anak laki -laki kecil melompat dari te...,/kaggle/input/flickr30k/flickr30k_images/29049...
3,4604410267.jpg,Seorang wanita di ponselnya berjalan di taman,/kaggle/input/flickr30k/flickr30k_images/46044...
4,2219959872.jpg,Seorang pria dan dua anak berdiri di depan ged...,/kaggle/input/flickr30k/flickr30k_images/22199...


In [7]:
df_val.head()

Unnamed: 0,Images,Caption,Path
0,3413571342.jpg,Seorang pria dalam batang berwarna -warni menj...,/kaggle/input/flickr30k/flickr30k_images/34135...
1,2214403949.jpg,Seorang wanita bermain dengan seorang anak di ...,/kaggle/input/flickr30k/flickr30k_images/22144...
2,3173215794.jpg,Orang -orang menunggu dalam barisan restoran y...,/kaggle/input/flickr30k/flickr30k_images/31732...
3,3621095412.jpg,Enam orang dewasa muda satu memegang seekor an...,/kaggle/input/flickr30k/flickr30k_images/36210...
4,4439092536.jpg,Seseorang yang mengenakan jaket biru dan topi ...,/kaggle/input/flickr30k/flickr30k_images/44390...


In [8]:
df_test.head()

Unnamed: 0,Images,Caption,Path
0,1989609.jpg,Seorang pria dengan kumis bekerja di eskalator...,/kaggle/input/flickr30k/flickr30k_images/19896...
1,2192131110.jpg,Gadis ini bersenang -senang di ayunan!,/kaggle/input/flickr30k/flickr30k_images/21921...
2,47871819.jpg,Gadis -gadis bermain sepak bola,/kaggle/input/flickr30k/flickr30k_images/47871...
3,4558172302.jpg,Seorang pria dan wanita yang berdiri di atas g...,/kaggle/input/flickr30k/flickr30k_images/45581...
4,2716744948.jpg,Seorang pria tanpa kemeja duduk di atas batu d...,/kaggle/input/flickr30k/flickr30k_images/27167...
