<a href="https://colab.research.google.com/github/Hariom329/PraquetDatasetManipulation/blob/main/ParquetDatasetTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2024.8.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==3.1.0a0

In [None]:
import os
import pandas as pd
import configparser
from googletrans import Translator

# Set the working directory so the relative directory names below resolve
# under /content.
os.chdir('/content')

# Configuration settings
source_dir_path = "source"          # store_sft_dataset writes the source CSV here
translated_dir_path = "translated"  # translate_in_dialects writes per-chunk CSVs here
config = configparser.ConfigParser(default_section="DATASETS")

# Create directories if they don't exist
os.makedirs(source_dir_path, exist_ok=True)
os.makedirs(translated_dir_path, exist_ok=True)


train_data_path = "/content/sample_data/train-00000-of-00001-4fe2df04669d1669.parquet"

print(f"Attempting to load file from: {train_data_path}")

# Fail early with a clear message when the parquet file is missing.
if not os.path.exists(train_data_path):
    print("File not found, please check the file path.")
    raise FileNotFoundError(f"Cannot find file: {train_data_path}")

print("File found, proceeding with loading...")
df_1 = pd.read_parquet(train_data_path)
print("File loaded successfully.")
print(df_1.head())

# Names of the dataset columns that hold the prompt and the response.
input_field = "instruction"
output_field = "output"

# Normalise the column names to the "question"/"answer" pair that the
# translation loop reads. errors="raise" makes a wrong field name fail here
# with a KeyError instead of being silently ignored and only surfacing later
# inside the translation loop.
df_1 = df_1.rename(
    columns={
        input_field: "question",
        output_field: "answer",
    },
    errors="raise",
)

translated_dialect = "hi"            # passed to the translator as `dest`
checkpoint_file = "checkpoint.txt"   # stores the last completed chunk end row

def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """
    Persist the dataset as a CSV in the source directory, then translate it.

    The CSV is named ``<dataset>_<split_type>.csv`` (any "/" in the dataset
    name is replaced by "_") and is written only when no file with that name
    exists yet. Translation into ``translated_dialect`` is always started
    afterwards, whether or not the CSV was written.

    Args:
        name_of_dataset: Dataset identifier used to build the file name.
        data_frame: pandas DataFrame to store and translate.
        split_type: Split label appended to the file name (e.g. "train").
    """
    safe_name = name_of_dataset.replace("/", "_")
    csv_path = os.path.join(source_dir_path, safe_name) + f"_{split_type}.csv"

    already_stored = os.path.isfile(csv_path)
    if not already_stored:
        print("Opening file.....", csv_path)
        data_frame.to_csv(csv_path, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", csv_path)

    # Hand the same frame over for translation into the configured dialect.
    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)

# Function to handle translation into different dialects
def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi", chunk_size=10000):
    """
    Translate the "question" and "answer" columns chunk by chunk into CSV files.

    Rows are processed in chunks of ``chunk_size``. Each chunk is written to
    ``<translated_dir_path>/<dataset><split_type>_<dialect>_translated_<start>-<end>.csv``
    and, once a chunk is done, its end row is stored in ``checkpoint_file`` so
    a later call resumes from there.

    Args:
        name_of_dataset: Dataset identifier; "/" is replaced by "_" in file names.
        data_frame: pandas DataFrame with "question" and "answer" columns.
        split_type: Split label embedded in the output file name.
        dialect_name: Target language code passed to the translator as ``dest``.
        chunk_size: Number of rows per output file; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.

    NOTE(review): the checkpoint file is shared by every dataset/split/dialect,
    so a checkpoint left by one run also shifts the start row of the next one.
    The output file name has no separator between the dataset name and
    ``split_type``; it is kept as-is so files from earlier runs are still
    recognised.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    total_rows = len(data_frame)

    # Load checkpoint; an empty or corrupt file falls back to row 0 instead of
    # crashing. Restarting is safe because existing chunk files are skipped.
    start_row = 0
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r") as f:
            raw_checkpoint = f.read().strip()
        try:
            start_row = max(int(raw_checkpoint), 0)
        except ValueError:
            print(f"Ignoring unreadable checkpoint {raw_checkpoint!r}; starting from row 0")

    print(f"Total rows to translate: {total_rows}")
    print(f"Starting from row: {start_row}")

    print("Translating now....")
    translator = Translator()
    file_prefix = os.path.join(translated_dir_path, name_of_dataset.replace("/", "_"))

    for chunk_start in range(start_row, total_rows, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_rows)
        translated_file_name = (
            f"{file_prefix}{split_type}_{dialect_name}_translated_{chunk_start}-{chunk_end}.csv"
        )

        if os.path.isfile(translated_file_name):
            # The chunk was already written and would not be overwritten, so
            # skip it before spending translation requests on it.
            print("Skipping existing file....", translated_file_name)
        else:
            translated_rows = []
            for index in range(chunk_start, chunk_end):
                val = data_frame.iloc[index]
                print(f"Translating row {index + 1} of {total_rows}")
                translated_rows.append({
                    "question": translator.translate(val["question"], dest=dialect_name).text,
                    "answer": translator.translate(val["answer"], dest=dialect_name).text,
                })

            # Build the frame in one step from the collected records.
            df = pd.DataFrame(translated_rows, columns=["question", "answer"])
            print("Opening file.....", translated_file_name)
            df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
            print("Finished writing file....", translated_file_name)

        # Record progress only after the chunk is on disk (or known to be).
        with open(checkpoint_file, "w") as f:
            f.write(str(chunk_end))

# Processing and storage of the dataset: writes the source CSV for the
# "OpenPlatypus" train split if it does not exist yet, then runs the
# translation into `translated_dialect`.
store_sft_dataset("OpenPlatypus", df_1, "train")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Translating row 19931 of 24926
Translating row 19932 of 24926
Translating row 19933 of 24926
Translating row 19934 of 24926
Translating row 19935 of 24926
Translating row 19936 of 24926
Translating row 19937 of 24926
Translating row 19938 of 24926
Translating row 19939 of 24926
Translating row 19940 of 24926
Translating row 19941 of 24926
Translating row 19942 of 24926
Translating row 19943 of 24926
Translating row 19944 of 24926
Translating row 19945 of 24926
Translating row 19946 of 24926
Translating row 19947 of 24926
Translating row 19948 of 24926
Translating row 19949 of 24926
Translating row 19950 of 24926
Translating row 19951 of 24926
Translating row 19952 of 24926
Translating row 19953 of 24926
Translating row 19954 of 24926
Translating row 19955 of 24926
Translating row 19956 of 24926
Translating row 19957 of 24926
Translating row 19958 of 24926
Translating row 19959 of 24926
Translating row 19960 of 24926
Trans