Important:

There's a conflict. `googletrans==4.0.0-rc1` was the version used, but it requires `httpx==0.13.3`, while our supabase db requires `httpx==0.27.2`.

In [None]:
!pip install googletrans==4.0.0-rc1
!pip install datasets==3.0.1

In [None]:
import pandas as pd
from googletrans import Translator
import time
from datasets import load_dataset
import os

`crypto_dataset` is the `train.csv` file from the following Kaggle dataset:

https://www.kaggle.com/datasets/amalrajsingh/cryptocurrency-blockchain-and-stock-market-qa?resource=download&select=train.csv

In [None]:
# `__file__` is not defined inside a Jupyter kernel, so referencing it directly
# raises NameError. Use it when it exists (e.g. when this code is run as a
# script); otherwise fall back to the current working directory, which is
# typically the notebook's own folder.
_base_dir = os.path.dirname(globals()['__file__']) if '__file__' in globals() else os.getcwd()

# Absolute path to ../fine_tuning_data/crypto_dataset relative to the base directory.
# NOTE(review): the path has no file extension -- confirm the file on disk is
# named exactly 'crypto_dataset'.
path = os.path.abspath(os.path.join(_base_dir, os.pardir, 'fine_tuning_data', 'crypto_dataset'))

In [None]:
# Read the crypto Q&A CSV located at `path` and preview its first rows.
df_crypto = pd.read_csv(path)
df_crypto.head()

In [None]:
# Print the column summary of the freshly loaded frame.
df_crypto.info()

In [None]:
# Drop the leftover CSV index column. errors="ignore" keeps this cell safe to
# re-run: without it a second execution raises KeyError because the column is
# already gone.
df_crypto = df_crypto.drop(columns=["Unnamed: 0"], errors="ignore")

# Report how many fully duplicated rows exist before removing them.
duplicated_rows = df_crypto[df_crypto.duplicated()]
print(len(duplicated_rows))

# Remove the duplicates and renumber the rows 0..n-1.
df_crypto = df_crypto.drop_duplicates().reset_index(drop=True)

In [None]:
# Print the column summary again, after the clean-up above.
df_crypto.info()

In [None]:
# Missing-value report for df_crypto: NaN, empty-string and whitespace-only
# counts, one line per column.
nan_count_per_column = df_crypto.isna().sum()
print(f"nan count \n{nan_count_per_column}")
print("")

print("empty count")
for column, values in df_crypto.items():
    empty_total = values.eq('').sum()
    print(f"{column}: {empty_total}")
print("")

print("whitespace count")
for column, values in df_crypto.items():
    blank_total = values.str.strip().eq('').sum()
    print(f"{column}: {blank_total}")


In [None]:
# Shared googletrans client; safe_translate reads this module-level name.
translator = Translator()

def safe_translate(text, src='en', dest='pt', column=None, row=None, retries=2, delay=1.0):
  """Translate ``text`` with the module-level ``translator``, never raising.

  Failed requests are retried with an exponentially growing pause, because the
  translation service fails intermittently. Non-string input (``None``, NaN)
  is reported as a failure without sending a request.

  Args:
    text: The string to translate.
    src: Source language code.
    dest: Destination language code.
    column: Column name, used only in the failure message.
    row: Row label, used only in the failure message.
    retries: Number of extra attempts after the first failed one.
    delay: Seconds to wait before the first retry; doubled for each later retry.

  Returns:
    The translated text, or ``None`` when every attempt failed.
  """
  last_error = None

  if isinstance(text, str):
    attempts = max(0, retries) + 1
    for attempt in range(attempts):
      try:
        return translator.translate(text, src=src, dest=dest).text
      except Exception as e:
        # Best effort by design: remember the error and keep going.
        last_error = e
        if attempt < attempts - 1:
          time.sleep(delay * (2 ** attempt))
  else:
    last_error = ValueError("input is not a string")

  # Printed once per failed cell so that the reported (row, column) pairs match
  # the None values left in the output.
  print(f"({row}, {column})")
  print(f"Error translating text: {text} - {last_error}")
  return None

# Start with empty failure lists, then add a Portuguese copy of each text column.
untranslatable_a, untranslatable_q = [], []
for source_column in ('question', 'answer'):
  df_crypto[f'{source_column}_pt'] = df_crypto.apply(
    lambda record, name=source_column: safe_translate(
      record[name], src='en', dest='pt', column=name, row=record.name
    ),
    axis=1,
  )

In [None]:
# Renumber the rows 0..n-1, discarding the previous index.
df_crypto.reset_index(drop=True, inplace=True)

Google Translator API is too slow.

Sometimes it can't handle the volume of requests, or it simply fails on a specific request.

You will notice that after the first `safe_translate` pass some values are `None`.

Those rows need to be translated again.

### https://github.com/vTuanpham/Large_dataset_translator?tab=readme-ov-file

For those who have time to automate this code, check out this repo; it is a **dataset translator**.
#### Faster and more reliable.

In [None]:
# Row labels whose translation is still missing. safe_translate returns None on
# failure, which shows up as NaN in the *_pt columns, so the lists can be read
# from the data instead of being copied by hand from the printed output.
# (Left as None, the retry loops would raise TypeError when iterating.)
untranslatable_q = df_crypto.index[df_crypto['question_pt'].isna()].tolist()
untranslatable_a = df_crypto.index[df_crypto['answer_pt'].isna()].tolist()

Rerun the cells below as many times as necessary.

In [None]:
# Second pass: re-translate every question row listed as failed.
for row_index in untranslatable_q:
    original_question = df_crypto.loc[row_index, 'question']
    df_crypto.loc[row_index, 'question_pt'] = safe_translate(
        original_question, src='en', dest='pt', column='question', row=row_index
    )

In [None]:
# Second pass: re-translate every answer row listed as failed.
for row_index in untranslatable_a:
    original_answer = df_crypto.loc[row_index, 'answer']
    df_crypto.loc[row_index, 'answer_pt'] = safe_translate(
        original_answer, src='en', dest='pt', column='answer', row=row_index
    )

In [None]:
# Count the missing values in each column. Until every count is 0, keep retrying
# safe_translate (or switch to the recommended repository or another approach).
df_crypto.isna().sum()

In [None]:
# Preview the first rows of the two translated columns.
df_crypto[['question_pt', 'answer_pt']].head()

In [None]:
# Save the frame as CSV (relative path, so it lands in the current working
# directory) without writing the index as a column.
df_crypto.to_csv('preprocessed_crypto_finance.csv', index=False)

In [None]:
# Load the 'nihiluis/financial-advisor-100' dataset and build a DataFrame from
# its 'train' split.
dataset = load_dataset('nihiluis/financial-advisor-100')
df_nihilus = pd.DataFrame(dataset['train'])

In [None]:
# Print the column summary of the loaded frame.
df_nihilus.info()

In [None]:
# Drop the 'id' column. errors='ignore' keeps this cell safe to re-run: without
# it a second execution raises KeyError because the column is already gone.
df_nihilus = df_nihilus.drop(columns=['id'], errors='ignore')

In [None]:
# Missing-value report for df_nihilus: NaN, empty-string and whitespace-only
# counts, one line per column.
nan_count_per_column = df_nihilus.isna().sum()
print(f"nan count \n{nan_count_per_column}")
print("")

print("empty count")
for column, values in df_nihilus.items():
    empty_total = values.eq('').sum()
    print(f"{column}: {empty_total}")
print("")

print("whitespace count")
for column, values in df_nihilus.items():
    blank_total = values.str.strip().eq('').sum()
    print(f"{column}: {blank_total}")


In [None]:
# Remove fully duplicated rows, then renumber the remaining rows 0..n-1.
df_nihilus = df_nihilus.drop_duplicates().reset_index(drop=True)

In [None]:
# Add a Portuguese copy of each text column, questions first and then answers.
for source_column in ('question', 'answer'):
    df_nihilus[f'{source_column}_pt'] = df_nihilus.apply(
        lambda record, name=source_column: safe_translate(
            record[name], src='en', dest='pt', column=name, row=record.name
        ),
        axis=1,
    )

In [None]:
# Row labels of df_nihilus whose translation is still missing. Translation
# failures differ from run to run, so the lists are read from the NaN entries
# of the *_pt columns instead of being hardcoded from one particular run.
untranslatable_q_nihilus = df_nihilus.index[df_nihilus['question_pt'].isna()].tolist()
untranslatable_a_nihilus = df_nihilus.index[df_nihilus['answer_pt'].isna()].tolist()

# Backward-compatible alias for the original, misspelled variable name.
untranslatable_a_nihilusM = untranslatable_a_nihilus

In [None]:
# Retry the df_nihilus questions whose translation is still missing. The rows are
# taken from the NaN entries of 'question_pt' rather than from `untranslatable_q`,
# which holds row labels of the crypto frame, so only this frame's own failures
# are retried and the cell can simply be re-run until none remain.
for row_index in df_nihilus.index[df_nihilus['question_pt'].isna()]:
    df_nihilus.loc[row_index, 'question_pt'] = safe_translate(df_nihilus.loc[row_index, 'question'], src='en', dest='pt', column='question', row=row_index)

In [None]:
# Retry the df_nihilus answers whose translation is still missing. The rows are
# taken from the NaN entries of 'answer_pt' rather than from `untranslatable_a`,
# which holds row labels of the crypto frame, so only this frame's own failures
# are retried and the cell can simply be re-run until none remain.
for row_index in df_nihilus.index[df_nihilus['answer_pt'].isna()]:
    df_nihilus.loc[row_index, 'answer_pt'] = safe_translate(df_nihilus.loc[row_index, 'answer'], src='en', dest='pt', column='answer', row=row_index)

In [None]:
# Renumber the rows 0..n-1, discarding the previous index.
df_nihilus.reset_index(drop=True, inplace=True)

In [None]:
# Save the frame as CSV (relative path, so it lands in the current working
# directory) without writing the index as a column.
df_nihilus.to_csv('preprocessed_financial_advisor_100.csv', index=False)

In [None]:
# Stack the translated columns of both datasets and give them the plain
# 'question' / 'answer' names.
translated_to_plain = {'question_pt': 'question', 'answer_pt': 'answer'}
df_1 = df_crypto[list(translated_to_plain)]
df_2 = df_nihilus[list(translated_to_plain)]
df_combined = pd.concat([df_1, df_2], ignore_index=True).rename(columns=translated_to_plain)

In [None]:
# Shuffle all rows and renumber them. The fixed seed makes the row order (and
# therefore the exported file) reproducible across runs.
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
def create_message_format(row):
    """Build a two-turn chat from one row.

    Args:
        row: Mapping-like object with 'question' and 'answer' entries.

    Returns:
        A list of two dicts: the user turn carrying the question, followed by
        the assistant turn carrying the answer.
    """
    user_turn = {'role': 'user', 'content': row['question']}
    assistant_turn = {'role': 'assistant', 'content': row['answer']}
    return [user_turn, assistant_turn]

In [None]:
# One chat (list of two turns) per row, stored in a single 'messages' column.
messages = df_combined.apply(create_message_format, axis=1)
df_new_format = pd.DataFrame({'messages': messages})

In [None]:
# Write one JSON record per line (JSON Lines) to 'new_format.jsonl'.
df_new_format.to_json('new_format.jsonl', orient='records', lines=True)