In [None]:
import torch
import asyncio
from concurrent.futures import ThreadPoolExecutor
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import numpy as np
import torch
import asyncio

import nest_asyncio

# Patch asyncio so that asyncio.run() may be used in a nested fashion
nest_asyncio.apply()

# Read the input table from disk
dataset = pd.read_parquet('T5_allocate.parquet', engine='fastparquet')

# Prefer the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One entry per replica: 25 entries, all naming the same checkpoint
model_paths = ["Hyeonsieun/GTtoNT_addmoretoken_ver2" for _ in range(25)]

# Build two index-aligned lists: models[k] is paired with tokenizers[k]
models, tokenizers = [], []
for path in model_paths:
    tokenizer = T5Tokenizer.from_pretrained(path)
    # Load the weights, move them to the target device, switch to eval mode
    model = T5ForConditionalGeneration.from_pretrained(path).to(device).eval()
    tokenizers.append(tokenizer)
    models.append(model)



In [None]:

async def do_correction_3(text, model, tokenizer):
    """Convert one LaTeX equation into a text pronouncing the formula.

    The equation is wrapped in the task prompt, encoded to a fixed length of
    325 tokens (padded / truncated), and decoded with 5-beam search. The
    decoded string is produced with special tokens kept, so the marker
    strings are stripped by hand afterwards.

    The coroutine contains no ``await``: it runs straight through once it is
    driven (e.g. by ``asyncio.run``).

    Args:
        text: LaTeX source of the equation.
        model: Generation model exposing ``generate`` and a ``device``
            attribute; the encoded inputs are moved to that device.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.

    Returns:
        The decoded text after the first ``<pad>`` marker and before the
        first ``</s>`` marker, whitespace-stripped, with every ``<unk>``
        removed. A missing ``<pad>`` means "start at the beginning" and a
        missing ``</s>`` means "run to the end of the string".
    """
    input_text = f"translate the LaTeX equation to a text pronouncing the formula: {text}"

    # Send the inputs to wherever the model lives instead of relying on a
    # notebook-level `device` variable defined in another cell.
    inputs = tokenizer.encode(
        input_text,
        return_tensors='pt',
        max_length=325,
        padding='max_length',
        truncation=True
    ).to(model.device)

    corrected_ids = model.generate(
        inputs,
        max_length=325,
        num_beams=5,
        early_stopping=True
    )

    decoded = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=False
    )

    # str.find returns -1 when the marker is absent. Used unchecked, a missing
    # "</s>" would slice off the last character and a missing "<pad>" would
    # slice off the first four, so both cases are handled explicitly.
    pad_pos = decoded.find("<pad>")
    start_index = pad_pos + len("<pad>") if pad_pos != -1 else 0
    eos_pos = decoded.find("</s>")
    end_index = eos_pos if eos_pos != -1 else len(decoded)

    corrected_sentence = decoded[start_index:end_index].strip()
    return corrected_sentence.replace("<unk>", "")


In [None]:
import torch
import asyncio
from concurrent.futures import ThreadPoolExecutor
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
import numpy as np
import torch
import asyncio

import nest_asyncio

In [None]:
# Apply nest_asyncio to allow nested use of asyncio.run(), which process_batch below relies on
nest_asyncio.apply()
def process_batch(start_idx, end_idx, model, tokenizer, dataset, batch_num):
    """Translate the rows at positions ``[start_idx, end_idx)`` with one model.

    Each equation is run through ``do_correction_3``; results are buffered
    and flushed to disk via ``save_partial_results`` every 200 rows, with a
    final flush for whatever remains.

    Args:
        start_idx: First row position to process (inclusive).
        end_idx: Row position to stop at (exclusive).
        model: Model handed to ``do_correction_3``.
        tokenizer: Tokenizer handed to ``do_correction_3``.
        dataset: DataFrame with an ``'equation'`` column.
        batch_num: Identifier of this batch, used in the output file name.
    """
    # start_idx / end_idx are row positions, so rows are fetched by position
    # (.iloc). Plain `series[i]` is a label lookup and returns the wrong row,
    # or raises, when the frame's index is not 0..n-1.
    equations = dataset['equation']

    NT = []
    for i in range(start_idx, end_idx):
        TeX = equations.iloc[i]
        print(f"{i} input : {TeX}")
        # do_correction_3 is a coroutine function; drive it to completion here
        NT_result = asyncio.run(do_correction_3(TeX, model, tokenizer))
        print(f"{i} result : {NT_result}")
        NT.append(NT_result)
        if len(NT) >= 200:
            # Save intermediate results, tagged with the last row position written
            save_partial_results(NT, batch_num, i)
            NT = []
    # Save any remaining results
    if NT:
        save_partial_results(NT, batch_num, end_idx - 1)

def save_partial_results(NT, batch_num, end_idx):
    """Write one chunk of generated sentences to a parquet file.

    Args:
        NT: List of generated strings; stored in a single column named
            ``spoken_English``.
        batch_num: Batch identifier, embedded in the file name.
        end_idx: Row position of the last item in this chunk, embedded in
            the file name.

    The file is written to
    ``parquets/NT_results_batch{batch_num}_part{end_idx}.parquet``.
    """
    import os  # function-scope import: `os` is not imported at notebook level

    # Ensure the output directory exists so a finished chunk is not lost to a
    # missing-directory error; exist_ok tolerates concurrent callers.
    os.makedirs("parquets", exist_ok=True)

    df_partial = pd.DataFrame(NT, columns=['spoken_English'])
    filename = f"parquets/NT_results_batch{batch_num}_part{end_idx}.parquet"
    df_partial.to_parquet(filename, index=False, engine='fastparquet')
    print(f"Saved partial results to {filename}")

def main():
    """Fan the dataset out over all loaded models, one worker thread each.

    The rows are cut into ``len(models)`` contiguous slices of equal size
    (integer division); the final slice also takes the leftover rows. Each
    slice is handed to ``process_batch`` together with the model/tokenizer
    pair at the same position, and the call returns once every worker has
    finished.
    """
    total_rows = len(dataset)
    worker_count = len(models)
    rows_per_worker = total_rows // worker_count

    def _bounds(worker):
        # Half-open [lo, hi) range for this worker; the last one runs to the end
        lo = worker * rows_per_worker
        hi = total_rows if worker == worker_count - 1 else lo + rows_per_worker
        return lo, hi

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [
            pool.submit(process_batch, *_bounds(w), models[w], tokenizers[w], dataset, w)
            for w in range(worker_count)
        ]

        # Block until every worker is done; result() re-raises worker errors
        for job in pending:
            job.result()

# Start the run only when this code executes as the main module
if __name__ == "__main__":
    main()