In [None]:
!pip install transformers sentencepiece
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.6.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.1/163.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.6.0


In [None]:
from transformers import pipeline
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd
import numpy as np
from tqdm.auto import tqdm  # Use auto to select the best interface (notebook, terminal, etc.)
import gc


2024-03-23 12:34:16.953058: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-23 12:34:16.953197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-23 12:34:17.119013: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
def read_process_df(path):
    """Load a header-less, tab-separated news table and keep only id and title.

    Columns 0-4 are named ``news_id``, ``category``, ``sub_category``,
    ``title`` and ``abstract``; everything except ``news_id`` and ``title``
    (including the positional columns 5, 6 and 7) is dropped.

    Args:
        path: location of the ``.tsv`` file.

    Returns:
        A DataFrame with the columns ``news_id`` and ``title`` and exact
        duplicate rows removed (the index is not reset).
    """
    column_names = {0: "news_id", 1: "category", 2: "sub_category", 3: "title", 4: "abstract"}
    unwanted_columns = ["abstract", "category", "sub_category", 5, 6, 7]
    return (
        pd.read_csv(path, sep="\t", header=None)
        .rename(columns=column_names)
        .drop(columns=unwanted_columns)
        .drop_duplicates()
    )

In [None]:
# Input files: the news tables of the train and validation splits.
small_train_path = "/kaggle/input/mind-small-train/news.tsv"
small_val_path = "/kaggle/input/mind-small-val/news.tsv"

train_df = read_process_df(small_train_path)
print(train_df.shape)

val_df = read_process_df(small_val_path)
print(val_df.shape)

# Stack both splits, drop rows that appear in both, and renumber the index from 0.
df = (
    pd.concat([train_df, val_df], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
df

(51282, 2)
(42416, 2)


Unnamed: 0,news_id,title
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an..."
1,N19639,50 Worst Habits For Belly Fat
2,N61837,The Cost of Trump's Aid Freeze in the Trenches...
3,N53526,I Was An NBA Wife. Here's How It Affected My M...
4,N38324,"How to Get Rid of Skin Tags, According to a De..."
...,...,...
65233,N2292,House investigators release more impeachment t...
65234,N27291,Mural in Downtown S.F. Depicts Swedish Teen Cl...
65235,N52871,Residents of Mexican town struggle with fear a...
65236,N36658,Apartments for rent in Minneapolis: What will ...


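**Parallel processing version (batches are submitted to a thread pool; every model is loaded onto the single device selected by `torch.device`, so the work is not spread across multiple GPUs):**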

In [None]:
import gc
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

class BackTranslator:
    """Back-translate English texts through several pivot languages using a thread pool.

    With the ``"helsinki"`` method every batch is translated to French,
    German, Spanish and Hebrew and back to English with pairs of
    Helsinki-NLP opus-mt models. Batches are submitted to a
    ``ThreadPoolExecutor``; all models live on ``self.device`` and are shared
    between the worker threads through ``models_cache``.
    """

    def __init__(self, batch_size = 8):
        """
        Args:
            batch_size: number of texts passed through the models per call.
        """
        self.batch_size = batch_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.en_to_fr , self.fr_to_en = None, None
        self.en_to_ger , self.ger_to_en = None, None
        self.en_to_esp , self.esp_to_en = None, None
        self.en_to_arab , self.arab_to_en = None, None
        self.en_to_heb , self.heb_to_en = None, None
        self.models_cache = {}
        # The cache is filled from several worker threads at once; without a lock
        # every thread that sees a miss would load its own copy of the same model.
        self._cache_lock = threading.Lock()

    def _load_model_and_tokenizer(self, model_name):
        """Return ``(tokenizer, model)`` for ``model_name``, loading it at most once."""
        with self._cache_lock:
            if model_name not in self.models_cache:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
                self.models_cache[model_name] = (tokenizer, model)
            return self.models_cache[model_name]

    def transform(self, method, texts):
        """Back-translate one batch of texts.

        Args:
            method: ``"helsinki"`` is the only method implemented by this class.
            texts: list of English strings.

        Returns:
            For ``"helsinki"``: a 4-tuple of lists (French, German, Spanish,
            Hebrew pivot), each holding one back-translation per input text.

        Raises:
            NotImplementedError: for ``"mbart"``, which has no implementation here.
            ValueError: for any other method name.
        """
        if method == "helsinki":
            self.en_to_fr , self.fr_to_en = "Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"
            self.en_to_ger , self.ger_to_en = "Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-de-en"
            self.en_to_esp , self.esp_to_en = "Helsinki-NLP/opus-mt-en-es", "Helsinki-NLP/opus-mt-es-en"
            self.en_to_heb , self.heb_to_en = "Helsinki-NLP/opus-mt-en-he", "Helsinki-NLP/opus-mt-tc-big-he-en"
            paraphrased_batch_french = self._helsinki_back_translation_batch(texts, self.en_to_fr, self.fr_to_en)
            paraphrased_batch_german = self._helsinki_back_translation_batch(texts, self.en_to_ger, self.ger_to_en)
            paraphrased_batch_esp = self._helsinki_back_translation_batch(texts, self.en_to_esp, self.esp_to_en)
            paraphrased_batch_heb = self._helsinki_back_translation_batch(texts, self.en_to_heb, self.heb_to_en)
            return paraphrased_batch_french, paraphrased_batch_german, paraphrased_batch_esp, paraphrased_batch_heb
        elif method == "mbart":
            self.en_to_fr , self.fr_to_en = "en_XX", "fr_XX"
            self.en_to_ger , self.ger_to_en = "en_XX", "de_DE"
            self.en_to_esp , self.esp_to_en = "en_XX", "es_XX"
            self.en_to_arab , self.arab_to_en = "en_XX", "ar_AR"
            self.en_to_heb , self.heb_to_en = "en_XX", "he_IL"
            # This class has no mBART helper; calling the missing method used to
            # fail with an opaque AttributeError, so fail explicitly instead.
            raise NotImplementedError(
                "mbart back translation is not implemented in the parallel BackTranslator; "
                "use the non-parallel BackTranslator.augment_dataframe instead"
            )
        else:
            raise ValueError("Unsupported back translation method")

    def _helsinki_back_translation_batch(self, texts, src_model_name, tgt_model_name):
        """Translate ``texts`` with ``src_model_name`` and back with ``tgt_model_name``.

        Returns:
            List with one back-translated string per input text.
        """
        # Forward pass: source language -> pivot language.
        src_tokenizer, src_model = self._load_model_and_tokenizer(src_model_name)
        src_encoded = src_tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
        src_translated_tokens = src_model.generate(**src_encoded)
        src_translated_texts = src_tokenizer.batch_decode(src_translated_tokens, skip_special_tokens=True)

        # Backward pass: pivot language -> source language.
        tgt_tokenizer, tgt_model = self._load_model_and_tokenizer(tgt_model_name)
        tgt_encoded = tgt_tokenizer(src_translated_texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
        tgt_translated_tokens = tgt_model.generate(**tgt_encoded)
        back_translated_texts = tgt_tokenizer.batch_decode(tgt_translated_tokens, skip_special_tokens=True)

        return back_translated_texts

    def process_batch(self,method, batch_texts):
        """Worker entry point: back-translate a single batch."""
        return self.transform(method, batch_texts)

    def augment_dataframe_parallel(self, df, df_name, column_name, method, num_workers=4):
        """Add one back-translation column per pivot language to ``df`` and save it.

        ``df`` is modified in place (columns ``french_augmentation``,
        ``german_augmentation``, ``spanish_augmentation`` and
        ``hebrew_augmentation`` are added) and written to
        ``augmneted_back_translated_{df_name}_{method}.csv``.

        Args:
            df: DataFrame holding the texts.
            df_name: label used in the output file name.
            column_name: column of ``df`` that contains the texts.
            method: back-translation method passed to ``transform``.
            num_workers: number of threads in the pool.

        Returns:
            The same ``df`` object. Rows of a batch that raised keep ``None``
            in the new columns; the error is printed.
        """
        texts = df[column_name].tolist()
        n_texts = len(texts)
        augmented_texts_french = [None] * n_texts
        augmented_texts_german = [None] * n_texts
        augmented_texts_spanish = [None] * n_texts
        augmented_texts_heb = [None] * n_texts
        targets = (augmented_texts_french, augmented_texts_german, augmented_texts_spanish, augmented_texts_heb)

        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            future_to_batch = {
                executor.submit(self.process_batch, method, texts[start:start + self.batch_size]): start
                for start in range(0, n_texts, self.batch_size)
            }
            for future in as_completed(future_to_batch):
                batch_index = future_to_batch[future]
                try:
                    paraphrased_batch_french, paraphrased_batch_german, paraphrased_batch_spanish, paraphrased_batch_heb = future.result()
                    batches = (paraphrased_batch_french, paraphrased_batch_german, paraphrased_batch_spanish, paraphrased_batch_heb)
                    # A slice assignment with a wrong-sized batch would silently resize the
                    # result lists and shift every later row, so check the sizes first.
                    expected = min(self.batch_size, n_texts - batch_index)
                    if any(len(batch) != expected for batch in batches):
                        raise ValueError(f"expected {expected} texts per language")
                    for target, batch in zip(targets, batches):
                        target[batch_index:batch_index + expected] = batch
                except Exception as exc:
                    print(f"Batch {batch_index} generated an exception: {exc}")
                finally:
                    gc.collect()

        df["french_augmentation"] = augmented_texts_french
        df["german_augmentation"] = augmented_texts_german
        df["spanish_augmentation"] = augmented_texts_spanish
        df["hebrew_augmentation"] = augmented_texts_heb
        df.to_csv(f"augmneted_back_translated_{df_name}_{method}.csv", index = False)
        return df

In [None]:
# Collect unreachable Python objects first so the GPU tensors they hold are freed,
# then release the now-unoccupied cached blocks held by the CUDA allocator.
gc.collect()
torch.cuda.empty_cache()

**Without parallel processing:**

In [None]:
import gc
class BackTranslator:
    """Sequential back-translation augmenter (English -> French -> English).

    ``transform`` supports two methods: ``"helsinki"`` (a pair of
    Helsinki-NLP opus-mt models, one variant per text) and ``"mbart"``
    (``facebook/mbart-large-50-many-to-many-mmt``, ``num_return_sequences``
    variants per text). Loaded models are kept in ``models_cache``.
    """

    def __init__(self, batch_size = 8, num_return_sequences = 5):
        """
        Args:
            batch_size: number of texts passed through the models per call.
            num_return_sequences: number of variants produced per text by the
                mBART method.
        """
        self.batch_size = batch_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.en_to_fr , self.fr_to_en = None, None
        self.en_to_ger , self.ger_to_en = None, None
        self.en_to_esp , self.esp_to_en = None, None
        self.en_to_arab , self.arab_to_en = None, None
        self.en_to_heb , self.heb_to_en = None, None
        self.num_return_sequences = num_return_sequences
        self.models_cache = {}

    def _load_model_and_tokenizer(self, model_name):
        """Return ``(tokenizer, model)`` for ``model_name``, loading it on first use."""
        if model_name not in self.models_cache:
            if "mbart" in model_name:
                tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
                model = MBartForConditionalGeneration.from_pretrained(model_name).to(self.device)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
            self.models_cache[model_name] = (tokenizer, model)
        return self.models_cache[model_name]

    def transform(self, method, texts):
        """Back-translate one batch of texts.

        Args:
            method: ``"helsinki"`` or ``"mbart"``.
            texts: list of English strings.

        Returns:
            ``"helsinki"``: a flat list with one back-translation per text.
            ``"mbart"``: a list of ``num_return_sequences`` columns, each a
            list with one back-translation per text.

        Raises:
            ValueError: if ``method`` is not supported.
        """
        if method == "helsinki":
            # Only the French pivot is used by this class.
            self.en_to_fr , self.fr_to_en = "Helsinki-NLP/opus-mt-en-fr", "Helsinki-NLP/opus-mt-fr-en"
            paraphrased_batch_french = self._helsinki_back_translation_batch(texts,self.en_to_fr,self.fr_to_en )
            return paraphrased_batch_french
        elif method == "mbart":
            self.en_to_fr , self.fr_to_en = "en_XX", "fr_XX"
            self.en_to_ger , self.ger_to_en = "en_XX", "de_DE"
            self.en_to_esp , self.esp_to_en = "en_XX", "es_XX"
            self.en_to_arab , self.arab_to_en = "en_XX", "ar_AR"
            self.en_to_heb , self.heb_to_en = "en_XX", "he_IL"
            return self._mbart_back_translation(texts, "facebook/mbart-large-50-many-to-many-mmt")
        else:
            raise ValueError("Unsupported back translation method")

    def _helsinki_back_translation_batch(self, texts, src_model_name, tgt_model_name):
        """Translate ``texts`` with ``src_model_name`` and back with ``tgt_model_name``.

        Returns:
            List with one back-translated string per input text.
        """
        # Forward pass: source language -> pivot language.
        src_tokenizer, src_model = self._load_model_and_tokenizer(src_model_name)
        src_encoded = src_tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
        src_translated_tokens = src_model.generate(**src_encoded)
        src_translated_texts = src_tokenizer.batch_decode(src_translated_tokens, skip_special_tokens=True)

        # Backward pass: pivot language -> source language.
        tgt_tokenizer, tgt_model = self._load_model_and_tokenizer(tgt_model_name)
        tgt_encoded = tgt_tokenizer(src_translated_texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
        tgt_translated_tokens = tgt_model.generate(**tgt_encoded)
        back_translated_texts = tgt_tokenizer.batch_decode(tgt_translated_tokens, skip_special_tokens=True)

        return back_translated_texts


    def _mbart_back_translation(self, texts, model_name):
        """Produce ``num_return_sequences`` back-translations per text with mBART-50.

        Each text is translated to French with ``num_return_sequences`` beam
        candidates; every candidate is then translated back to English once,
        so exactly ``num_return_sequences`` English variants exist per text.

        Returns:
            List of ``num_return_sequences`` columns; column ``j`` holds the
            ``j``-th variant of every input text, in input order.
        """
        tokenizer, model = self._load_model_and_tokenizer(model_name)

        src_lang = "en_XX"
        tgt_lang = "fr_XX"
        n_variants = self.num_return_sequences
        # Beam search cannot return more sequences than it has beams.
        n_beams = max(5, n_variants)

        tokenizer.src_lang = src_lang
        encoded_en = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
        generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
                                   num_return_sequences=n_variants,
                                   num_beams=n_beams,
                                   max_length=512)
        translated_to_target = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        tokenizer.src_lang = tgt_lang
        encoded_target = tokenizer(translated_to_target, padding=True, truncation=True, return_tensors="pt", max_length=512).to(self.device)
        # One English output per French candidate: asking for several here would
        # multiply the variants again and break the per-text grouping below.
        generated_tokens_back = model.generate(**encoded_target, forced_bos_token_id=tokenizer.lang_code_to_id[src_lang],
                                   num_return_sequences=1,
                                   num_beams=n_beams,
                                   max_length=512)
        back_translated_to_english = tokenizer.batch_decode(generated_tokens_back, skip_special_tokens=True)

        augmentations_per_txt = [back_translated_to_english[k:k + n_variants]
                                 for k in range(0, len(back_translated_to_english), n_variants)]
        # a transpose operation:
        augmentations_per_column = [list(group) for group in zip(*augmentations_per_txt)]
        return augmentations_per_column

    def augment_dataframe(self, df, df_name, column_name, method):
        """Add ``{method}_augmentation_{j}`` columns to ``df`` batch by batch.

        ``df`` is modified in place. One column is created per variant that
        ``transform`` returns (a single column for ``"helsinki"``). Every 10
        batches the rows processed so far are written to
        ``incremental_augmented_{df_name}_{method}_{k}.csv``; the full frame is
        written to ``augmneted_paraphrased_{df_name}_{method}.csv`` at the end.

        Args:
            df: DataFrame holding the texts.
            df_name: label used in the output file names.
            column_name: column of ``df`` that contains the texts.
            method: back-translation method passed to ``transform``.

        Returns:
            The same ``df`` object with the augmentation columns filled.
        """
        texts = df[column_name].tolist()
        gc_indicator = 0
        for i in tqdm(range(0, len(texts), self.batch_size), desc="Processing batches"):
            batch_texts = texts[i:i+self.batch_size]
            paraphrased_batch_columns = self.transform(method, batch_texts)
            # "helsinki" returns a flat list of strings; treat it as a single column
            # instead of indexing into it as if it were a list of columns.
            if paraphrased_batch_columns and isinstance(paraphrased_batch_columns[0], str):
                paraphrased_batch_columns = [paraphrased_batch_columns]
            for j, column_values in enumerate(paraphrased_batch_columns):
                column = f"{method}_augmentation_{j}"
                if column not in df.columns:
                    df[column] = ""
                # Positional write: correct for any index, not only 0..n-1 labels.
                df.iloc[i:i + len(column_values), df.columns.get_loc(column)] = list(column_values)
            gc_indicator+=1
            if gc_indicator % 10 == 0:
                # Save only the processed portion of the DataFrame to avoid excessive file size during early saves
                if i + self.batch_size < len(texts):
                    df[:i + self.batch_size].to_csv(f"incremental_augmented_{df_name}_{method}_{gc_indicator // 10}.csv", index=False)
                else:
                    df.to_csv(f"incremental_augmented_{df_name}_{method}_{gc_indicator // 10}.csv", index=False)
                gc.collect()
        df.to_csv(f"augmneted_paraphrased_{df_name}_{method}.csv", index = False)
        return df


In [None]:
# Back-translate the titles through French with the Helsinki-NLP models.
back_translator = BackTranslator(batch_size=64)
augmented_df = back_translator.augment_dataframe(
    df=df,
    df_name="back_french_second_run",
    column_name="title",
    method="helsinki",
)
augmented_df

In [None]:
# Back-translate the titles through French with mBART-50.
# The frame built above only has the columns "news_id" and "title"; asking for a
# "text" column raised a KeyError.
back_translator = BackTranslator(batch_size=16)
augmented_df = back_translator.augment_dataframe(df, "back_french_mbart", 'title', "mbart")
augmented_df

**Paraphraser Module:**

In [None]:
import os
# Sets the CUDA_LAUNCH_BLOCKING environment variable to "1" for this process.
# NOTE(review): this looks like a debugging aid (presumably to get accurate error
# locations for CUDA failures). It probably has to be set before CUDA is
# initialised, and torch is already imported in an earlier cell — confirm that it
# takes effect here and that it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

class Paraphraser:
    """Generate paraphrases of texts with one of three seq2seq models.

    Supported methods (see ``transform``): ``"bart"``
    (``eugenesiow/bart-paraphrase``), ``"chatgpt_t5"``
    (``humarin/chatgpt_paraphraser_on_T5_base``) and ``"pegasus"``
    (``tuner007/pegasus_paraphrase``). Loaded models are kept in
    ``models_cache``.
    """

    def __init__(self, num_return_sequences=5, num_beams=5, max_length=128, batch_size=8):
        """
        Args:
            num_return_sequences: paraphrases generated per input text.
            num_beams: beam width passed to ``generate``.
            max_length: token limit for BART / T5 inputs and outputs.
            batch_size: number of texts passed through the model per call.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.num_return_sequences = num_return_sequences
        self.num_beams = num_beams
        self.max_length = max_length
        self.batch_size = batch_size
        self.models_cache = {}

    def _load_model_and_tokenizer(self, model_name):
        """Return ``(tokenizer, model)`` for ``model_name``, loading it on first use."""
        if model_name not in self.models_cache:
            if "pegasus" in model_name:
                tokenizer = PegasusTokenizer.from_pretrained(model_name)
                model = PegasusForConditionalGeneration.from_pretrained(model_name).to(self.device)
                print("adding pegasus to dict")
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
                print(f"adding {model_name} to dict")
            self.models_cache[model_name] = (tokenizer, model)
        return self.models_cache[model_name]

    def _group_into_columns(self, paraphrases):
        """Reshape the flat ``generate`` output into per-variant columns.

        ``paraphrases`` holds ``num_return_sequences`` consecutive entries per
        input text. With a single variant the flat list is returned unchanged;
        otherwise the result is a list of ``num_return_sequences`` columns,
        column ``j`` holding the ``j``-th variant of every text.
        """
        n_variants = self.num_return_sequences
        if n_variants == 1:
            return paraphrases
        augmentations_per_txt = [paraphrases[k:k + n_variants]
                                 for k in range(0, len(paraphrases), n_variants)]
        # a transpose operation:
        return [list(group) for group in zip(*augmentations_per_txt)]

    def transform(self, texts, method):
        """Paraphrase one batch of texts.

        Args:
            texts: list of strings.
            method: ``"bart"``, ``"chatgpt_t5"`` or ``"pegasus"``.

        Returns:
            A flat list (one paraphrase per text) when
            ``num_return_sequences == 1``, otherwise a list of
            ``num_return_sequences`` columns.

        Raises:
            ValueError: if ``method`` is not supported.
        """
        if method == "bart":
            return self._bart_paraphrase(texts)
        elif method == "chatgpt_t5":
            return self._chatgpt_t5_paraphrase(texts)
        elif method == "pegasus":
            return self._pegasus_paraphrase(texts)
        else:
            raise ValueError(f"Unsupported paraphrase method: {method}")

    def _bart_paraphrase(self, texts):
        """Paraphrase ``texts`` with ``eugenesiow/bart-paraphrase``."""
        tokenizer, model = self._load_model_and_tokenizer('eugenesiow/bart-paraphrase')

        batch = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length).to(self.device)
        # Pass the attention mask explicitly so padded positions are ignored.
        generated = model.generate(input_ids=batch['input_ids'],
                                   attention_mask=batch['attention_mask'],
                                   num_return_sequences=self.num_return_sequences,
                                   num_beams=self.num_beams,
                                   max_length=self.max_length,
                                   temperature=1.5)
        paraphrases = tokenizer.batch_decode(generated, skip_special_tokens=True)
        return self._group_into_columns(paraphrases)

    def _pegasus_paraphrase(self, texts):
        """Paraphrase ``texts`` with ``tuner007/pegasus_paraphrase``.

        If generation raises, the original texts are used in place of the
        paraphrases so that the output keeps the usual shape.
        """
        tokenizer, model = self._load_model_and_tokenizer('tuner007/pegasus_paraphrase')

        # Never feed or generate more tokens than the model has positions for.
        # NOTE(review): over-long inputs are the presumed cause of the generation
        # failures the fallback below was written for — confirm.
        position_limit = getattr(model.config, "max_position_embeddings", None) or 256
        token_limit = min(256, position_limit)

        batch = tokenizer(texts, padding="longest", truncation=True,
                          max_length=token_limit, return_tensors="pt").to(self.device)
        try:
            generated = model.generate(input_ids=batch["input_ids"],
                                       attention_mask=batch["attention_mask"],
                                       max_length=token_limit,
                                       num_beams=self.num_beams,
                                       num_return_sequences=self.num_return_sequences,
                                       temperature=1,
                                       do_sample=True)
            paraphrases = tokenizer.batch_decode(generated, skip_special_tokens=True)
        except Exception as exc:
            print("not succeed anyway")
            print(batch['input_ids'].size(1))
            print(f"pegasus generation failed: {exc!r}")
            # Fall back to the inputs, repeated once per requested variant, in the
            # same flat layout that batch_decode would have produced.
            paraphrases = [text for text in texts for _ in range(self.num_return_sequences)]

        return self._group_into_columns(paraphrases)

    def _chatgpt_t5_paraphrase(self, texts):
        """Paraphrase ``texts`` with ``humarin/chatgpt_paraphraser_on_T5_base``."""
        tokenizer, model = self._load_model_and_tokenizer("humarin/chatgpt_paraphraser_on_T5_base")

        paraphrase_prompts = [f"paraphrase: {text}" for text in texts]

        batch = tokenizer(paraphrase_prompts, return_tensors="pt", padding="longest", truncation=True,
                          max_length=self.max_length).to(self.device)
        # NOTE(review): num_beam_groups is fixed at 5, so num_beams presumably has to
        # be a multiple of 5 for this call to be accepted — confirm.
        generated = model.generate(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            max_length=self.max_length,
            num_beams=self.num_beams,
            num_return_sequences=self.num_return_sequences,
            temperature=0.7,
            num_beam_groups=5,
            repetition_penalty=10.0,
            diversity_penalty=3.0,
            no_repeat_ngram_size=2,
        )

        paraphrases = tokenizer.batch_decode(generated, skip_special_tokens=True)
        return self._group_into_columns(paraphrases)

    def augment_dataframe(self, df, df_name, column_name, method):
        """Add ``{method}_augmentation_{j}`` columns to ``df`` batch by batch.

        ``df`` is modified in place; ``num_return_sequences`` columns are
        created. Every 10 batches the rows processed so far are written to
        ``incremental_augmented_{df_name}_{method}_{k}.csv``; the full frame is
        written to ``augmneted_paraphrased_{df_name}_{method}.csv`` at the end.

        Args:
            df: DataFrame holding the texts.
            df_name: label used in the output file names.
            column_name: column of ``df`` that contains the texts.
            method: paraphrase method passed to ``transform``.

        Returns:
            The same ``df`` object with the augmentation columns filled.
        """
        texts = df[column_name].tolist()
        gc_indicator = 0
        for j in range(self.num_return_sequences):
            df[f"{method}_augmentation_{j}"] = ""
        for i in tqdm(range(0, len(texts), self.batch_size), desc="Processing batches"):
            batch_texts = texts[i:i+self.batch_size]
            paraphrased_batch_columns = self.transform(batch_texts, method)
            # With a single variant transform() returns a flat list of strings;
            # treat it as one column instead of indexing into it as columns.
            if paraphrased_batch_columns and isinstance(paraphrased_batch_columns[0], str):
                paraphrased_batch_columns = [paraphrased_batch_columns]
            for j, column_values in enumerate(paraphrased_batch_columns):
                column = f"{method}_augmentation_{j}"
                # Positional write: correct for any index, not only 0..n-1 labels.
                df.iloc[i:i + len(column_values), df.columns.get_loc(column)] = list(column_values)
            gc_indicator+=1
            if gc_indicator % 10 == 0:
                # Save only the processed portion of the DataFrame to avoid excessive file size during early saves
                if i + self.batch_size < len(texts):
                    df[:i + self.batch_size].to_csv(f"incremental_augmented_{df_name}_{method}_{gc_indicator // 10}.csv", index=False)
                else:
                    df.to_csv(f"incremental_augmented_{df_name}_{method}_{gc_indicator // 10}.csv", index=False)
                gc.collect()
        df.to_csv(f"augmneted_paraphrased_{df_name}_{method}.csv", index = False)
        return df

In [None]:
# Collect unreachable Python objects first so the GPU tensors they hold are freed,
# then release the now-unoccupied cached blocks held by the CUDA allocator.
gc.collect()
torch.cuda.empty_cache()

804

In [1]:
# Paraphrase every title with the T5-based model; the new columns are added to `df` in place.
paraphraser = Paraphraser(
    num_return_sequences=5,
    num_beams=5,
    max_length=128,
    batch_size=32,
)
# Alternative run with the Pegasus model:
# paraphraser.augment_dataframe(df, "pegasus", "title","pegasus")
paraphraser.augment_dataframe(
    df=df,
    df_name="chatgpt_t5",
    column_name="title",
    method="chatgpt_t5",
)
df

In [None]:
# The paraphraser cell augments `df` in place and does not bind `augmented_df`
# (that name is only set by the back-translation cells), so save `df` directly.
df.to_csv("final_T5_gpt.csv", index = False)