## Print Start time

In [None]:
# `print_time` is a project-local helper module (utils/print_time.py).
# NOTE(review): presumably prints the current time together with the given
# label so the notebook's total run time can be read off — confirm in utils.
from utils import print_time

print_time.print_("Start-Time")

## Specify Mode

In [None]:
# Run-mode flag; it is forwarded unchanged to
# preprocessing.preprocess_data(deploying=...) when the data is loaded.
# NOTE(review): the exact effect of True vs. False is decided inside
# utils.preprocessing and is not visible in this notebook.
deploying = False

## Specify Model

In [None]:
# Checkpoint identifier handed to preprocessing.preprocess_data(model_checkpoint=...).
# Exactly one assignment should be active; the other candidates are kept
# commented out so they can be switched in quickly.
# model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'
# model_checkpoint = 'bert-large-uncased'
# model_checkpoint = 'xlnet-base-cased'
model_checkpoint = 'xlnet-large-cased'
# model_checkpoint = 'xlm-roberta-large'
# model_checkpoint = 'microsoft/deberta-v2-xxlarge'

## Load df

In [None]:
from utils import preprocessing

# Load the data through the project preprocessing helper. It returns two
# frames: `df` (split into train/validation below) and `df_test`.
# NOTE(review): how the train and validation CSVs are combined into `df` is
# decided inside utils.preprocessing — confirm there.
df, df_test = preprocessing.preprocess_data(
    deploying=deploying,
    train_path='data/SMM4H_2024_Task3_Training_1800.csv',
    val_path='data/SMM4H_2024_Task3_Validation_600.csv',
    test_path='data/SMM4H_Task3_testposts.csv',
    model_checkpoint=model_checkpoint,
)

## Split data

In [None]:
from sklearn.model_selection import train_test_split

# Split texts and labels into a training part (70%) and a validation part (30%).
# random_state is fixed so every run produces the same split.
# NOTE(review): the split is not stratified although later cells treat some
# classes as underrepresented. Adding `stratify=` would change which rows land
# in the training set and would invalidate augmented CSVs already saved from
# the current split, so treat that as a deliberate, separate change.
train_texts, val_texts, y_train, y_val = train_test_split(
    df['text'], df['label'],
    test_size=0.3, random_state=42
)

# Texts of the test frame returned by preprocess_data.
test_texts = df_test['text']

## Get train_df

In [None]:
# pandas is not imported by any earlier cell, so import it here; without this
# the cell raises NameError on a fresh kernel (Restart & Run All).
import pandas as pd

# Assemble the training split into a single frame.
train_df = pd.DataFrame({'text': train_texts, 'label': y_train})

# The augmentation cells further down read train_df['keyword'], so carry that
# column over from `df`. The lookup is by index label, which is only safe when
# df's index is unique (with duplicates .loc would return extra rows).
if 'keyword' in df.columns and df.index.is_unique:
    train_df['keyword'] = df.loc[train_df.index, 'keyword']
else:
    print("Warning: 'keyword' column could not be attached to train_df; "
          "the augmentation cells that read it will fail.")

# Count the number of posts in each class
class_counts = train_df['label'].value_counts()
print("Class distribution before augmenting with paraphrased texts:\n", class_counts)

## Backtranslate

In [None]:
# Toggle for the backtranslation step (expensive; off by default).
backtranslate = False

if backtranslate:

    import os

    from utils import backtranslation

    # Every file written by this cell goes here; create the directory up front
    # so to_csv does not fail on a fresh checkout.
    backtranslation_dir = 'data/augmented_train_dfs'
    os.makedirs(backtranslation_dir, exist_ok=True)

    # Collected per-class results; appended to train_df once after the loop
    # instead of re-concatenating the growing frame on every iteration.
    backtranslated_frames = []

    # A tuple (not a set literal) so the processing order is explicit.
    for label in (1, 3):
        print(f"Backtranslating class {label}...")
        # Backtranslate and augment the data for underrepresented classes.
        # Select the class rows once and take both columns from that selection.
        class_rows = train_df.loc[train_df['label'] == label]
        selected_texts = class_rows['text']
        selected_keywords = class_rows['keyword']
        print(f"length texts of label {label}", len(selected_texts))
        augmented_texts = backtranslation.backtranslate_t5(selected_texts.to_list(), selected_keywords.to_list())
        # NOTE(review): the augmented rows carry no 'keyword', so that column is
        # NaN for them after the concat below. Whether backtranslate_t5 returns
        # exactly one text per input (so keywords could be re-attached) is not
        # visible here — confirm in utils.backtranslation.
        augmented_df = pd.DataFrame({'text': augmented_texts, 'label': [label] * len(augmented_texts)})
        augmented_df.to_csv(f'{backtranslation_dir}/backtranslated_t5_class_{label}.csv', index=False)
        backtranslated_frames.append(augmented_df)

    # Append all augmented rows in one go (same row order as appending per class).
    train_df = pd.concat([train_df, *backtranslated_frames])

    # Check the new class distribution after backtranslation
    print("Class distribution after backtranslation:", train_df['label'].value_counts())

    # Save the augmented training dataframe to a CSV file
    train_df_path = f'{backtranslation_dir}/train_df_plus_backtranslated_class_1_3.csv'
    train_df.to_csv(train_df_path, index=False)


## Paraphrase

In [None]:
# Toggle for the paraphrasing step (expensive; off by default).
paraphrase = False

if paraphrase:

    import os

    from utils import paraphrase_humarin

    # A tuple (not a set literal) so the processing order is explicit.
    for label in (1, 2, 3):
        print(f"Paraphrasing class {label}...")
        # Select the class rows once and take both columns from that selection.
        class_rows = train_df.loc[train_df['label'] == label]
        selected_texts = class_rows['text']
        selected_keywords = class_rows['keyword']
        print(f"length texts of label {label}", len(selected_texts))

        # Nothing to paraphrase: skip instead of failing on augmented_texts[0].
        if selected_texts.empty:
            print(f"No texts for label {label}; skipping")
            continue

        # augmented_texts is indexed as augmented_texts[text_idx][variant_idx],
        # e.g. [["t1", "t2", "t3", "t4"], ["t12", "t22", "t32", "t42"]].
        augmented_texts = paraphrase_humarin.paraphrase(selected_texts.to_list())
        if not augmented_texts:
            print(f"No paraphrases returned for label {label}; skipping")
            continue

        keywords = selected_keywords.to_list()
        n_variants = len(augmented_texts[0])

        # Write one CSV per paraphrase variant: variant i of every text goes to
        # data/augmented_dfs_train/Paraphrase{i+1}/.
        for i in range(n_variants):
            curr_texts = [variants[i] for variants in augmented_texts]
            augmented_df = pd.DataFrame({'text': curr_texts, 'label': [label] * len(curr_texts), 'keyword': keywords})
            variant_dir = f'data/augmented_dfs_train/Paraphrase{i+1}'
            # Create the per-variant directory so to_csv does not fail on a fresh checkout.
            os.makedirs(variant_dir, exist_ok=True)
            augmented_df.to_csv(f'{variant_dir}/paraphrased_class_{label}.csv', index=False)

    # The paraphrases are only written to disk; train_df itself is left
    # unchanged by this cell.
    print("\nDone paraphrasing\n")

## Traditional Augmentation

In [None]:
# Toggle for the traditional (rule-based) augmentation step. It was never
# defined in the notebook, so the cell raised NameError on a fresh kernel;
# it now mirrors the `backtranslate` / `paraphrase` flags (off by default).
augmenting = False

if augmenting:

    import os

    from utils import punct_insertion, random_deletion, random_swap, random_insertion

    print("Starting traditional augmentation...")

    # Output file stem -> per-text transform, in the order they are applied.
    augmenters = {
        'punct': punct_insertion.insert_punctuation,
        'rnd_del': random_deletion.rnd_del,
        'rnd_swap': random_swap.rnd_swap,
        'rnd_insert': random_insertion.rnd_insert,
    }

    # Build one augmented frame (text, label, keyword) per transform. The
    # transformed text replaces 'text' in a copy, so train_df itself no longer
    # gains helper columns as a side effect of running this cell.
    traditional_dfs = {
        stem: train_df[['text', 'label', 'keyword']].assign(text=train_df['text'].apply(transform))
        for stem, transform in augmenters.items()
    }

    # Keep the original per-transform names available to later cells.
    punct_df = traditional_dfs['punct']
    rnd_del_df = traditional_dfs['rnd_del']
    rnd_swap_df = traditional_dfs['rnd_swap']
    rnd_insert_df = traditional_dfs['rnd_insert']

    # Save the augmented training dataframes to CSV files; create the
    # directory first so to_csv does not fail on a fresh checkout.
    augmentation_dir = 'data/traditional_augmentation_train'
    os.makedirs(augmentation_dir, exist_ok=True)
    for stem, augmented in traditional_dfs.items():
        augmented.to_csv(f'{augmentation_dir}/{stem}_df1.csv', index=False)

    print("Traditional augmentation done...")
