## Print Start time

In [1]:
# Print a labelled timestamp banner marking the start of the run.
from utils import print_time

print_time.print_("Start-Time")

------------------------------------------------
Start-Time
2024-04-17 13:51:28
------------------------------------------------


## Specify modes

In [3]:
# Run mode. Later cells read this flag to choose between the "train" and
# "trainval" augmentation folders, to decide whether gold test labels
# (y_test) are available, and to decide whether predictions are written out.
deploying = False

# Data-preparation switches, each read by exactly one later cell:
#   paraphrase_aug  -> append the paraphrased CSVs to train_df
#   traditional_aug -> append the punct / random delete / swap / insert CSVs
#   undersampling   -> cap the number of rows kept per class
paraphrase_aug = True
traditional_aug = False
undersampling = False

if deploying:
    print("----------Deploying----------")

## Specify Model

In [4]:
# Checkpoint identifier used throughout the notebook: it is passed to the
# preprocessing helpers and to tune_transformer.run. Keep exactly one line
# active; the commented lines are alternatives that can be swapped in.
# NOTE(review): these look like Hugging Face Hub model ids — confirm.
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'
# model_checkpoint = 'bert-large-uncased'
# model_checkpoint = 'xlnet-base-cased'
# model_checkpoint = 'xlnet-large-cased'
# model_checkpoint = 'xlm-roberta-large'
# model_checkpoint = 'microsoft/deberta-v2-xxlarge'

## Load df

In [6]:
from utils import preprocessing

# Build the two frames the rest of the notebook works with:
#   df      -> split into train/validation texts and labels further down
#   df_test -> source of the test texts (and of y_test outside deploy mode)
# The deploying flag and the model checkpoint are forwarded to the helper.
df, df_test = preprocessing.preprocess_data(
    deploying=deploying,
    train_path='data/SMM4H_2024_Task3_Training_1800.csv',
    val_path='data/SMM4H_2024_Task3_Validation_600.csv',
    test_path='data/SMM4H_Task3_testposts.csv',
    model_checkpoint=model_checkpoint,
)

Reading data...
Data read...
           id           keyword  \
0      3u2w5k               run   
1      3xbury           outside   
2      3y743u  run, swim, climb   
3      43bvs7              walk   
4      442ap2           outside   
...       ...               ...   
2395  edvs552              walk   
2396   ee31pf           outside   
2397  eei3pz3     outside, walk   
2398  eek8bpk           outside   
2399  eeljxq0           outside   

                                                   text  label  
0     Afterwards, I want to make a run at young love...      0  
1     I've met her and several other girls on a spec...      0  
2     Anyway! I'd also like to be able to talk to pe...      0  
3     I did and again I failed to utter a word. I go...      0  
4     Anyway I ran back inside sort of grunting to a...      0  
...                                                 ...    ...  
2395   The thought of applying for a job terrified m...      2  
2396  I will forever think peo

## Split data

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of df for validation; the fixed random_state makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified by label. The pre-generated
# augmentation CSVs under data/augmented_dfs_train/ were presumably built from
# this exact split — confirm before changing test_size, random_state or
# stratification, otherwise paraphrases of validation texts could end up in
# the training data.
train_texts, val_texts, y_train, y_val = train_test_split(
    df['text'], df['label'],
    test_size=0.3, random_state=42
)

# Test texts always come from df_test.
test_texts = df_test['text']
# y_test is only defined outside deploy mode; the deploy branch of the
# "Run Model" cell passes y_test=None instead of reading this name.
if not deploying:
    y_test = df_test['label']

## Get train_df

In [9]:
import pandas as pd
# Recombine the training texts and labels into one frame so that augmented
# rows can be appended to it in the following cells.
train_df = pd.DataFrame({'text': train_texts, 'label': y_train})

# Count the number of posts in each class
class_counts = train_df['label'].value_counts()
print("Class distribution before augmenting with paraphrased texts:\n", class_counts)

Class distribution before cutting:
 label
0    796
2    265
1    116
3     83
Name: count, dtype: int64


## Augment train_df with augmented texts

In [10]:
import os  # not used in this cell; kept because other code may rely on it

# NOTE: this cell appends to train_df in place, so running it twice adds the
# augmented rows twice. To start over, re-run the notebook from the
# "Split data" cell (train_texts is overwritten further down).

# The augmented CSVs exist in two variants: one for the train split only and
# one for train+validation, which is used when deploying.
if deploying:
    paraphrase_path = "data/augmented_dfs_trainval/"
    aug_path = "data/traditional_augmentation_trainval/"
else:
    paraphrase_path = "data/augmented_dfs_train/"
    aug_path = "data/traditional_augmentation_train/"

# Columns read from every augmentation CSV.
AUG_COLUMNS = ['text', 'label', 'keyword']

if paraphrase_aug:
    # (paraphrase variant, class label) pairs, in concatenation order.
    # Class 1 uses three paraphrase variants, class 2 one and class 3 four —
    # presumably to compensate for the class imbalance; class 0 is not
    # paraphrased.
    paraphrase_sources = [
        (1, 1), (2, 1), (3, 1),
        (1, 2),
        (1, 3), (2, 3), (3, 3), (4, 3),
    ]
    paraphrased_df = pd.concat(
        [
            pd.read_csv(
                f"{paraphrase_path}Paraphrase{variant}/paraphrased_class_{label}.csv",
                usecols=AUG_COLUMNS,
            )
            for variant, label in paraphrase_sources
        ],
        ignore_index=True,
    )

    # Add keywords to the paraphrased rows
    paraphrased_df = preprocessing.add_keywords(paraphrased_df, model_checkpoint)

    train_df = pd.concat([train_df, paraphrased_df], ignore_index=True)

if traditional_aug:
    # One CSV per traditional augmentation technique, in concatenation order.
    # To leave a class out of the traditional augmentation, filter each frame
    # after reading it, e.g. frame.loc[frame['label'] != 0].
    traditional_files = [
        'punct_df.csv',
        'rnd_del_df.csv',
        'rnd_swap_df.csv',
        'rnd_insert_df.csv',
    ]
    aug_df = pd.concat(
        [pd.read_csv(aug_path + file_name, usecols=AUG_COLUMNS) for file_name in traditional_files],
        ignore_index=True,
    )

    # Add keywords to the augmented rows
    aug_df = preprocessing.add_keywords(aug_df, model_checkpoint)

    # Appended on top of whatever train_df already holds, including the
    # paraphrased rows when paraphrase_aug is also enabled.
    train_df = pd.concat([train_df, aug_df], ignore_index=True)
        

Paraphrased 1 df:
                                                   text  label
0    It's the best way for me to get outside at nig...      1
1    Take a long walk in nature and write everythin...      1
2    If you take a walk, you can start in a place w...      1
3    I went outside for the first time in about 8 m...      1
4    It would allow me to do more hobbies like park...      1
..                                                 ...    ...
144  Even if I'm still here and I still feel the an...      1
145                     There is a <sep> Go for a walk      1
146  You will feel less anxious if you are outside ...      1
147  Do you know the word "fika" in English, coffee...      1
148  Take a walk in the park, small things like tha...      1

[149 rows x 2 columns]


## Cut Classes to X texts

In [12]:
if undersampling:
    # Maximum number of rows kept per class (hyperparameter).
    # Class 0 has 796 samples and was not augmented.
    class_size = 1500

    # Draw at most class_size rows from every class; a class with fewer rows
    # keeps all of them. The fixed seed makes the draw reproducible.
    sampled_dfs = []
    for label in train_df['label'].unique():
        class_rows = train_df[train_df['label'] == label]
        rows_to_keep = min(len(class_rows), class_size)
        sampled_dfs.append(class_rows.sample(n=rows_to_keep, random_state=42))

    # Stitch the per-class samples back into a single training frame
    train_df = pd.concat(sampled_dfs, ignore_index=True)

## Extract texts and labels from train_df

In [12]:
# Shuffle all rows with a fixed seed and renumber the index from zero, so that
# original and augmented rows are interleaved.
shuffled_train_df = (
    train_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Overwrite the training texts and labels with the shuffled, augmented versions
train_texts, y_train = shuffled_train_df['text'], shuffled_train_df['label']

print("Train texts balanced", train_texts)
print("Datatype of y_train", type(y_train))
print("y_train balanced", y_train)

Train texts balanced 0       yeah i was in your boat for 2 years in the end...
1       we exchanged numbers and pics proving we are r...
2       I have to take both of my dogs for a walk ever...
3       as part of my office job, you had to deal with...
4       i went to the gaeltacht (irish speaking summer...
                              ...                        
1773    table tennis, as soon as i start playing, my a...
1774    I realize that the inner critic has no power w...
1775    and an automated turret placed outside <sep> o...
1776    I don't like riding in cars that don't have ti...
1777    i got there at 8:55 and decided to just wait t...
Name: text, Length: 1778, dtype: object
Datatype of y_train <class 'pandas.core.series.Series'>
y_train balanced 0       0
1       2
2       3
3       0
4       2
       ..
1773    0
1774    1
1775    0
1776    3
1777    0
Name: label, Length: 1778, dtype: int64


## Print train_df class distribution after augmentation and cutting

In [None]:
# Count the number of posts in each class. At this point train_df already
# contains any augmented rows and has been undersampled if that was enabled.
class_counts = train_df['label'].value_counts()
print("Class distribution after cutting:\n", class_counts)

In [None]:
# Disabled cell: bar chart of the class distribution in train_df.
# Uncomment the code below to re-enable it.
# import matplotlib.pyplot as plt

# df_plot = train_df.copy()

# label_mapping = {1: 'positive', 2: 'neutral', 3: 'negative', 0: 'unrelated'}
# df_plot['label'] = df_plot['label'].map(label_mapping)

# # Count the number of posts in each class
# class_counts = df_plot['label'].value_counts()
# print(class_counts)

# # Create a bar chart
# plt.figure(figsize=(8, 6))
# class_counts.plot(kind='bar')
# plt.title('Distribución de clases')
# plt.xlabel('Clase')
# plt.ylabel('Número de publicaciones')
# plt.xticks(rotation=0)
# plt.show()

## Hyperparameters

In [None]:
# Training configuration handed to tune_transformer.run as `hyperparameters`.
# NOTE(review): how each entry is interpreted is decided inside
# tune_transformer (not visible here) — confirm there before relying on the
# names alone.
hyperparameters = dict(
    epochs=100,
    batch_size=16,
    weight_decay=0.01,
    learning_rate=2e-6,
    warmup_steps=1000,
    metric_for_best_model="f1",
    early_stopping_patience=4,
    max_length=256,
)

## Run Model

In [None]:
from models import tune_transformer
# from models import tune_transformer_accelerate as tune_transformer

print("------------------------------------")
print("Model:", model_checkpoint)
print("------------------------------------")

# Dump the three text splits to disk, one text per row, without index or header.
print("Converting train, val and test texts to csv...")
train_texts.to_csv('data/train_texts.csv', index=False, header=False)
val_texts.to_csv('data/val_texts.csv', index=False, header=False)
test_texts.to_csv('data/test_texts.csv', index=False, header=False)

# Fine-tune and predict. Gold test labels only exist outside deploy mode, so
# y_test is None when deploying (y_test is not even defined in that case; the
# conditional expression never evaluates it then).
# NOTE(review): the literal 4 is presumably the number of target classes
# (labels 0-3) — confirm against tune_transformer.run.
test_pred_labels = tune_transformer.run(
    model_checkpoint, 4,
    train_texts, val_texts, test_texts,
    y_train, y_val,
    y_test=None if deploying else y_test,
    hyperparameters=hyperparameters,
)

if deploying:
    # Replace the test labels with the predicted labels
    df_test['label'] = test_pred_labels

    # Write the frame with the predicted labels as a tab-separated file
    print("Saving predictions to csv...")
    df_test.to_csv('data/prediction_task3.tsv', sep='\t', index=False)

## Print End Time

In [None]:
# Print the closing timestamp banner (print_time is imported in the first cell).
print_time.print_("End-Time")