In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [16]:
# Directory holding every CSV this notebook reads (gold standard, augmented
# data) and writes (combined train sets). The path is relative, so it is
# resolved against the kernel's current working directory.
augmented_datasets_dir = '../augmented_datasets/'

In [17]:
# Load the full gold-standard dataset; it is split into train/test further down.
goldstandard_path = os.path.join(augmented_datasets_dir, 'modified_goldstandard.csv')
original_df = pd.read_csv(goldstandard_path)


In [18]:
# Hold out 20% of the gold standard as the test set (fixed seed), then give
# each partition the same treatment: reshuffle with the same seed and rebuild
# a clean 0..n-1 index.
original_train_df, test_df = (
    partition.sample(frac=1, random_state=42).reset_index(drop=True)
    for partition in train_test_split(original_df, test_size=0.2, random_state=42)
)

In [23]:
def modify_df_columns(df, text_col='INSTANCE_COLUMN', label_col='LABEL_COLUMN'):
    """Return a new frame with exactly the columns ``inst_id``, ``text``, ``labels``.

    The column named ``text_col`` is renamed to ``text`` and the column named
    ``label_col`` to ``labels``. A frame that already uses the target names is
    accepted as-is, because renaming ignores source names that are absent.
    ``inst_id`` is a fresh 1-based running number in the frame's current row
    order; any existing ``inst_id`` column is replaced. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the instance text and its label.
    text_col : str, optional
        Name of the source column holding the instance text.
    label_col : str, optional
        Name of the source column holding the label.

    Returns
    -------
    pandas.DataFrame
        Columns ``['inst_id', 'text', 'labels']`` in that order, keeping the
        input's index.

    Raises
    ------
    KeyError
        If, after renaming, ``text`` or ``labels`` is still missing.
    """
    renamed = df.rename(columns={text_col: 'text', label_col: 'labels'})

    # rename() silently skips unknown keys, so check explicitly and fail with a
    # message that says which column is missing and what was available.
    missing = [col for col in ('text', 'labels') if col not in renamed.columns]
    if missing:
        raise KeyError(
            f"modify_df_columns: missing column(s) {missing}; "
            f"available columns are {list(df.columns)}"
        )

    # Copy the selection so adding inst_id never touches the caller's frame.
    result = renamed[['text', 'labels']].copy()
    result.insert(0, 'inst_id', range(1, len(result) + 1))
    return result

In [20]:
# Reduce the held-out split to the (inst_id, text, labels) layout.
test_df = modify_df_columns(test_df)
# NOTE(review): the export below is disabled — presumably test_set.csv was
# already written once; confirm before re-enabling so it is not overwritten.
# test_df.to_csv(os.path.join(augmented_datasets_dir, 'test_set.csv'), index=False)
# NOTE(review): the stored output of this cell shows only text/labels and no
# inst_id column, so it appears to predate the current modify_df_columns
# definition — re-run the notebook from a fresh kernel to refresh it.
display(test_df)

Unnamed: 0,text,labels
0,You take things that don't belong to you.,ABUSE
1,I take it you don't want to be characterised i...,NONE ABUSE
2,You are so imaginative at what could go wrong.,NONE ABUSE
3,You need to act your age.,ABUSE
4,I think you might be challenged vertically.,ABUSE
...,...,...
355,"You really carry on talking, regardless of the...",NONE ABUSE
356,You are the reason I have decided to spend mor...,NONE ABUSE
357,You have always given me amazing vibes.,NONE ABUSE
358,It's great that no one notices you when you sh...,NONE ABUSE


In [21]:
# Reduce the training split to the (inst_id, text, labels) layout.
original_train_df = modify_df_columns(original_train_df)
# NOTE(review): the export below is disabled — presumably original_train_set.csv
# was already written once; confirm before re-enabling so it is not overwritten.
# original_train_df.to_csv(os.path.join(augmented_datasets_dir, 'original_train_set.csv'), index=False)
# NOTE(review): the stored output of this cell shows only text/labels and no
# inst_id column, so it appears to predate the current modify_df_columns
# definition — re-run the notebook from a fresh kernel to refresh it.
display(original_train_df)

Unnamed: 0,text,labels
0,You seem to not take plans with your friends v...,NONE ABUSE
1,You are extremely provocative when we are disc...,NONE ABUSE
2,You would make better bread if you put in more...,NONE ABUSE
3,Your ability to ignore the negativity is aston...,NONE ABUSE
4,Your stinginess is not your greatest attribute.,NONE ABUSE
...,...,...
1432,You'll let anyone between your legs.,ABUSE
1433,How wonderful that you share your interests wi...,NONE ABUSE
1434,Your commitment to family is praiseworthy.,NONE ABUSE
1435,You could benefit from being more organised.,NONE ABUSE


In [24]:
# For each augmentation source, build one combined training file:
# original train rows + augmented rows, shuffled with a fixed seed and
# renumbered with a fresh 1-based inst_id.
for filename in (
    'flant5_large_augmented_data.csv',
    'flant5_xxl_augmented_data.csv',
    'gpt4_augmented_data.csv',
):
    source_path = os.path.join(augmented_datasets_dir, filename)

    # Bring the augmented data into the shared column layout; the inst_id it
    # receives here is dropped again by the column selection just below.
    aug_df = modify_df_columns(pd.read_csv(source_path))

    # Stack original and augmented rows (text/labels only), then shuffle and
    # rebuild a clean 0..n-1 index.
    text_and_labels = [frame[['text', 'labels']] for frame in (original_train_df, aug_df)]
    aug_train_df = (
        pd.concat(text_and_labels, ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # Number the combined rows 1..n in their shuffled order, as the first column.
    aug_train_df.insert(0, 'inst_id', range(1, len(aug_train_df) + 1))

    # '<model>_augmented_data.csv' -> '<model>_aug_train.csv', same directory.
    new_filename = filename.replace('_augmented_data.csv', '_aug_train.csv')
    target_path = os.path.join(augmented_datasets_dir, new_filename)
    aug_train_df.to_csv(target_path, index=False)