In [2]:
# Mount Google Drive at /content/gdrive so the dataset folders used by this
# notebook can be read and written like regular files.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import os
import csv
import numpy as np
import pandas as pd
import sys
import tqdm

In [4]:
from pickle import NONE
from sklearn.model_selection import train_test_split

# Data Path
# NOTE(review): root_dir is a relative path, so it is resolved against the
# current working directory. Drive is mounted at /content/gdrive in this
# notebook, so this presumably relies on the working directory being
# /content -- confirm before running from another location.
root_dir = "gdrive/My Drive/CounterGEDI/"
# Folder holding the raw source files that the *_set() builders read from.
GeDI_dir = os.path.join(root_dir, 'Datasets/Original/')


def _label_politeness_split(df_polite, split):
  """Build the two-column (text, labels) frame for one split of the politeness table.

  Args:
    df_polite: DataFrame with at least the columns 'split', 'txt' and 'style'.
    split: value of the 'split' column to keep (e.g. 'train', 'val', 'test').

  Returns:
    A new DataFrame with a fresh 0..n-1 index and the columns 'text' (copied
    from 'txt') and 'labels', where 'labels' is 'polite' when 'style' is
    exactly 'P_9' and 'non_polite' for every other value.
  """
  subset = df_polite[df_polite['split'] == split]
  labels = subset['style'].map(
      lambda style: 'polite' if style == 'P_9' else 'non_polite')
  # Plain lists are used on purpose: they discard the filtered index so the
  # result gets a clean RangeIndex, which is what ends up in the saved CSV.
  return pd.DataFrame(
      {'text': subset['txt'].to_list(), 'labels': labels.to_list()},
      columns=['text', 'labels'])


def politeness_set():
  """Convert the raw politeness TSV into Train/Val/Test CSV files.

  Reads 'politeness.tsv' from GeDI_dir, splits it by its own 'split' column
  ('train', 'val', 'test'), reduces every row to a (text, labels) pair with a
  binary 'polite' / 'non_polite' label, and writes the three frames into
  'Datasets/Politeness/' under root_dir via save_data.
  """
  Polite_ = os.path.join(GeDI_dir, 'politeness.tsv')
  save_dir = os.path.join(root_dir, 'Datasets/Politeness/')

  df_polite = pd.read_table(Polite_, sep='\t')

  # One helper call per split replaces three copy-pasted row loops.
  df_polite_train, df_polite_val, df_polite_test = (
      _label_politeness_split(df_polite, split)
      for split in ('train', 'val', 'test'))

  os.makedirs(save_dir, exist_ok=True)
  save_data(save_dir, df_polite_train, df_polite_val, df_polite_test)

def detox_set(random_state=42):
  """Convert the raw toxicity CSVs into binary-labelled Train/Val/Test CSV files.

  Reads 'detox/train.csv', 'detox/test.csv' and 'detox/test_labels.csv' from
  GeDI_dir, joins the test texts with their labels on 'id', collapses all
  attribute columns into a single 'toxic' / 'non_toxic' label, splits the
  training data 90% / 10% (stratified on the label) into train and validation
  sets, and writes everything into 'Datasets/Toxicity/' under root_dir.

  Args:
    random_state: seed forwarded to train_test_split so the train/validation
      split is the same on every run. Pass None to get a different random
      split each time (the behaviour before this parameter existed).
  """
  train_path = os.path.join(GeDI_dir, 'detox/train.csv')
  test_path = os.path.join(GeDI_dir, 'detox/test.csv')
  test_labels_path = os.path.join(GeDI_dir, 'detox/test_labels.csv')
  save_dir = os.path.join(root_dir, 'Datasets/Toxicity/')

  # read df
  train_raw = pd.read_csv(train_path)
  test_raw = pd.read_csv(test_path)
  test_labels = pd.read_csv(test_labels_path)

  # merge test set with test labels
  test_merged = test_raw.merge(test_labels, left_on='id', right_on='id')

  def _to_binary_labels(frame):
    """Return an (id, text, labels) frame; a row is 'toxic' if any attribute column equals 1.

    Every column from the third one onwards is treated as an attribute flag,
    i.e. the first two columns are assumed to be 'id' and 'comment_text'.
    """
    # NOTE(review): any value other than 1 counts as not toxic. If the test
    # label file marks unscored rows with a sentinel (e.g. -1), those rows are
    # labelled 'non_toxic' here -- confirm against the dataset description.
    is_toxic = frame.iloc[:, 2:].eq(1).any(axis=1)
    labels = is_toxic.map({True: 'toxic', False: 'non_toxic'})
    return pd.DataFrame(
        {
            'id': frame['id'].to_list(),
            'text': frame['comment_text'].to_list(),
            'labels': labels.to_list(),
        },
        columns=['id', 'text', 'labels'])

  labelled_train = _to_binary_labels(train_raw)
  test = _to_binary_labels(test_merged)

  # Stratified 90% / 10% split of the released training data into training
  # and validation sets.
  train, val = train_test_split(
      labelled_train,
      stratify=labelled_train['labels'],
      test_size=0.1,
      random_state=random_state)

  os.makedirs(save_dir, exist_ok=True)
  save_data(save_dir, train, val, test)

def emo_set(random_state=42):
  """Build the multi-class and the one-vs-rest emotion datasets.

  Reads 'emotion.pkl' from GeDI_dir, keeps only the samples labelled sadness,
  joy, anger or fear, renames the 'emotions' column to 'labels', and makes a
  stratified 80% / 10% / 10% train / validation / test split. The multi-class
  split is written to 'Datasets/Emotion/'. Then, for every emotion present,
  a binary copy of the same split (that emotion vs. 'other') is written to
  'Datasets/Emotion_<emotion>/'.

  Each binary dataset is derived from the untouched multi-class split.
  Relabelling the shared frames in place would leave only
  {first emotion, 'other'} after the first pass, so every later dataset
  would contain nothing but 'other'.

  Args:
    random_state: seed forwarded to both train_test_split calls so the splits
      are the same on every run. Pass None to get a different random split
      each time (the behaviour before this parameter existed).
  """
  Emo_ = os.path.join(GeDI_dir, 'emotion.pkl')
  # NOTE: read_pickle unpickles the file, which can execute arbitrary code;
  # only load pickles that come from a trusted source.
  df = pd.read_pickle(Emo_)

  # drop samples with love OR surprise as stated in original paper
  df = df[df['emotions'].isin(['sadness', 'joy', 'anger', 'fear'])]

  # rename the emotion as labels
  df = df.rename(columns={'emotions': 'labels'})

  # Stratified split: 80% for training, then the remaining 20% is halved into
  # 10% validation and 10% test.
  train, val_test = train_test_split(
      df, stratify=df['labels'], test_size=0.2, random_state=random_state)
  val, test = train_test_split(
      val_test, stratify=val_test['labels'], test_size=0.5,
      random_state=random_state)

  save_dir = os.path.join(root_dir, 'Datasets/Emotion/')

  os.makedirs(save_dir, exist_ok=True)
  save_data(save_dir, train, val, test)

  def _one_vs_rest(frame, positive):
    """Return a copy of frame whose labels are `positive` or 'other'."""
    binary_labels = frame['labels'].apply(
        lambda label: positive if label == positive else 'other')
    # assign() returns a new frame, so the multi-class split stays intact.
    return frame.assign(labels=binary_labels)

  emotions = df["labels"].unique()
  for emo in emotions:
    # One dataset per attribute: that attribute is the positive class and all
    # other emotions are merged into 'other'.
    save_dir = os.path.join(root_dir, 'Datasets/Emotion_' + emo + '/')

    os.makedirs(save_dir, exist_ok=True)
    save_data(
        save_dir,
        _one_vs_rest(train, emo),
        _one_vs_rest(val, emo),
        _one_vs_rest(test, emo))


def save_data(save_dir, train, val, test=None):
  """Write the dataset splits as CSV files inside save_dir.

  'Train.csv' and 'Val.csv' are always written, in that order; 'Test.csv' is
  written only when a test frame is supplied. The directory must already
  exist, and each frame is saved with to_csv defaults (index included).

  Args:
    save_dir: directory that receives the CSV files.
    train: DataFrame saved as 'Train.csv'.
    val: DataFrame saved as 'Val.csv'.
    test: optional DataFrame saved as 'Test.csv'; skipped when None.
  """
  outputs = [('Train.csv', train), ('Val.csv', val)]
  if test is not None:
    outputs.append(('Test.csv', test))

  for file_name, frame in outputs:
    frame.to_csv(os.path.join(save_dir, file_name))



In [5]:
# Build and save only the politeness splits; the toxicity and emotion builders
# are left commented out.
politeness_set()
# detox_set()
# emo_set()