In [1]:
import pandas as pd 
import numpy as np
import re
import time
import nltk
from nltk.tokenize import word_tokenize
import pickle
from nltk.corpus import wordnet

# Tokens treated as negation markers. The cells below compare them against the
# tokens produced by word_tokenize; "n't" is presumably the piece the tokenizer
# splits off contractions such as "don't" - TODO confirm against the NLTK docs.
negation_words = ["no", "not", "n't"]

In [2]:
# Only these columns are read from the pre-processed CSV subsets.
fields = ['comment_text', 'toxicity','target']


def _load_complete_rows(csv_path):
    """Read the `fields` columns of `csv_path` and drop every row that has a missing value."""
    frame = pd.read_csv(csv_path, usecols = fields)
    # Dropped in place (rather than via a chained .dropna()) so the returned object
    # is the frame that was read, not a derived selection of it.
    frame.dropna(inplace = True)
    return frame


train_df = _load_complete_rows("data/pre_processed/subsets/25_percent/train_lemma_nopunct_cleaned_sentencized.csv")
valid_df = _load_complete_rows("data/pre_processed/subsets/25_percent/valid_lemma_nopunct_cleaned_sentencized.csv")

In [3]:
def find_antonym(word):
    """Return an antonym of `word` if WordNet provides one, otherwise None.

    The synsets of `word` and their lemmas are scanned in the order WordNet
    returns them; the result is the name of the first antonym of the first
    lemma that has any antonyms.
    """
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites:
                return opposites[0].name()
    return None

In [4]:
# Disabled experiment (the guard is always false): replace a word that directly
# follows a negation word with a WordNet antonym and drop the negation word.
# NOTE(review): uses the same output suffix as the other strategies in this
# notebook, so enabling it writes to the same file names.
if False:
    dfs = [train_df, valid_df]
    df_names = ["train", "valid"]
    suffix = "negations_fliped"
    path = "experimental/"

    for name, df in zip(df_names, dfs):
        print(name)
        t1 = time.time()
        row_count = len(df)
        processed_comments = []

        for row_number, text in enumerate(df['comment_text']):
            # Progress marker every 10000 comments.
            if row_number % 10000 == 0:
                print("{0:.0%}...".format(row_number/row_count), end='')

            tokens = word_tokenize(text)
            # Explicit index walk: the list shrinks whenever a negation word is
            # removed, so the length is re-read on every step. Position 0 can
            # never follow a negation and the last position is never rewritten.
            pos = 1
            while pos < len(tokens):
                if pos < len(tokens) - 1 and tokens[pos - 1] in negation_words:
                    antonym = find_antonym(tokens[pos])
                    if antonym is not None:
                        tokens[pos] = antonym
                        del tokens[pos - 1]
                pos += 1
            processed_comments.append(" ".join(tokens))

        df['comment_text'] = processed_comments
        df.to_csv(path + name + "_" + suffix + ".csv", index = False)


In [5]:
#### Alternative Strategy: Just add NEG_ to every negated word instead of replacing it with the opposite ####
# Disabled experiment (the guard is always false).
if False:
    dfs = [train_df, valid_df]
    df_names = ["train", "valid"]
    suffix = "negations_fliped"
    path = "experimental/"

    for name, df in zip(df_names, dfs):
        print(name)
        t1 = time.time()
        row_count = len(df)
        processed_comments = []

        for row_number, text in enumerate(df['comment_text']):
            # Progress marker every 10000 comments.
            if row_number % 10000 == 0:
                print("{0:.0%}...".format(row_number/row_count), end='')

            tokens = word_tokenize(text)
            # Walk by index and re-read the length each step, because removing a
            # negation word shortens the list while it is being scanned.
            pos = 1
            while pos < len(tokens):
                preceded_by_negation = tokens[pos - 1] in negation_words
                if preceded_by_negation and pos < len(tokens) - 1:
                    # Mark the word first, then drop the negation word before it.
                    tokens[pos] = "NEG_" + tokens[pos]
                    del tokens[pos - 1]
                pos += 1
            processed_comments.append(" ".join(tokens))

        df['comment_text'] = processed_comments
        df.to_csv(path + name + "_" + suffix + ".csv", index = False)


In [6]:
#### Other Alternative Strategy: Add NEG_ to all words after negation until a "." is encountered ####

def _mark_negation_scope(tokens, negations, terminator="."):
    """Return a new token list in which negation scopes are marked with "NEG_".

    A token found in `negations` opens a scope when it is followed by at least
    one token other than `terminator`. The negation token itself is dropped and
    every following token up to (not including) the next `terminator`, or the
    end of the list, is prefixed with "NEG_". Negation words inside an open
    scope are prefixed like any other token. A negation that has nothing to
    negate (last token, or directly followed by `terminator`) is kept as is.

    Examples:
        ["do", "n't", "like", "it", ".", "ok"] -> ["do", "NEG_like", "NEG_it", ".", "ok"]
        ["not", "."]                           -> ["not", "."]

    The input list is not modified.
    """
    marked = []
    in_scope = False
    last = len(tokens) - 1
    for pos, token in enumerate(tokens):
        if token == terminator:
            # A sentence end always closes the current scope.
            in_scope = False
            marked.append(token)
        elif in_scope:
            marked.append("NEG_" + token)
        elif token in negations and pos < last and tokens[pos + 1] != terminator:
            # Open a scope and drop the negation word itself.
            in_scope = True
        else:
            marked.append(token)
    return marked


dfs = [train_df, valid_df]
df_names = ["train", "valid"]
suffix = "negations_fliped"
path = "experimental/"

for i, df in enumerate(dfs):
    print(df_names[i])
    t1 = time.time()  # start time of this split; not read in this cell
    processed_comments = []

    for k, text in enumerate(df['comment_text']):
        # Progress marker every 10000 comments.
        if k % 10000 == 0:
            print("{0:.0%}...".format(k/len(dfs[i])), end='')
        # Building a new list avoids deleting from the token list while it is
        # being scanned, which previously left the directly negated word
        # unmarked and could index past the end when no "." followed.
        tokens = _mark_negation_scope(word_tokenize(text), negation_words)
        processed_comments.append(" ".join(tokens))

    # Overwrites the column on the shared frame, so re-running this cell
    # processes already marked text; reload the data first if re-running.
    df['comment_text'] = processed_comments
    df.to_csv(path + df_names[i]  + "_" + suffix + ".csv", index = False)


train
0%...3%...6%...8%...11%...14%...17%...19%...22%...25%...28%...30%...33%...36%...39%...42%...44%...47%...50%...53%...55%...58%...61%...64%...66%...69%...72%...75%...78%...80%...83%...86%...89%...91%...94%...97%...100%...valid
0%...11%...22%...33%...44%...55%...66%...78%...89%...100%...