In [4]:
import pandas as pd 
import numpy as np
import re
import time
import nltk
from nltk.tokenize import word_tokenize
import pickle
from nltk.corpus import wordnet
import spacy
from spacy.lang.en import English
from spacy import displacy

# Load spaCy's small English pipeline once; later cells call nlp.pipe(...) on it.
nlp = spacy.load("en_core_web_sm")
# Tokens treated as negation cues. "n't" is the piece word_tokenize splits off
# contractions such as "isn't"; "isnt"/"wasnt"/"aint" cover apostrophe-free text.
negation_words = ["no", "not", "n't","isnt", "wasnt", "nor", "none", "aint"]
# Words that may sit between a negation cue and the word it negates
# ("not a idiot"); they are stepped over rather than marked.
# "-pron-" is presumably the lower-cased spaCy pronoun lemma placeholder -- confirm
# against the pre-processing step. It must not carry surrounding whitespace:
# tokens produced by word_tokenize never contain spaces, so "-pron- " could never match.
filler = ["be", "is", "a", "an", "-pron-"]

In [5]:
# Columns to pull from the pre-processed CSV files; everything else is skipped.
fields = ['comment_text', 'toxicity','target']

# Locations of the train / validation splits.
train_csv = "data/pre_processed/train_lemma_nopunct_cleaned_sentencized.csv"
valid_csv = "data/pre_processed/valid_lemma_nopunct_cleaned_sentencized.csv"

# Read both splits, restricted to `fields`.
train_df = pd.read_csv(train_csv, usecols=fields)
valid_df = pd.read_csv(valid_csv, usecols=fields)

In [8]:
def flip_negations(tokens, negation_words, filler):
    """Fold each negation cue into a ``NEG_`` prefix on the word it negates.

    ``tokens`` is scanned left to right. When a token is in ``negation_words``:

    * if the next token is not in ``filler``, that token gets the ``NEG_``
      prefix and the cue itself is dropped ("not good" -> "NEG_good");
    * if the next token is a filler word, the filler is kept unchanged and the
      token after it gets the prefix ("not a idiot" -> "a NEG_idiot"); only a
      single filler word is stepped over;
    * if no token is left to carry the prefix, the cue is kept as-is.

    Args:
        tokens: list of token strings for one comment.
        negation_words: collection of tokens treated as negation cues.
        filler: collection of tokens that may sit between cue and target.

    Returns:
        A new list of tokens; ``tokens`` itself is not modified.
    """
    flipped = []
    count = len(tokens)
    pos = 0
    while pos < count:
        token = tokens[pos]
        if token in negation_words:
            target = pos + 1
            # Step over at most one filler word between the cue and its target.
            if target < count and tokens[target] in filler:
                target += 1
            # The target may be the very last token of the comment; the input has
            # no trailing punctuation, so the final word must be eligible too.
            if target < count:
                flipped.extend(tokens[pos + 1:target])  # the skipped filler, if any
                flipped.append("NEG_" + tokens[target])
                pos = target + 1
                continue
        flipped.append(token)
        pos += 1
    return flipped


dfs = [train_df, valid_df]
df_names = ["train", "valid"]
suffix = "negations_fliped"
path = "experimental/"

for i, df in enumerate(dfs):
    print(df_names[i])
    processed_comments = []

    for k, text in enumerate(df['comment_text']):
        # Coarse progress indicator: one tick every 10,000 comments.
        if k % 10000 == 0:
            print("{0:.0%}...".format(k/len(df)), end='')
        # Empty CSV cells are read back as NaN (a float); treat them as empty text
        # instead of letting word_tokenize raise on a non-string.
        tokens = word_tokenize(text) if isinstance(text, str) else []
        processed_comments.append(" ".join(flip_negations(tokens, negation_words, filler)))

    # NOTE: this overwrites the column in train_df / valid_df themselves, so later
    # cells see the flipped text.
    df['comment_text'] = processed_comments
    df.to_csv(path + df_names[i]  + "_" + suffix + ".csv", index = False)


train
0%...1%...1%...2%...3%...3%...4%...5%...6%...6%...7%...8%...8%...9%...10%...10%...11%...12%...12%...13%...14%...15%...15%...16%...17%...17%...18%...19%...19%...20%...21%...21%...22%...23%...24%...24%...25%...26%...26%...27%...28%...28%...29%...30%...30%...31%...32%...33%...33%...34%...35%...35%...36%...37%...37%...38%...39%...39%...40%...41%...42%...42%...43%...44%...44%...45%...46%...46%...47%...48%...48%...49%...50%...51%...51%...52%...53%...53%...54%...55%...55%...56%...57%...57%...58%...59%...60%...60%...61%...62%...62%...63%...64%...64%...65%...66%...66%...67%...68%...69%...69%...70%...71%...71%...72%...73%...73%...74%...75%...76%...76%...77%...78%...78%...79%...80%...80%...81%...82%...82%...83%...84%...85%...85%...86%...87%...87%...88%...89%...89%...90%...91%...91%...92%...93%...94%...94%...95%...96%...96%...97%...98%...98%...99%...100%...valid
0%...3%...6%...8%...11%...14%...17%...19%...22%...25%...28%...30%...33%...36%...39%...42%...44%...47%...50%...53%...55%...58%...61%

In [6]:
#### Alternative Strategy: Add NEG_ to all words after negation until a "." is encountered ####
# Disabled by the always-false guard. Enable it INSTEAD of the single-word
# strategy: both write to the same "<name>_negations_fliped.csv" files.
if (1 == 0):
    dfs = [train_df, valid_df]
    df_names = ["train", "valid"]
    suffix = "negations_fliped"
    path = "experimental/"

    for i, df in enumerate(dfs):
        print(df_names[i])
        processed_comments = []

        # nlp.pipe needs strings; empty CSV cells come back as NaN, so blank them first.
        pipe = nlp.pipe(df['comment_text'].fillna(""), batch_size = 512 , disable = ["ner", "tagger"])
        for k, doc in enumerate(pipe):
            # Coarse progress indicator: one tick every 10,000 comments.
            if k % 10000 == 0:
                print("{0:.0%}...".format(k/len(df)), end='')

            tokens = []
            in_negation_scope = False
            for tok in doc:
                word = tok.text
                if word == ".":
                    # A full stop closes the current negation scope.
                    in_negation_scope = False
                    tokens.append(word)
                elif word in negation_words:
                    # The cue itself is kept unmarked and opens (or extends) a scope.
                    # NOTE(review): keeping the cue is an assumption; drop it here if
                    # this strategy should mirror the single-word one.
                    in_negation_scope = True
                    tokens.append(word)
                elif in_negation_scope:
                    tokens.append("NEG_" + word)
                else:
                    tokens.append(word)

            processed_comments.append(" ".join(tokens))

        df['comment_text'] = processed_comments
        df.to_csv(path + df_names[i]  + "_" + suffix + ".csv", index = False)


train
0%...3%...6%...8%...11%...14%...17%...19%...22%...25%...28%...30%...33%...36%...39%...42%...44%...47%...50%...53%...55%...58%...61%...64%...66%...69%...72%...75%...78%...80%...83%...86%...89%...91%...94%...97%...100%...valid
0%...11%...22%...33%...44%...55%...66%...78%...89%...100%...

In [None]:
#For experimenting with replacing a negated word by its antonym ("not good" -> "bad") instead of just adding NEG_

def find_antonym(word):
    """Return the first WordNet antonym of ``word``, or None if there is none.

    Synsets and their lemmas are visited in the order WordNet returns them; the
    name of the first antonym of the first lemma that has any is returned.

    Args:
        word: the word to look up in WordNet.

    Returns:
        The antonym's lemma name as a string, or None when no lemma of any
        synset of ``word`` has an antonym.
    """
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites:
                # Stop at the first hit; later synsets cannot change the result.
                return opposites[0].name()
    return None
    
# Disabled by the always-false guard. When enabled, every "<negation cue> <word>"
# pair whose word has a WordNet antonym is replaced by that antonym; pairs without
# an antonym are left untouched.
if (1 == 0):
    dfs = [train_df, valid_df]
    df_names = ["train", "valid"]
    suffix = "negations_fliped"
    path = "experimental/"

    for i, df in enumerate(dfs):
        print(df_names[i])
        processed_comments = []

        for k, text in enumerate(df['comment_text']):
            # Coarse progress indicator: one tick every 10,000 comments.
            if k % 10000 == 0:
                print("{0:.0%}...".format(k/len(df)), end='')
            # Empty CSV cells are read back as NaN (a float); treat them as empty text.
            tokens = word_tokenize(text) if isinstance(text, str) else []

            # Build a new list instead of deleting from `tokens` while iterating it.
            replaced = []
            pos = 0
            while pos < len(tokens):
                antonym = None
                # The negated word may be the last token of the comment.
                if tokens[pos] in negation_words and pos + 1 < len(tokens):
                    antonym = find_antonym(tokens[pos + 1])
                if antonym is not None:
                    # The antonym stands in for both the cue and the negated word.
                    replaced.append(antonym)
                    pos += 2
                else:
                    replaced.append(tokens[pos])
                    pos += 1
            processed_comments.append(" ".join(replaced))

        df['comment_text'] = processed_comments
        df.to_csv(path + df_names[i]  + "_" + suffix + ".csv", index = False)


In [None]:
#Tests with dependency parser
# For each sample sentence, find tokens whose dependency label is 'neg' and print
# the sentence with NEG_ prefixed to the head of each such token.

samples = [
    "He is a complete idiot",
    "Do you not think you are an idiot?",
    "Do not be mean",
    "He is not an idiot, actually he is very smart!",
]
pipe = nlp.pipe(samples, batch_size = 512 , disable = ["ner", "tagger"])
for doc in pipe:
    negation_tokens = [tok for tok in doc if tok.dep_ == 'neg']
    negation_head_tokens = [token.head for token in negation_tokens]

    print(" ".join([("NEG_" + tok.text) if (tok in negation_head_tokens) else tok.text for tok in doc]))

# Visualise one parse inline. displacy.serve would start a blocking web server
# inside the kernel; displacy.render with jupyter=True draws into the cell output.
doc = nlp("He is not an idiot")
options = {'compact': True, 'color': 'black', 'font': 'Arial'}
displacy.render(doc, style='dep', options=options, jupyter=True)