In [271]:
import nltk
import numpy as np
import pandas as pd
import os 
import sys

In [272]:
# Uncomment the command below to upgrade NLTK in this notebook kernel's Python environment (note: it does not install twython)
#!{sys.executable} -m pip install nltk -U --user

In [273]:
'''
Authors: Jacob Enoch, Austin John
Class: CMSC473 - Natural Language Processing
Project: Discord Moderation via RNN Model
'''

'\nAuthors: Jacob Enoch, Austin John\nClass: CMSC473 - Natural Language Processing\nProject: Discord Moderation via RNN Model\n'

In [274]:
# Read the tab-separated annotated-comments file into a DataFrame.
toxicity_text_df = pd.read_csv(
    'Wikipedia Toxicity Data/toxicity_annotated_comments.tsv',
    sep="\t",
)

In [275]:
# Display the comments DataFrame as this cell's output.
toxicity_text_df

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,2232.0,This:NEWLINE_TOKEN:One can make an analogy in ...,2002,True,article,random,train
1,4216.0,`NEWLINE_TOKENNEWLINE_TOKEN:Clarification for ...,2002,True,user,random,train
2,8953.0,Elected or Electoral? JHK,2002,False,article,random,test
3,26547.0,`This is such a fun entry. DevotchkaNEWLINE_...,2002,True,article,random,train
4,28959.0,Please relate the ozone hole to increases in c...,2002,True,article,random,test
...,...,...,...,...,...,...,...
159681,699848324.0,`NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENThese ...,2016,True,article,blocked,train
159682,699851288.0,NEWLINE_TOKENNEWLINE_TOKENThe Institute for Hi...,2016,True,article,blocked,test
159683,699857133.0,NEWLINE_TOKEN:The way you're trying to describ...,2016,True,article,blocked,dev
159684,699891012.0,NEWLINE_TOKENNEWLINE_TOKEN== Warning ==NEWLINE...,2016,True,user,blocked,train


In [276]:
# Replace every NEWLINE_TOKEN and TAB_TOKEN occurrence with a single space.
# The vectorized .str accessor leaves missing comments as NaN instead of raising,
# and regex=False treats the tokens as literal substrings.
toxicity_text_df['comment'] = (
    toxicity_text_df['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

In [277]:
# Display the comments DataFrame again so the token replacement can be inspected.
toxicity_text_df

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,2232.0,This: :One can make an analogy in mathematical...,2002,True,article,random,train
1,4216.0,` :Clarification for you (and Zundark's righ...,2002,True,user,random,train
2,8953.0,Elected or Electoral? JHK,2002,False,article,random,test
3,26547.0,`This is such a fun entry. Devotchka I once...,2002,True,article,random,train
4,28959.0,Please relate the ozone hole to increases in c...,2002,True,article,random,test
...,...,...,...,...,...,...,...
159681,699848324.0,` These sources don't exactly exude a sense ...,2016,True,article,blocked,train
159682,699851288.0,The Institute for Historical Review is a pee...,2016,True,article,blocked,test
159683,699857133.0,:The way you're trying to describe it in this...,2016,True,article,blocked,dev
159684,699891012.0,== Warning == There is clearly a protection...,2016,True,user,blocked,train


In [278]:
# Read the tab-separated per-worker annotations file into a DataFrame.
toxicity_text_annotations_df = pd.read_csv(
    'Wikipedia Toxicity Data/toxicity_annotations.tsv',
    sep="\t",
)

In [279]:
# Display the annotations DataFrame as this cell's output.
toxicity_text_annotations_df

Unnamed: 0,rev_id,worker_id,toxicity,toxicity_score
0,2232.0,723,0,0.0
1,2232.0,4000,0,0.0
2,2232.0,3989,0,1.0
3,2232.0,3341,0,0.0
4,2232.0,1574,0,1.0
...,...,...,...,...
1598284,699897151.0,1550,0,0.0
1598285,699897151.0,1025,0,1.0
1598286,699897151.0,648,0,1.0
1598287,699897151.0,379,0,0.0


In [280]:
# A comment (rev_id) is labelled toxic when the mean of its annotators'
# `toxicity` votes is at least 0.5, i.e. at least half of them marked it toxic.
# The result is a boolean Series indexed by rev_id.
labels = (
    toxicity_text_annotations_df
    .groupby('rev_id')['toxicity']
    .mean()
    .ge(0.5)
)

In [281]:
# Look up each comment's label by its rev_id instead of relying on row order:
# `labels` is indexed by rev_id (it is a groupby result), so a positional
# assignment would silently mislabel rows if the comments table were not sorted
# by rev_id in exactly the same way.
# In current pandas, .loc raises KeyError when a comment's rev_id has no annotations.
toxicity_text_df['toxicity'] = labels.loc[toxicity_text_df['rev_id']].to_numpy()
toxicity_text_df

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,toxicity
0,2232.0,This: :One can make an analogy in mathematical...,2002,True,article,random,train,False
1,4216.0,` :Clarification for you (and Zundark's righ...,2002,True,user,random,train,False
2,8953.0,Elected or Electoral? JHK,2002,False,article,random,test,False
3,26547.0,`This is such a fun entry. Devotchka I once...,2002,True,article,random,train,False
4,28959.0,Please relate the ozone hole to increases in c...,2002,True,article,random,test,False
...,...,...,...,...,...,...,...,...
159681,699848324.0,` These sources don't exactly exude a sense ...,2016,True,article,blocked,train,False
159682,699851288.0,The Institute for Historical Review is a pee...,2016,True,article,blocked,test,False
159683,699857133.0,:The way you're trying to describe it in this...,2016,True,article,blocked,dev,False
159684,699891012.0,== Warning == There is clearly a protection...,2016,True,user,blocked,train,False


In [282]:
# Converts labels from True/False to 1/0 respectively - use with df.apply on a specific column
def clean_labels(val):
    """Map a boolean label to an integer flag.

    Returns 0 when ``val`` compares equal to ``False`` and 1 for every other
    value. Because this is an equality test rather than a truthiness test,
    values such as ``None`` or ``""`` map to 1.
    """
    return 0 if val == False else 1

In [284]:
# Convert the boolean toxicity column to 0/1 integers element by element,
# then display the updated frame.
toxicity_text_df['toxicity'] = toxicity_text_df['toxicity'].map(clean_labels)
toxicity_text_df

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,toxicity
0,2232.0,This: :One can make an analogy in mathematical...,2002,True,article,random,train,0
1,4216.0,` :Clarification for you (and Zundark's righ...,2002,True,user,random,train,0
2,8953.0,Elected or Electoral? JHK,2002,False,article,random,test,0
3,26547.0,`This is such a fun entry. Devotchka I once...,2002,True,article,random,train,0
4,28959.0,Please relate the ozone hole to increases in c...,2002,True,article,random,test,0
...,...,...,...,...,...,...,...,...
159681,699848324.0,` These sources don't exactly exude a sense ...,2016,True,article,blocked,train,0
159682,699851288.0,The Institute for Historical Review is a pee...,2016,True,article,blocked,test,0
159683,699857133.0,:The way you're trying to describe it in this...,2016,True,article,blocked,dev,0
159684,699891012.0,== Warning == There is clearly a protection...,2016,True,user,blocked,train,0
