This notebook trains an LDA topic model on the Wikipedia comments dataset annotated for toxicity. It then uses this model to find the distribution of topics in two other datasets, Waseem and Founta.

# Installations and preparing the environment

In [None]:

!pip install --upgrade gensim  #gensim-3.8.3
!pip install pyLDAvis  #Successfully installed funcy-1.14 pyLDAvis-2.1.2

In [None]:
!pip install tweet-preprocessor  #tweet-preprocessor-0.6.0

In [None]:
import pandas as pd
import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from nltk.stem import WordNetLemmatizer, SnowballStemmer

from gensim.models.coherencemodel import CoherenceModel
from gensim import similarities

import numpy as np
nltk.download('stopwords') 
from nltk.corpus import stopwords

import os.path
import re
import glob
nltk.download('wordnet')

In [None]:
cd "YOUR-Current-Directory" #/content/drive/My Drive/Colab_Notebooks/toxicity/wiki-lda-share/' 

# Reading and Preparing the Wikipedia Dataset

In [None]:
# Load the raw toxicity annotations and the Wikipedia comments; both files are
# tab-separated, and the comments frame is indexed by its first column.
annotations = pd.read_csv('Wiki/toxicity_annotations.tsv', sep='\t')
comments = pd.read_csv(
    'Wiki/toxicity_annotated_comments.tsv',
    sep='\t',
    index_col=0,
)

In [None]:
# Join labels and comments: a comment is labelled toxic when the mean of its
# `toxicity` annotations (grouped by rev_id) is above 0.5.
mean_toxicity = annotations.groupby('rev_id')['toxicity'].mean()
comments['toxicity'] = mean_toxicity > 0.5

In [None]:
# Remove newline and tab tokens, then replace every run of characters other
# than letters, digits and spaces with a single space and lowercase the text.
def _clean_comment(raw):
    """Return `raw` with placeholder tokens and non-alphanumerics replaced by spaces, lowercased."""
    without_tokens = raw.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ")
    return re.sub(r'[^A-Za-z0-9 ]+', ' ', without_tokens).lower()

comments['comment'] = comments['comment'].apply(_clean_comment)

In [None]:
comments.head()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,toxicity
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2232.0,this one can make an analogy in mathematical...,2002,True,article,random,train,False
4216.0,clarification for you and zundark s righ...,2002,True,user,random,train,False
8953.0,elected or electoral jhk,2002,False,article,random,test,False
26547.0,this is such a fun entry devotchka i once...,2002,True,article,random,train,False
28959.0,please relate the ozone hole to increases in c...,2002,True,article,random,test,False


In [None]:
comments.groupby('toxicity').count()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split
toxicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,144324,144324,144324,144324,144324,144324
True,15362,15362,15362,15362,15362,15362


# Training wiki-lda

In [None]:
# Built once and shared by every call: the function is invoked for each token
# of each document, so constructing a new stemmer and lemmatizer per call is
# avoidable overhead.
_STEMMER = SnowballStemmer("english")
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Args:
        text: One token (a string).

    Returns:
        The English Snowball stem of the WordNet verb lemma (pos='v') of `text`.
    """
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize `text` and return its lemmatized, stemmed non-stopword tokens.

    Tokenization is done by gensim's `simple_preprocess`; `deacc=True` asks it
    to strip accent marks from the tokens. Tokens found in gensim's STOPWORDS
    are dropped, and each remaining token goes through `lemmatize_stemming`.
    """
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return [lemmatize_stemming(token) for token in tokens if token not in STOPWORDS]

In [None]:
# Toxicity labels and tokenized documents, both in the row order of `comments`.
texts_toxicity = comments['toxicity'].values.tolist()
lemmatized_data = comments['comment'].apply(preprocess).values.tolist()

# Alias of the tokenized documents, used for the coherence score and statistics.
texts = lemmatized_data

# Dictionary mapping the tokens of the Wikipedia documents to integer ids.
id2word = corpora.Dictionary(lemmatized_data)

# Term-document frequency: one bag-of-words per document.
corpus = list(map(id2word.doc2bow, texts))


In [None]:
# Number of topics; also drives the per-topic reporting loops further down.
num_topics = 20

# Fit the LDA model on the Wikipedia bag-of-words corpus with a fixed random_state.
wiki_lda = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha='asymmetric',
    random_state=100,
)

In [None]:
# Score the trained model with the 'c_v' coherence measure, computed from the
# tokenized Wikipedia documents (`texts`) and their dictionary.
coherencemodel = CoherenceModel(model=wiki_lda, texts=texts, dictionary=id2word, coherence='c_v')
print (coherencemodel.get_coherence())

0.5606252998138611


# Display topics and basic statistics

In [None]:
# Assign every Wikipedia comment to its single most probable topic, then print
# the size and toxicity statistics of each topic.
number_of_toxic = len([item for item in texts_toxicity if item==True])
topics = {}           # topic id -> positions in `corpus` of the documents assigned to it
topics_list = []      # most probable topic id of each document, in corpus order
topic_prob_list = []  # probability of that topic, in corpus order
for text_id, bow in enumerate(corpus):
  # Take the (topic id, probability) pair with the highest probability. A stable
  # sort followed by [-1] keeps the same tie-breaking as sorting in place.
  topic_index, topic_prob = sorted(wiki_lda[bow], key=lambda pair: pair[1])[-1]
  topics_list.append(topic_index)
  topic_prob_list.append(topic_prob)
  topics.setdefault(topic_index, []).append(text_id)

for idx in range(num_topics):
  # A topic that is never the most probable one has no entry in `topics`;
  # treat it as empty instead of raising KeyError.
  doc_ids = topics.get(idx, [])
  print('Topic: {} \nWords: {}'.format(idx, wiki_lda.print_topic(idx)))
  print('%d documents  - %.3f of all the documnets' % (len(doc_ids), len(doc_ids)/len(texts)))
  if not doc_ids:
    # No documents: the toxic share is undefined (it would divide by zero).
    continue
  positives = [v for v in doc_ids if texts_toxicity[v] ]
  # NOTE: both printed values are fractions between 0 and 1, not percentages.
  print('%.2f percent toxic and %.2f of all the toxics' % (len(positives)/len(doc_ids), len(positives)/number_of_toxic))

Topic: 0 
Words: 0.025*"know" + 0.019*"thank" + 0.019*"like" + 0.015*"think" + 0.014*"want" + 0.013*"look" + 0.013*"ll" + 0.012*"ve" + 0.012*"hi" + 0.012*"time"
31411 documents  - 0.197 of all the documnets
0.18 percent toxic and 0.37 of all the toxics
Topic: 1 
Words: 0.012*"time" + 0.010*"like" + 0.009*"peopl" + 0.009*"think" + 0.007*"year" + 0.006*"life" + 0.006*"day" + 0.006*"right" + 0.006*"drink" + 0.006*"million"
9306 documents  - 0.058 of all the documnets
0.17 percent toxic and 0.10 of all the toxics
Topic: 2 
Words: 0.019*"suck" + 0.015*"year" + 0.010*"citi" + 0.009*"new" + 0.009*"school" + 0.008*"cock" + 0.008*"old" + 0.008*"pussi" + 0.007*"dick" + 0.007*"women"
6282 documents  - 0.039 of all the documnets
0.18 percent toxic and 0.07 of all the toxics
Topic: 3 
Words: 0.047*"redirect" + 0.039*"talk" + 0.036*"utc" + 0.035*"categori" + 0.031*"film" + 0.016*"episod" + 0.013*"merg" + 0.012*"articl" + 0.011*"octob" + 0.011*"charact"
3175 documents  - 0.020 of all the documnets
0.

# Write the dataset with assigned topics and probabilities to a csv file

In [None]:
# Attach each comment's most probable topic and that topic's probability.
# Both lists were filled in corpus order, which is the row order of `comments`.
# NOTE(review): `topics_list` and `topic_prob_list` are reassigned further down
# for the Founta and Waseem datasets, so this cell only gives correct results
# when it runs before those cells (and is not re-run after them).
comments['wiki_topic'] = topics_list
comments['wiki_topic_prob'] = topic_prob_list

In [None]:
comments.to_csv('wiki_lda_topics_lda_probabilities.csv')

In [None]:
comments.head()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,toxicity,wiki_topic,wiki_topic_prob
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2232.0,this one can make an analogy in mathematical...,2002,True,article,random,train,False,15,0.64184
4216.0,clarification for you and zundark s righ...,2002,True,user,random,train,False,15,0.299566
8953.0,elected or electoral jhk,2002,False,article,random,test,False,13,0.677198
26547.0,this is such a fun entry devotchka i once...,2002,True,article,random,train,False,7,0.314508
28959.0,please relate the ozone hole to increases in c...,2002,True,article,random,test,False,15,0.348049


# Topic Categories

In [None]:
# Manual grouping of the 20 topic ids (0-19) into three categories; every topic
# id appears in exactly one of the lists.
# NOTE(review): what categories 1, 2 and 3 stand for is not stated in this
# notebook -- please document it here.
topic_categories={1:[0,1],
                  2:[2,7,8,9,12,14,16],
                  3:[3,4,5,6,10,11,13,15,17,18,19]}

In [None]:
# Fraction of all Wikipedia comments whose assigned topic belongs to category 3.
len(comments[comments['wiki_topic'].isin(topic_categories[3]) ])/len(comments)

0.5446939619002292

In [None]:
# Fraction of the toxic Wikipedia comments whose assigned topic belongs to category 3.
len(comments[comments['wiki_topic'].isin(topic_categories[3]) & comments['toxicity']])/len(comments[comments['toxicity']])

0.17881786225751856

In [None]:
# `topics[9][5]` is a row position in the full corpus, which was built from
# every row of `comments`. Index the unfiltered frame positionally: filtering
# to the train split first shifts the positions, so a different comment than
# the one stored in `corpus[topics[9][5]]` would be printed.
print(comments['comment'].iloc[topics[9][5]])


In [None]:
wiki_lda[corpus[topics[9][5]]]

# Reading Founta Dataset and labeling topics


In [None]:
Founta_df = pd.read_excel('Founta/hatespeech_text_label_vote.xlsx',header=None)

In [None]:
# The sheet is read without a header row, so give its three columns names.
Founta_df = Founta_df.rename(columns={0: 'comment', 1: 'label', 2: 'vote'})

In [None]:
# Drop the tweets labelled 'spam'. Take an explicit copy so that the column
# assignments made in later cells write to an independent frame instead of a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
Founta_df = Founta_df[Founta_df['label'] != 'spam'].copy()

In [None]:
# Binary label: 0 for 'normal', 1 for every other label.
# Note this is an integer 0/1 column, whereas the `toxicity` columns of the
# Wikipedia and Waseem frames hold booleans.
Founta_df['toxicity'] = Founta_df['label'].apply(lambda x: 0 if x=='normal' else 1)

In [None]:
# Tokenize, lemmatize and stem the Founta tweets with the same `preprocess`
# function used for the Wikipedia comments.
# NOTE(review): unlike the Waseem tweets, these texts are not passed through
# tweet-preprocessor or the non-alphanumeric clean-up first -- confirm that
# this difference is intended.
lemmatized_data_Founta = Founta_df['comment'].apply(preprocess).values.tolist()


In [None]:
# Toxicity labels (0/1) in the same row order as the tokenized tweets.
texts_toxicity_Founta = Founta_df['toxicity'].values.tolist()


# Term Document Frequency
# Bag-of-words of each tweet, encoded with the dictionary built from the
# Wikipedia corpus (`id2word`).
corpus_Founta = [id2word.doc2bow(text) for text in lemmatized_data_Founta]

In [None]:
# Assign every Founta tweet to its most probable topic under the Wikipedia LDA
# model, then print the size and toxic share of each topic.
topic_prob_list = []  # probability of the assigned topic, in corpus order
topics_list = []      # assigned topic id of each tweet, in corpus order
topics_Founta = {}    # topic id -> positions in `corpus_Founta` of its tweets
for text_id, bow in enumerate(corpus_Founta):
  # TODO : check sanity
  # Highest-probability (topic id, probability) pair; stable sort + [-1] keeps
  # the same tie-breaking as sorting the list in place.
  topic_index, topic_prob = sorted(wiki_lda[bow], key=lambda pair: pair[1])[-1]
  topics_list.append(topic_index)
  topic_prob_list.append(topic_prob)
  topics_Founta.setdefault(topic_index, []).append(text_id)

for idx in range(num_topics):
  # A topic that no tweet is assigned to has no entry in `topics_Founta`;
  # treat it as empty instead of raising KeyError.
  doc_ids = topics_Founta.get(idx, [])
  print('Topic: {} '.format(idx))
  print('%d documents  - %.3f of all the documnets' % (len(doc_ids), len(doc_ids)/len(lemmatized_data_Founta)))
  if not doc_ids:
    # No tweets: the toxic share is undefined (it would divide by zero).
    continue
  toxic = [v for v in doc_ids if texts_toxicity_Founta[v]==1 ]
  # NOTE: the printed value is a fraction between 0 and 1, not a percentage.
  print('%.2f percent toxic' % (len(toxic)/len(doc_ids)))

Topic: 0 
28363 documents  - 0.330 of all the documnets
0.45 percent toxic
Topic: 1 
17557 documents  - 0.204 of all the documnets
0.32 percent toxic
Topic: 2 
10698 documents  - 0.124 of all the documnets
0.26 percent toxic
Topic: 3 
770 documents  - 0.009 of all the documnets
0.34 percent toxic
Topic: 4 
1172 documents  - 0.014 of all the documnets
0.18 percent toxic
Topic: 5 
1422 documents  - 0.017 of all the documnets
0.20 percent toxic
Topic: 6 
3447 documents  - 0.040 of all the documnets
0.20 percent toxic
Topic: 7 
2673 documents  - 0.031 of all the documnets
0.30 percent toxic
Topic: 8 
883 documents  - 0.010 of all the documnets
0.57 percent toxic
Topic: 9 
1368 documents  - 0.016 of all the documnets
0.42 percent toxic
Topic: 10 
252 documents  - 0.003 of all the documnets
0.27 percent toxic
Topic: 11 
1408 documents  - 0.016 of all the documnets
0.25 percent toxic
Topic: 12 
5400 documents  - 0.063 of all the documnets
0.41 percent toxic
Topic: 13 
1344 documents  - 0.016 

In [None]:
Founta_df['wiki_topic'] = topics_list
Founta_df['wiki_topic_prob'] = topic_prob_list

In [None]:
Founta_df.to_csv('Founta_wiki_lda_topics_lda_probabilities.csv')

# Reading Waseem Dataset and Labeling Topics

In [None]:
sexism = pd.read_json('Waseem/sexism.json',lines=True)
racism = pd.read_json('Waseem/racism.json',lines=True)
neither = pd.read_json('Waseem/neither.json',lines=True)


In [None]:
# Stack the three annotation files into one frame. reset_index() keeps each
# row's previous index as an 'index' column, which is selected again when the
# results are saved.
frames = [sexism, racism,neither]
waseem_df = pd.concat(frames).reset_index()

In [None]:
waseem_df.head()

In [None]:
import preprocessor as p


def _clean_tweet(raw):
    """Clean one tweet with tweet-preprocessor, then blank out non-alphanumerics and lowercase."""
    cleaned = p.clean(str(raw))
    return re.sub(r'[^A-Za-z0-9 ]+', ' ', cleaned).lower()


waseem_df['comment'] = waseem_df['text'].apply(_clean_tweet)
# Every annotation other than 'none' counts as toxic.
waseem_df['toxicity'] = waseem_df['Annotation'] != 'none'


In [None]:
# Tokenize, lemmatize and stem the cleaned tweets.
# NOTE: despite its name, `texts_toxicity_waseem` holds the raw 'Annotation'
# strings (sexism / racism / none), not the boolean `toxicity` column.
lemmatized_data_waseem = waseem_df['comment'].apply(preprocess).values.tolist()
texts_toxicity_waseem = waseem_df['Annotation'].values.tolist()


# Term Document Frequency
# Bag-of-words of each tweet, encoded with the dictionary built from the
# Wikipedia corpus (`id2word`).
corpus_waseem = [id2word.doc2bow(text) for text in lemmatized_data_waseem]

In [None]:
#number_of_toxic_waseem = len([item for item in texts_toxicity_waseem if item ])

In [None]:
list(set(texts_toxicity_waseem))

['sexism', 'racism', 'none']

In [None]:
# Assign every Waseem tweet to its most probable topic under the Wikipedia LDA
# model, then print the size and the racist / sexist shares of each topic.
topic_prob_list = []  # probability of the assigned topic, in corpus order
topics_list = []      # assigned topic id of each tweet, in corpus order
topics_waseem = {}    # topic id -> positions in `corpus_waseem` of its tweets
for text_id, bow in enumerate(corpus_waseem):
  # TODO : check sanity
  # Highest-probability (topic id, probability) pair; stable sort + [-1] keeps
  # the same tie-breaking as sorting the list in place.
  topic_index, topic_prob = sorted(wiki_lda[bow], key=lambda pair: pair[1])[-1]
  topics_list.append(topic_index)
  topic_prob_list.append(topic_prob)
  topics_waseem.setdefault(topic_index, []).append(text_id)

for idx in range(num_topics):
  # A topic that no tweet is assigned to has no entry in `topics_waseem`;
  # treat it as empty instead of raising KeyError.
  doc_ids = topics_waseem.get(idx, [])
  print('Topic: {} '.format(idx))
  print('%d documents  - %.3f of all the documnets' % (len(doc_ids), len(doc_ids)/len(lemmatized_data_waseem)))
  if not doc_ids:
    # No tweets: the shares are undefined (they would divide by zero).
    continue
  # Named *_ids so the `racism` / `sexism` DataFrames loaded from the JSON
  # files are not overwritten (re-running the concat cell relies on them).
  racist_ids = [v for v in doc_ids if texts_toxicity_waseem[v]=='racism' ]
  sexist_ids = [v for v in doc_ids if texts_toxicity_waseem[v]=='sexism' ]
  # NOTE: the printed values are fractions between 0 and 1, not percentages.
  print('%.2f percent racist and %.2f percent sexist' % (len(racist_ids)/len(doc_ids), len(sexist_ids)/len(doc_ids)))

Topic: 0 
5618 documents  - 0.332 of all the documnets
0.03 percent racist and 0.20 percent sexist
Topic: 1 
2985 documents  - 0.177 of all the documnets
0.10 percent racist and 0.20 percent sexist
Topic: 2 
1682 documents  - 0.099 of all the documnets
0.11 percent racist and 0.35 percent sexist
Topic: 3 
104 documents  - 0.006 of all the documnets
0.01 percent racist and 0.31 percent sexist
Topic: 4 
291 documents  - 0.017 of all the documnets
0.05 percent racist and 0.26 percent sexist
Topic: 5 
411 documents  - 0.024 of all the documnets
0.11 percent racist and 0.18 percent sexist
Topic: 6 
415 documents  - 0.025 of all the documnets
0.03 percent racist and 0.18 percent sexist
Topic: 7 
641 documents  - 0.038 of all the documnets
0.26 percent racist and 0.13 percent sexist
Topic: 8 
1060 documents  - 0.063 of all the documnets
0.61 percent racist and 0.02 percent sexist
Topic: 9 
257 documents  - 0.015 of all the documnets
0.29 percent racist and 0.26 percent sexist
Topic: 10 
68 do

In [None]:
waseem_df['wiki_topic'] = topics_list
waseem_df['wiki_topic_prob'] = topic_prob_list

In [None]:
waseem_df.sample(n = 20)

In [None]:
waseem_df_to_save = waseem_df[['index', 'Annotation', 'comment', 'toxicity', 'wiki_topic', 'wiki_topic_prob']]

In [None]:
waseem_df_to_save.sample(n = 5)

Unnamed: 0,index,Annotation,comment,toxicity,wiki_topic,wiki_topic_prob
1337,1337,sexism,appears to refer to the idea that denial of vi...,True,5,0.518908
11624,6218,none,aw gamergate harassment squad did i say some...,False,0,0.425006
15860,10454,none,their report is the very definition of cherry ...,False,5,0.347926
4022,592,racism,so the only way the problems of islam can be f...,True,0,0.432325
9037,3631,none,host colin fassnidge in his element at,False,1,0.582365


In [None]:
waseem_df_to_save.to_csv('waseem_wiki_lda_topics_lda_probabilities.csv')