# Data Source - Wikipedia Talk Labels: Personal Attacks

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer
import config as cfg

In [2]:
# Comment texts: the file's first column is used as the DataFrame index.
comments = pd.read_csv(
    cfg.RESOURCE.wiki_talk_personal_attack_comments,
    sep='\t',
    index_col=0,
)
# Annotator judgements, later grouped by `rev_id`.
annotations = pd.read_csv(
    cfg.RESOURCE.wiki_talk_personal_attack_annotations,
    sep='\t',
)
# A revision counts as an attack when the mean of its `attack` annotations
# is strictly greater than 0.5 (an exact 0.5 tie is labelled False).
labels = annotations.groupby('rev_id')['attack'].mean().gt(0.5)
labels

rev_id
37675        False
44816        False
49851        False
89320        False
93890        False
             ...  
699848324    False
699851288    False
699857133    False
699891012    False
699897151    False
Name: attack, Length: 115864, dtype: bool

In [3]:
# Attach the per-revision label to each comment. pandas aligns this assignment
# on the index, so it assumes `comments` is indexed by rev_id -- TODO confirm.
comments['attack'] = labels

# Replace the dataset's placeholder tokens with a single space. The vectorised
# string accessor with regex=False does literal substitution, matching
# str.replace, and the two replacements stay sequential as in the original.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

# Keep only rows whose `split` value is one of the three named splits.
all_comments_df = comments[comments['split'].isin(['train', 'dev', 'test'])]
all_comments = all_comments_df['comment'].values
all_labels = all_comments_df['attack'].values

In [4]:
# TF-IDF features over bigrams only (ngram_range=(2, 2)), English stop words
# removed and accents stripped. The vocabulary is capped at 10000 features and
# restricted by document frequency (min_df=3, max_df=0.9).
# The boolean options are passed as real booleans: recent scikit-learn releases
# validate parameter types and no longer accept ints such as 1 here.
vec = TfidfVectorizer(
    ngram_range=(2, 2),
    tokenizer=None,
    max_features=10000,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)
all_comments_tf_idf = vec.fit_transform(all_comments)

## Mutual information score ranking (personal attack) (top 100)

In [5]:
# Mutual information between each TF-IDF feature and the attack label, with the
# boolean labels cast to 0/1 integers. A fixed random_state is passed so
# repeated runs use the same seed.
# NOTE(review): if the TF-IDF matrix is sparse, scikit-learn's default
# discrete_features='auto' presumably treats the features as discrete, in which
# case n_neighbors would have no effect -- confirm against the docs.
mi = mutual_info_classif(
    all_comments_tf_idf,
    all_labels.astype(int),
    n_neighbors=3,
    random_state=2020,
)

In [6]:
# Indices of the 100 features with the highest mutual information, best first.
top_mi = np.argsort(mi)[::-1][:100]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method so the cell
# runs on both old and new releases. dtype=str keeps a unicode string array
# whichever method supplied the names.
features = np.array(
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names(),
    dtype=str,
)
features[top_mi]

array(['talk page', 'don know', 'piece shit', 'people like',
       'personal attacks', 'don think', 'http www', 'personal attack',
       'fuck fuck', 'wikipedia org', 'user page', 'en wikipedia',
       'don care', 'don like', 'talk pages', 'don want', 'ip address',
       'feel free', 'good faith', 'http en', 'blocked editing',
       'editing wikipedia', 'suck dick', 'org wiki', 'just like',
       'reliable sources', 'background color', 'son bitch', 'point view',
       'let know', 'year old', 'border 1px', '1px solid', 'looks like',
       'vertical align', 'style vertical', 'shut fuck', 'comment added',
       'unsigned comment', 'edit page', 'preceding unsigned',
       'edit summary', 'reliable source', 'discussion page',
       'don understand', 'user talk', 'style background', 'font size',
       'align middle', 'look like', 'make sure', 'color fdffe7',
       'mother fucker', 'fuck wikipedia', 'speedy deletion', 'waste time',
       'style font', 'wikipedia articles', 'welc