In [2]:
import pandas as pd
import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pickle
import preprocessor as p
import numpy as np
import nltk

from textblob import TextBlob

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weetb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
"""
convert dos linefeeds (crlf) to unix (lf)
usage: dos2unix.py
"""
original = "../../data/twitter_data.pkl"
destination = "../../data/twitter_data_fixed.pkl"

content = ''
outsize = 0
with open(original, 'rb') as infile:
    content = infile.read()
with open(destination, 'wb') as output:
    for line in content.splitlines():
        outsize += len(line) + 1
        output.write(line + str.encode('\n'))

print("Done. Saved %s bytes." % (len(content)-outsize))

Done. Saved 112632 bytes.


In [4]:
import string

def load_data():
    filename = "../../data/twitter_data_fixed.pkl"
    print("Loading data from file: " + filename)
    data = pickle.load(open(filename, 'rb'))
    x_text = []
    labels = []
    ids = []
    for i in range(len(data)):
        text = "".join(l for l in data[i]['text'] if l not in string.punctuation)
        x_text.append((data[i]['text']).encode('utf-8'))
        labels.append(data[i]['label'])
    return x_text,labels

In [5]:
x_text, labels_og = load_data()
dict1 = {'racism':1,'sexism':1,'none':0}  # Both are 1 because both are cases of bullying
labels = np.array([dict1[b] for b in labels_og])

Loading data from file: ../../data/twitter_data_fixed.pkl


In [6]:
comments = pd.DataFrame({'comment': x_text, 'attack': labels})
comments['comment'] = comments['comment'].str.decode("utf-8")

In [7]:
comments.head()

Unnamed: 0,comment,attack
0,rt @colonelkickhead: another bloody instant re...,0
1,@azzamalirhabi @jihadia8 this video of the pes...,0
2,oh really ? no more instant restaurants ? th...,0
3,rt @benfrancisallen: it has not been a good fe...,0
4,rt @notofeminism: i don’t need femisnsn becaus...,0


In [8]:
swear_words = "../../swear_words.txt"
word_list = []
with open(swear_words) as f:
    word_list = f.read().splitlines()

In [9]:
def tokens(post):
    return TextBlob(post).words

def get_bad_word_count(post):
    x = tokens(post)
    count = 0
    bad_words = []
    for word in x:
        if word in word_list:
            count+=1
            bad_words.append(word)
    return count, str(bad_words)

comments['bad_word_count'],comments['bad_word_list']= zip(*comments['comment'].map(get_bad_word_count))

In [10]:
comments['length'] = comments['comment'].str.split().str.len()

In [19]:
print("Number of posts: " + str(len(comments)))
print("Classes: " + str(len(pd.unique(labels_og))))
print("Length 95%: " + str(comments.length.quantile(0.95)))
print("Max length: " + str(comments.length.max()))

# this removes all usernames from the vocab count. previous papers also show a list of stopwords but can't find that at the moment
cleaned_comments = comments.comment.str.replace('(\@\w+.*?)',"")

print("Vocabulary: " + str(len(pd.unique(cleaned_comments.str.split(expand=True).stack()))))

Number of posts: 16090
Classes: 3
Length 95%: 27.0
Max length: 38


  cleaned_comments = comments.comment.str.replace('(\@\w+.*?)',"")


Vocabulary: 26028


In [11]:
def data_analysis(comments):
    total = len(comments)
    bully = len(comments[comments['attack']==1])
    swear = len(comments[comments['bad_word_count']>0])
    swear_bully = len(comments[(comments['bad_word_count']>0) & (comments['attack']==1) ])

    P_B = float(bully)/total
    P_S = float(swear)/total
    P_B_S = float(swear_bully)/swear
    P_S_B = float(swear_bully)/bully

    print("P(B): " + str(P_B))
    print("P(S): " + str(P_S))
    print("P(B|S): " + str(P_B_S))
    print("P(S|B): " + str(P_S_B))

In [12]:
data_analysis(comments)

P(B): 0.31410814170292106
P(S): 0.135052827843381
P(B|S): 0.42982052462034054
P(S|B): 0.184804115552038


In [13]:
#new swearing calc
truncated = comments["comment"].apply(lambda x: " ".join(x.split()[:int(comments.length.quantile(0.95))]))
df = pd.concat([truncated, comments.iloc[:,1:]], axis=1)

In [35]:
df["attack"] = df["attack"].astype(bool)

In [15]:
bwc, bwl = zip(*comments["comment"].map(get_bad_word_count))
has_bw = pd.Series(np.array(bwc) > 0, index=df.index)
has_swears = df["comment"].apply(lambda x: any([swear in x for swear in word_list]))
swears = pd.concat([has_bw, has_swears], axis=1, keys=["original", "new"])

In [19]:
np.sum(np.array(bwc) > 0) / len(bwc)
print("Comparison value for swear share: ", has_swears.value_counts(normalize=True).round(2)[True])

Comparison value for swear share:  0.36


In [38]:
pb = df["attack"].value_counts(normalize=True)[True]
ps1 = swears["original"].value_counts(normalize=True)[True]
ps2 = swears["new"].value_counts(normalize=True)[True]

pb_s1 = df["attack"][swears["original"]].value_counts(normalize=True)[True]
pb_s2 = df["attack"][swears["new"]].value_counts(normalize=True)[True]
ps1_b = swears["original"][df["attack"]].value_counts(normalize=True)[True]
ps2_b = swears["new"][df["attack"]].value_counts(normalize=True)[True]

table2 = pd.DataFrame([[pb, ps1, pb_s1, ps1_b],
                    [pb, ps2, pb_s2, ps2_b]],
                   columns = ["P(B)", "P(S)", "P(B|S)", "P(S|B)"],
                   index = ["original", "new"])
print(table2.T.round(3))

        original    new
P(B)       0.314  0.314
P(S)       0.135  0.362
P(B|S)     0.430  0.456
P(S|B)     0.185  0.525
