In [65]:
import numpy as np
import pandas as pd
import pickle
from urllib.parse import urlparse
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from imblearn.over_sampling import RandomOverSampler

In [66]:
import string

def load_data(filename="../../data/formspring_data_fixed.pkl"):
    """Load the pickled Formspring records and split them into texts and labels.

    Parameters
    ----------
    filename : str or path-like, optional
        Location of the pickle file. Defaults to the path the notebook has
        always used, so existing ``load_data()`` calls behave as before.

    Returns
    -------
    (list of bytes, list)
        ``x_text`` holds each record's ``'text'`` encoded as UTF-8 bytes and
        ``labels`` holds each record's ``'label'``, in matching order.
    """
    print(f"Loading data from file: {filename}")
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing -- only point this at files from a trusted source.
    with open(filename, 'rb') as handle:  # close the file even if unpickling fails
        data = pickle.load(handle)

    x_text = []
    labels = []
    # Positional indexing is kept on purpose: the container type stored in the
    # pickle is not visible here, and data[i] is the only access pattern the
    # original code relied on.
    for i in range(len(data)):
        record = data[i]
        # The raw text is kept as-is; punctuation is removed later, after
        # tokenization, by the preprocessing cell.
        x_text.append(record['text'].encode('utf-8'))
        labels.append(record['label'])
    return x_text, labels

In [67]:
# Read the pickled Formspring posts: x_text holds the UTF-8 encoded texts and
# labels holds the raw label values, in matching order.
x_text, labels = load_data()

Loading data from file: ../../data/formspring_data_fixed.pkl


In [68]:
# Encode the raw labels as integer codes; `uniques` keeps the distinct original
# label values so a code can be mapped back with uniques[code].
labels, uniques = pd.factorize(labels)

In [69]:
# Build the working frame: `comment` starts as the UTF-8 bytes returned by
# load_data() and `attack` holds the label values prepared in the cells above.
comments = pd.DataFrame({'comment': x_text, 'attack': labels})

comments['comment'] = comments['comment'].str.decode("utf-8")

# Drop empty comments: mark '' as missing, then keep only the non-missing rows.
# The result of replace() is assigned back instead of using inplace=True on a
# column selection (chained assignment, a no-op under pandas Copy-on-Write),
# and np.nan is used because the np.NaN alias was removed in NumPy 2.0.
comments['comment'] = comments['comment'].replace('', np.nan)
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger SettingWithCopyWarning.
comments = comments[comments['comment'].notna()].copy()

# Lower-case everything
comments['comment'] = comments['comment'].str.lower()

# Remove stop words. The stop-word list is loaded once into a frozenset;
# calling stopwords.words('english') inside the comprehension re-evaluated it
# for every single word and did a linear list lookup each time.
english_stopwords = frozenset(stopwords.words('english'))
comments['comment'] = comments['comment'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in english_stopwords)
)

# Tokenize
comments['comment'] = comments['comment'].apply(nltk.word_tokenize)

# Remove punctuation tokens.
# NOTE(review): `token not in string.punctuation` is a substring test against
# the punctuation string, so single punctuation characters are dropped while
# tokens such as '...' or "''" are kept -- behavior left unchanged; confirm
# whether those tokens should be removed as well.
comments['comment'] = [
    [token for token in tokens if token not in string.punctuation]
    for tokens in comments['comment']
]

In [72]:
# Persist the cleaned, tokenized (not yet class-balanced) frame without the index.
# NOTE(review): each `comment` cell is a Python list at this point, so the CSV
# presumably stores its string form -- confirm how downstream readers parse it.
comments.to_csv("../../data/formspring_data_DLpreprocessed.csv", index=False)

In [73]:
# The original paper triples the minority class; for this data that is similar
# to equalizing the class sizes, which is what this function does.

def oversample(df, random_state=None):
    """Balance the classes in ``df['attack']`` by resampling with replacement.

    Every class except the most frequent one is resampled (with replacement)
    up to the size of the most frequent class. The most frequent class is kept
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``attack`` column holding the class label of each row.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the resampling can be made
        reproducible. The default ``None`` keeps the previous unseeded
        behavior.

    Returns
    -------
    pandas.DataFrame
        The resampled classes (in ``value_counts()`` order) followed by the
        untouched rows of the most frequent class, with a fresh 0..n-1 index.
        An empty frame, or a frame with a single class, is returned unchanged
        apart from the index reset.
    """
    # value_counts() sorts by frequency (descending), so the first entry is
    # the most frequent class and its count is the target size.
    counts = df['attack'].value_counts()
    if counts.empty:
        # Nothing to balance; previously this crashed inside max([]).
        return df.reset_index(drop=True)

    majority_label = counts.index[0]
    target_size = int(counts.iloc[0])

    resampled = [
        df[df['attack'] == label].sample(
            target_size, replace=True, random_state=random_state
        )
        for label in counts.index[1:]
    ]
    majority_rows = df[df['attack'] == majority_label]

    # Concatenating in one call also covers the single-class case, where the
    # list of resampled frames is empty (pd.concat([]) used to raise here).
    balanced = pd.concat(resampled + [majority_rows], axis=0)
    return balanced.reset_index(drop=True)

# Balance the classes of the cleaned frame. No random seed is fixed here, so
# the sampled rows can differ from run to run.
comments_oversampled  = oversample(comments)

In [74]:
# Inspect the balanced frame: the resampled classes come first, the untouched
# rows of the remaining class come last.
comments_oversampled

Unnamed: 0,comment,attack
0,"[one, calling, hoe, shit, cuz, bitch, justin, ...",1
1,"[kum, den, wit, dat, big, dick, yew, got, ta, ...",1
2,"[beautiful, girl, dont, think, shoul, things, ...",1
3,"[wait, nevermind, signed, dumb, ass]",1
4,"[bitchezz, beloww, would, say, ur, name, yeww,...",1
...,...,...
23963,"[youre, party, friend, drove, drunk, wo, give,...",0
23964,"[youu2019re, awesome, give, compliment, deserv...",0
23965,"[yu, play, yurself, time, sometimes, day]",0
23966,"[yukk, beer, disgusting, drink, i, 'm, already...",0


In [75]:
# Persist the class-balanced frame, without the index, next to the unbalanced CSV.
comments_oversampled.to_csv("../../data/formspring_data_DLpreprocessed_oversampled.csv", index=False)