In [1]:
import numpy as np
import pandas as pd
import pickle
from urllib.parse import urlparse
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from imblearn.over_sampling import RandomOverSampler

In [2]:
import string

def load_data(filename="../../data/twitter_data_fixed.pkl"):
    """Load the pickled tweet records and return their texts and labels.

    Parameters
    ----------
    filename : str or path-like, optional
        Location of the pickle file. Defaults to the project's
        ``twitter_data_fixed.pkl``.

    Returns
    -------
    tuple of (list, list)
        ``x_text`` holds each record's ``'text'`` encoded to UTF-8 bytes and
        ``labels`` holds each record's ``'label'``, both in record order.
    """
    print(f"Loading data from file: {filename}")
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only point this at files from a trusted source.
    with open(filename, 'rb') as handle:  # context manager closes the file
        data = pickle.load(handle)

    x_text = []
    labels = []
    # Records are accessed by position (data[0] .. data[len-1]) rather than by
    # iterating `data` directly, so any container indexable by 0..n-1 works.
    for i in range(len(data)):
        record = data[i]
        x_text.append(record['text'].encode('utf-8'))
        labels.append(record['label'])
    return x_text, labels

In [3]:
def is_url(url):
    """Return True when `url` parses with both a scheme and a network location.

    Strings that `urlparse` rejects with a ValueError are reported as
    non-URLs (False) instead of propagating the error.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    # Both components must be non-empty for the token to count as a URL.
    return bool(parsed.scheme) and bool(parsed.netloc)

In [4]:
# Load the raw texts (as UTF-8 encoded bytes) and their original labels.
x_text, labels_og = load_data()

Loading data from file: ../../data/twitter_data_fixed.pkl


In [5]:
# Encode the original labels as integer codes; `uniques` keeps the original
# label values, positioned so that uniques[code] recovers the label.
labels, uniques = pd.factorize(labels_og)

In [6]:
comments = pd.DataFrame({'comment': x_text, 'attack': labels})

# decode to UTF-8
comments['comment'] = comments['comment'].str.decode("utf-8")

# remove missing rows
# (drop on the frame itself: calling dropna(inplace=True) on a selected column
# does not remove the corresponding rows from the DataFrame)
comments = comments.dropna(subset=['comment']).reset_index(drop=True)

# remove usernames (raw string avoids the invalid "\@" escape; the pattern
# matches exactly what the previous '(\@\w+.*?)' matched, since its lazy
# tail always matched the empty string)
comments['comment'] = comments['comment'].str.replace(r'@\w+', "", regex=True)

# lower case everything
comments['comment'] = comments['comment'].str.lower()

# remove URLs (whitespace-delimited tokens that parse as scheme + netloc)
comments['comment'] = [
    ' '.join(token for token in text.split() if not is_url(token))
    for text in comments['comment']
]

# remove stop words
# Build the stop-word set once; re-reading the list for every word is very
# slow and a set gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))
comments['comment'] = comments['comment'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in english_stopwords)
)

# tokenize
tt = TweetTokenizer()
comments['comment'] = [tt.tokenize(entry) for entry in comments['comment']]

# remove punctuation
# NOTE: `token not in string.punctuation` is a substring test against the
# punctuation string, so multi-character tokens such as "..." are kept.
comments['comment'] = [
    [token for token in sentence if token not in string.punctuation]
    for sentence in comments['comment']
]

# traditionally, would also lemmatize but this was not done in the main data


  comments['comment'] = comments['comment'].str.replace('(\@\w+.*?)',"")


In [14]:
# Inspect the preprocessed frame: token lists in 'comment', integer label codes in 'attack'.
print(comments)

                                                 comment  attack
0      [rt, another, bloody, instant, restaurant, wee...       0
1      [video, peshmerga, decimating, isis, far, inte...       0
2      [oh, really, instant, restaurants, that's, sho...       0
3      [rt, good, weeks, #isis, new, front, opened, #...       0
4      [rt, don, ’, t, need, femisnsn, men, carry, he...       0
...                                                  ...     ...
16085  [rt, i, want, equal, rights, still, want, seat...       2
16086  [rt, go, ahead, call, sexist, scandalous, wome...       2
16087  [epic, always, kept, plugged, in, plugged, use...       0
16088  [think, daesh, planning, second, battle, trenc...       0
16089  [rt, skin, green, colors, suit, wear, ripped, ...       0

[16090 rows x 2 columns]


In [33]:
# initial preprocessed file
# NOTE: the 'comment' column holds Python lists, which CSV stores as their
# string form (e.g. "['rt', 'another']"); a reader must parse that back into
# a list (for instance with ast.literal_eval).
comments.to_csv("../../data/twitter_data_DLpreprocessed.csv", index=False)

In [70]:
# triple oversample in original paper is similar to equalizing numbers

def oversample(df, label_col='attack', random_state=None):
    """Balance classes by resampling every non-majority class with replacement.

    Each class other than the most frequent one is replaced by a bootstrap
    sample (drawn with replacement) whose size equals the majority class
    count; the majority class rows are kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column.
    label_col : str, optional
        Name of the class-label column. Defaults to ``'attack'``.
    random_state : int, numpy random generator, or None, optional
        Forwarded to ``DataFrame.sample`` so the resampling can be made
        reproducible. ``None`` (the default) keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        Resampled minority classes (in descending order of original
        frequency) followed by the untouched majority class, with a fresh
        0..n-1 index. An empty or single-class frame is returned as a
        re-indexed copy rather than raising.
    """
    # value_counts sorts by frequency, descending, so the first entry is the
    # majority class and its count is the target size for every other class.
    counts = df[label_col].value_counts()
    if counts.empty:
        return df.reset_index(drop=True)

    majority_label = counts.index[0]
    target_size = int(counts.iloc[0])

    parts = [
        df[df[label_col] == label].sample(
            n=target_size, replace=True, random_state=random_state
        )
        for label in counts.index[1:]
    ]
    # Majority class goes last, unchanged, matching the original row layout.
    parts.append(df[df[label_col] == majority_label])

    return pd.concat(parts, axis=0).reset_index(drop=True)

# Build the class-balanced frame. NOTE: no seed is passed here, so the sampled
# rows (and therefore the saved file) differ from run to run.
comments_oversampled  = oversample(comments)

In [71]:
# Inspect the oversampled frame (resampled minority rows first, majority class last).
print(comments_oversampled)

                                                 comment  attack
0      [rt, happens, men, time, men, reproductive, ri...       2
1      [rt, i'm, sexist, damn, let, females, act, lik...       2
2      [rt, hate, u, open, door, hot, chick, ugly, fr...       2
3      [tl, wanted, say, ty, great, work, challenging...       2
4                                       [also, existing]       2
...                                                  ...     ...
33103  [great, see, colin, got, bed, get, together, #...       0
33104                            [live, know, ca, drive]       0
33105  [epic, always, kept, plugged, in, plugged, use...       0
33106  [think, daesh, planning, second, battle, trenc...       0
33107  [rt, skin, green, colors, suit, wear, ripped, ...       0

[33108 rows x 2 columns]


In [73]:
# oversampled preprocessed file
# NOTE: as with any CSV export of this frame, the list-valued 'comment' cells
# are written as their string form and must be parsed back when reloaded.
comments_oversampled.to_csv("../../data/twitter_data_DLpreprocessed_oversampled.csv", index=False)