In [1]:
import numpy as np
import random

from data.utils import split_dataset_binary
from data.preprocess import concat_unshared_task_datasets, save_preprocessed_data, preprocess_tweet

In [2]:
# Load the combined dataset. Later cells index the result by the label keys
# "racism", "sexism" and "none"; the cell output reports the per-label counts.
data = concat_unshared_task_datasets()

Unshared task dataset concat done.
Label Count: Sexism-3941, Racism-2062, None-12970


In [3]:
# Seed the global `random` generator so the permutations below are identical
# on every Restart & Run All. Without a seed each run shuffles differently,
# which makes anything derived from these orders impossible to reproduce.
# NOTE: this also fixes the sequence seen by any later code that draws from
# the global `random` generator.
SEED = 42
random.seed(SEED)

# One index list per label, covering every tweet stored under that label.
idx_racism = list(range(len(data["racism"])))
idx_sexism = list(range(len(data["sexism"])))
idx_none = list(range(len(data["none"])))

# Shuffle each index list in place (order of calls kept: racism, sexism, none).
random.shuffle(idx_racism)
random.shuffle(idx_sexism)
random.shuffle(idx_none)

## Preprocess

In [12]:
def preprocess(tweets):
    """Preprocess every tweet twice: hashtags kept and hashtags removed.

    Each tweet is passed to ``preprocess_tweet`` once with
    ``removeHashTag=False`` and once with ``removeHashTag=True``. A tweet is
    kept only when *both* results are truthy, so the two returned lists always
    line up element by element.

    Note that rejected tweets are dropped silently: the returned lists can be
    shorter than ``tweets``.

    Args:
        tweets: iterable of raw tweet texts.

    Returns:
        ``(hashtag, no_hashtag)`` - two lists of equal length holding the
        ``removeHashTag=False`` and ``removeHashTag=True`` results.
    """
    kept_pairs = []
    for raw in tweets:
        with_tags = preprocess_tweet(raw, removeHashTag=False)
        without_tags = preprocess_tweet(raw, removeHashTag=True)
        # Skip the tweet unless both variants survived preprocessing.
        if not (with_tags and without_tags):
            continue
        kept_pairs.append((with_tags, without_tags))

    hashtag = [pair[0] for pair in kept_pairs]
    no_hashtag = [pair[1] for pair in kept_pairs]
    assert len(hashtag) == len(no_hashtag)
    return hashtag, no_hashtag

In [13]:
def _preprocess_with_labels(tweets, labels):
    """Preprocess tweets both ways, dropping each rejected tweet WITH its label.

    A tweet is kept only when ``preprocess_tweet`` returns a truthy result for
    both ``removeHashTag=False`` and ``removeHashTag=True``. The label at the
    same position is kept or dropped together with the tweet, so texts and
    labels stay aligned.

    Returns:
        ``(hashtag, no_hashtag, kept_labels)`` - two lists of processed texts
        and a numpy array of the labels of the tweets that were kept.

    Raises:
        ValueError: if ``tweets`` and ``labels`` differ in length.
    """
    if len(tweets) != len(labels):
        raise ValueError(
            "tweets and labels must have the same length, got %d and %d"
            % (len(tweets), len(labels))
        )

    hashtag, no_hashtag, kept = [], [], []
    for position, tweet in enumerate(tweets):
        with_tags = preprocess_tweet(tweet, removeHashTag=False)
        without_tags = preprocess_tweet(tweet, removeHashTag=True)
        if with_tags and without_tags:
            hashtag.append(with_tags)
            no_hashtag.append(without_tags)
            kept.append(position)

    # Explicit integer dtype so an empty `kept` list is still a valid index.
    kept_labels = np.asarray(labels)[np.asarray(kept, dtype=int)]
    return hashtag, no_hashtag, kept_labels


def save_preprocessed_train_test_set(name, x_neg, x_pos):
    """Split a binary dataset, preprocess it two ways and save both variants.

    The data is split 0.8 / 0.1 / 0.1 by ``split_dataset_binary``; the
    validation and test parts are then merged into a single test set. Each
    tweet is preprocessed with hashtags kept and with hashtags removed, and
    the two variants are saved (without a validation set) under
    ``name + "_yes"`` (hashtags kept) and ``name + "_no"`` (hashtags removed).

    Tweets rejected by preprocessing are removed together with their labels.
    Previously only the texts were filtered while the full label arrays were
    saved, so labels went out of step with texts after the first drop.

    Args:
        name: base name of the saved dataset.
        x_neg: negative-class tweets.
        x_pos: positive-class tweets.
    """
    x_train, y_train, x_valid, y_valid, x_test, y_test = split_dataset_binary(
        x_neg=x_neg, x_pos=x_pos, split=[0.8, 0.1, 0.1]
    )
    # Fold the validation split into the test split (saved with hasValid=False).
    x_test = np.concatenate((x_valid, x_test))
    y_test = np.concatenate((y_valid, y_test))

    x_train_yes, x_train_no, y_train = _preprocess_with_labels(x_train, y_train)
    x_test_yes, x_test_no, y_test = _preprocess_with_labels(x_test, y_test)

    save_preprocessed_data(name + "_yes", hasValid=False, data_={"train": (x_train_yes, y_train), "test": (x_test_yes, y_test)})
    save_preprocessed_data(name + "_no", hasValid=False, data_={"train": (x_train_no, y_train), "test": (x_test_no, y_test)})

## Sexism binary test

In [14]:
# Sexism vs. none: "none" tweets are the negative class, "sexism" the positive.
save_preprocessed_train_test_set(
    name="sexism_hashtag",
    x_neg=data["none"],
    x_pos=data["sexism"],
)

split index
[2594, 5188, 7782, 10376]
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
concatenating chunks
merged and splitted to shape x_train:(10375,), y_train:(10375,), x_test:(2595,), y_test:(2595,)
Training set size:13527 (neg:10375/pos:3152)

sample neg - Wow. I might be out of blue hair dye. But I have a tube of Pravana that is purple... // @TheQuinnspiracy
sample pos - RT @Vickisecret__: These NSW promo girls think way too highly of themselves.. They're not even attractive 😳 #MKR #MKR2015
Valid set size:1691 (neg:1297/pos:394)

sample neg - @LYH786 Looks much to intelligent to be Baghdadi ;)
sample pos - RT @RykerDomz When a woman gets in a wreck I'm not surprised. #notsexist
Test set size:1693 (neg:1298/pos:395)

sample neg - Can a camper please push #katandandre into that lake?

## Racism binary test

In [15]:
# Racism vs. none: "none" tweets are the negative class, "racism" the positive.
save_preprocessed_train_test_set(
    name="racism_hashtag",
    x_neg=data["none"],
    x_pos=data["racism"],
)

split index
[2594, 5188, 7782, 10376]
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
divided into chunk shape _x: (2594,), _y: (2594,)
concatenating chunks
merged and splitted to shape x_train:(10375,), y_train:(10375,), x_test:(2595,), y_test:(2595,)
Training set size:12024 (neg:10375/pos:1649)

sample neg - I have faith in the project, but more awareness needs to be spread about the difficulties and harassment that many female FOSS devs face.
sample pos - @QrysBinThynkn That's because it's all about the barbarity of Islam, not the beards.
Valid set size:1503 (neg:1297/pos:206)

sample neg - RT @octal: Wow. This really isn't how to do customer support. https://t.co/fHLfHyunuu
sample pos - @Vandaliser @sajid_fairooz @IsraeliRegime Science was moving forward in India and Persia before Islam. Islam only slowed it down.
Test set size:1505

## Test loading

In [16]:
# List the files currently present in data/preprocessed.
%ls data/preprocessed

test_abusive_final_binary.txt   train_racism_binary.txt
test_racism_binary.txt          train_racism_final2_binary.txt
test_racism_final2_binary.txt   train_racism_final_binary.txt
test_racism_final_binary.txt    train_racism_hashtag_no.txt
test_racism_hashtag_no.txt      train_racism_hashtag_yes.txt
test_racism_hashtag_yes.txt     train_sexism_binary.txt
test_sexism_binary.txt          train_sexism_final2_binary.txt
test_sexism_final2_binary.txt   train_sexism_final_binary.txt
test_sexism_final_binary.txt    train_sexism_hashtag_no.txt
test_sexism_hashtag_no.txt      train_sexism_hashtag_yes.txt
test_sexism_hashtag_yes.txt     train_sexism_no_ht_binary.txt
test_sexism_no_ht_binary.txt    valid_racism_binary.txt
train_abusive_final_binary.txt  valid_sexism_binary.txt


In [17]:
from data.preprocess import load_from_file

In [28]:
# Load the two saved sexism variants back for a side-by-side check:
# `data` <- "sexism_hashtag_no", `data2` <- "sexism_hashtag_yes".
# NOTE(review): this rebinds `data`, which earlier held the result of
# concat_unshared_task_datasets(). Re-running the save cells after this one
# would index the loaded object instead of the raw dataset; consider distinct
# names for the two loaded variants.
data = load_from_file("sexism_hashtag_no")
data2 = load_from_file("sexism_hashtag_yes")

In [29]:
# Last ten training texts of the variant loaded from "sexism_hashtag_no".
data["x_train"][-10:]

array([ "i didn't know you could throw interceptions and make tackles in the kitchen.",
       'define feminazi',
       "i'm not sexist but don't go around saying wemon are top shit and yous run the world. when it's men that literally run …",
       'ha ha suck on that kat and andre you dumb pieces of shit.',
       'is bad because i should be able to hate entire demographics of people for the bad things individuals do to me!',
       "if you are a women,and this is no sexist comment directly, and we are watching football just don't talk unless my team scores",
       "i'm really not sexist, but most women can't drive for shit!!",
       'so...is the answer "nothing"?',
       "i support because i'm a gamer and i don't want a bunch of corrupt media feminazi's ruining video games for everyone.",
       "these nsw promo girls think way too highly of themselves.. they're not even attractive 😳"], 
      dtype='<U153')

In [30]:
# Last ten training texts of the variant loaded from "sexism_hashtag_yes",
# for comparison with the "sexism_hashtag_no" output above.
data2["x_train"][-10:]

array([ "i didn't know you could throw interceptions and make tackles in the kitchen.",
       'define feminazi',
       "i'm not sexist but don't go around saying wemon are top shit and yous run the world. when it's men that literally run …",
       'ha ha suck on that kat and andre you dumb pieces of shit. mkr dumbslut',
       'blameonenotall is bad because i should be able to hate entire demographics of people for the bad things individuals do to me! racismyall',
       "if you are a women,and this is no sexist comment directly, and we are watching football just don't talk unless my team scores",
       "i'm really not sexist, but most women can't drive for shit!!",
       'so...is the answer "nothing"?',
       "i support gamergate because i'm a gamer and i don't want a bunch of corrupt media feminazi's ruining video games for everyone.",
       "these nsw promo girls think way too highly of themselves.. they're not even attractive 😳 mkr mkr2015"], 
      dtype='<U157')