# EDA

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw hate-speech test cases and print a schema / null-count summary.
HATE_CSV_PATH = "data/hate/hate-text.csv"
data_df = pd.read_csv(HATE_CSV_PATH)
data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3901 entries, 0 to 3900
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       3901 non-null   int64 
 1   test_case     3901 non-null   object
 2   label_gold    3901 non-null   object
 3   target_ident  3606 non-null   object
dtypes: int64(1), object(3)
memory usage: 122.0+ KB


In [28]:
# `target_ident` is missing for 295 rows. These are comments that do not
# target any specific group, so we label them explicitly as "no group".
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selection: the in-place form operates on
# a temporary object, triggers a chained-assignment warning on pandas 2.x and
# does not update `data_df` once Copy-on-Write is enabled.
data_df["target_ident"] = data_df["target_ident"].fillna("no group")

In [29]:
# Class balance of the gold labels.
gold_labels = data_df["label_gold"]
gold_labels.value_counts()

label_gold
hateful        2659
non-hateful    1242
Name: count, dtype: int64

In [30]:
# How the hateful comments are distributed over the targeted groups.
is_hateful = data_df["label_gold"] == "hateful"
data_df.loc[is_hateful, "target_ident"].value_counts()

target_ident
women              388
gay people         388
disabled people    388
Muslims            388
trans people       369
black people       369
immigrants         369
Name: count, dtype: int64

In [31]:
# How the non-hateful comments are distributed over the targeted groups.
#
# Observations:
# - The data is slightly unbalanced.
# - It seems logical that non-hateful comments are often not directed at any
#   specific group.
is_non_hateful = data_df["label_gold"] == "non-hateful"
data_df.loc[is_non_hateful, "target_ident"].value_counts()

target_ident
no group           295
gay people         189
women              147
black people       135
disabled people    122
Muslims            122
trans people       116
immigrants         116
Name: count, dtype: int64

In [32]:
# Features: the raw comment text. Targets: the gold label and the targeted group.
X = data_df["test_case"]
y = data_df[["label_gold", "target_ident"]]

In [None]:
# Two-stage split: first hold out 15% of all rows as the test set, then hold
# out 15% of the remainder as the validation set. Both stages share the same
# options so the result is reproducible.
# NOTE(review): the splits are not stratified; consider `stratify=` given the
# class imbalance seen above.
split_options = {"test_size": 0.15, "random_state": 42}

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, **split_options
)

In [60]:
from sklearn.preprocessing import MultiLabelBinarizer

# One [label_gold, target_ident] pair per row of `y`.
labels = y.to_numpy().tolist()

# Learn the set of distinct values across both columns, then encode every row
# as a binary indicator vector over that set.
label_encoder = MultiLabelBinarizer().fit(labels)
label_encoded = label_encoder.transform(labels)

print("Binary Encoded labels:")
print(label_encoded)
type(label_encoded)

Binary Encoded labels:
[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


numpy.ndarray

(3901, 2)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
from keras.layers import TextVectorization
from keras.preprocessing.sequence import pad_sequences


VOCAB_SIZE = 1000

# Map each text to a sequence of integer token ids, using a vocabulary of at
# most VOCAB_SIZE entries learned from the training split only.
# `pad_to_max_tokens` is intentionally not passed: it only applies to the
# "multi_hot", "count" and "tf_idf" output modes, not to "int".
encoder = TextVectorization(max_tokens=VOCAB_SIZE, output_mode="int")
encoder.adapt(X_train)

# Each call returns a dense array that is zero-padded to the longest text of
# *that* split, so the three arrays may have different widths.
X_train_vec = encoder(X_train).numpy()
X_val_vec = encoder(X_val).numpy()
X_test_vec = encoder(X_test).numpy()

# Bring every split to the training width.
MAX_LENGTH = X_train_vec.shape[1]

# truncating="post" matters: the default ("pre") would cut real tokens from
# the start of any validation/test row wider than MAX_LENGTH while keeping
# the trailing zero padding.
X_train_padded = pad_sequences(
    X_train_vec, maxlen=MAX_LENGTH, padding="post", truncating="post"
)
X_val_padded = pad_sequences(
    X_val_vec, maxlen=MAX_LENGTH, padding="post", truncating="post"
)
X_test_padded = pad_sequences(
    X_test_vec, maxlen=MAX_LENGTH, padding="post", truncating="post"
)