In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from os import path
import pickle

BASE_DATASET_PATH = "..\\datasets"
TRAIN_DATASET_PATH = path.join(BASE_DATASET_PATH,"train_processed.csv")
TEST_DATASET_PATH = path.join(BASE_DATASET_PATH,"test_processed.csv")
DATA_COLUMNS = ["comment_text"]
LABEL_COLUMNS = ["toxic",'severe_toxic','obscene','threat','insult','identity_hate']

In [2]:
train_dataset = pd.read_csv(TRAIN_DATASET_PATH)
train_dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww match background colour im seemingly stuc...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0,0,0,0,0,0
3,0001b41b1c6bb37e,cant make real suggestion improvement wondered...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page thats,0,0,0,0,0,0


In [3]:
def fix_dataset_type(dataset):
    cols = dataset.select_dtypes(include=['object'])
    for col in cols.columns.values:
        dataset[col] = dataset[col].fillna('')
    return dataset
train_dataset = fix_dataset_type(train_dataset)

In [4]:
x_train = np.array(train_dataset[DATA_COLUMNS])
y_train = np.array(train_dataset[LABEL_COLUMNS])
x_train = x_train.flatten()
x_train

array(['explanation edits made username hardcore metallica fan reverted werent vandalism closure gas voted new york doll fac please dont remove template talk page since im retired now892053827',
       'daww match background colour im seemingly stuck thanks talk 2151 january 11 2016 utc',
       'hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info',
       ...,
       'spitzer umm there actual article prostitution ring crunch captain',
       'look like actually put speedy first version deleted look',
       'really dont think understand came idea bad right away kind community go bad idea go away instead helping rewrite'],
      dtype=object)

In [5]:
from keras.layers import TextVectorization

vectorizer = TextVectorization(output_sequence_length=1800,output_mode='int')
vectorizer.adapt(x_train)


In [7]:
# Note - On windows this might not work due to encoding problem.
# In order to fix that, you need to go to language settings -> Administrative language settings -> Change system locale -> and set checkbox of "Use Unicode UTF-8..."
MAX_FEATURES = len(vectorizer.get_vocabulary())
MAX_FEATURES

245355

In [None]:
pickle.dump({
    'config':vectorizer.get_config(),
    'weights':vectorizer.get_weights()
}, open("../trained_models/text_vectorizer.pkl","wb"))

In [7]:

x_train = vectorizer(x_train)
x_train

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[   464,     55,     58, ...,      0,      0,      0],
       [207879,    966,   1210, ...,      0,      0,      0],
       [   322,    321,     16, ...,      0,      0,      0],
       ...,
       [ 29048,   6754,    278, ...,      0,      0,      0],
       [    50,      9,    125, ...,      0,      0,      0],
       [    61,     10,     13, ...,      0,      0,      0]], dtype=int64)>

In [8]:
from tensorflow import data
SHUFFLE_BUFFER_SIZE=160000
BATCH_SIZE = 32
PREFETCH=8
train_dataset = data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(PREFETCH)
len(train_dataset)

4987

In [9]:
train_dataset = train_dataset.take(int(len(train_dataset)*0.8))
validation_dataset = train_dataset.skip(int(len(train_dataset)*0.8)).take(int(len(train_dataset)*0.2))
print(len(train_dataset))
print(len(validation_dataset))

3989
797


In [10]:

# from keras.models import Sequential
# from keras.layers import Dense,LSTM,Bidirectional,Embedding

# model = Sequential()

# model.add(Embedding(MAX_FEATURES+1,32))
# model.add(Bidirectional(LSTM(32,activation='tanh')))
# model.add(Dense(128,activation='relu'))
# model.add(Dense(256,activation='relu'))
# model.add(Dense(128,activation='relu'))

# model.add(Dense(6,activation='sigmoid'))
# model.compile(loss="BinaryCrossentropy",optimizer="adam")
# model.summary()


In [11]:
# %%time
# # The training process was extremely long (1 hour per epoch) so I trained the model on google colab and used GPU acceleration
# history = model.fit(train_dataset,epochs=5,batch_size=BATCH_SIZE,validation_data=validation_dataset)

In [12]:
# from pathlib import Path

# model.save("text_moderation_model.h5",save_format="h5")
# model_structure = model.to_json()
# f = Path("text_moderation_structure.json")
# f.write_text(model_structure)

In [17]:
from keras.models import model_from_json
from pathlib import Path

f = Path("../trained_models/text_moderation_structure_gc.json")
model_structure = f.read_text()
trained_model = model_from_json(model_structure)
trained_model.load_weights("../trained_models/text_moderation_model_gc.h5")


In [18]:
test_dataset = pd.read_csv(TEST_DATASET_PATH)
test_dataset = fix_dataset_type(test_dataset)
x_test = np.array(test_dataset[DATA_COLUMNS]).flatten()
x_test = vectorizer(x_test)


In [19]:
test_dataset = data.Dataset.from_tensor_slices(x_test)
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(PREFETCH)
len(test_dataset)

4787

In [31]:
predictions = trained_model.predict(test_dataset.take(100))



In [35]:
TEST_LABELS_PATH = path.join(BASE_DATASET_PATH,"test_labels.csv")
test_labels_dataset = pd.read_csv(TEST_LABELS_PATH)
true_labels = test_labels_dataset[LABEL_COLUMNS].iloc[0:3200]
true_labels = true_labels.applymap(lambda x: 1 if x==-1 else x)
true_labels.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1,1,1,1,1,1
1,1,1,1,1,1,1
2,1,1,1,1,1,1
3,1,1,1,1,1,1
4,1,1,1,1,1,1
5,0,0,0,0,0,0
6,1,1,1,1,1,1
7,0,0,0,0,0,0
8,1,1,1,1,1,1
9,1,1,1,1,1,1


In [41]:
true_labels = np.array(true_labels).flatten()
predictions = (predictions>0.5).astype(int)
predictions = predictions.flatten()
correct = [y==y_hat for (y,y_hat) in zip(true_labels,predictions)]
# accuracy = (correct/len(predictions))*100
# accuracy

19200

In [None]:

# pred = trained_model.predict(tf.reshape(x_test[0:4],(4,1800)),batch_size=1)
# # trained_model.predict()
# pred = (pred>0.5).astype(int)
# pred
# x_test[0:4]
# test_dataset.as_numpy_iterator().next()



array([[    1,    49,   166, ...,     0,     0,     0],
       [14582,  3203,   296, ...,     0,     0,     0],
       [  816,   929,  2861, ...,     0,     0,     0],
       ...,
       [ 7180,    16,   703, ...,     0,     0,     0],
       [  931, 34024,   169, ...,     0,     0,     0],
       [91499,   981,   322, ...,     0,     0,     0]], dtype=int64)