In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

2023-10-15 20:55:05.073294: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
from sklearn.metrics import roc_curve, auc

In [3]:
def roc_auc(predictions, target):
    """Return the area under the ROC curve.

    `target` is handed to `roc_curve` as the true labels and `predictions`
    as the scores; the resulting curve is integrated with `auc`.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(target, predictions)
    return auc(false_pos_rate, true_pos_rate)

In [4]:
# Read the training CSV, drop the fine-grained label columns, and keep the
# leading rows. `.loc` slicing is label-inclusive, so the row labelled 12000 is kept.
train = (
    pd.read_csv('./jigsaw-toxic-comment-train.csv')
    .drop(['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1)
    .loc[:12000, :]
)

valid = pd.read_csv('./validation.csv')
test = pd.read_csv('./test.csv')

In [5]:
# Word count (whitespace-separated tokens) of the longest comment
train['comment_text'].map(lambda comment: len(str(comment).split())).max()

1403

In [6]:
# Stratified, shuffled 80/20 split of the comments and their `toxic` labels
x_train, x_valid, y_train, y_valid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.2,
    shuffle=True,
    stratify=train['toxic'].values,
    random_state=42,
)

In [7]:
from keras.preprocessing import text
from keras.utils import pad_sequences

max_len = 1500  # fixed sequence length passed to pad_sequences

# Fit the tokenizer (no vocabulary cap: num_words=None) on both splits
token = text.Tokenizer(num_words=None)
token.fit_on_texts([*x_train, *x_valid])
word_index = token.word_index

# Text -> integer sequences
x_train_seq = token.texts_to_sequences(x_train)
x_valid_seq = token.texts_to_sequences(x_valid)

# Integer sequences -> fixed-length arrays
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len)
x_valid_pad = pad_sequences(x_valid_seq, maxlen=max_len)

In [8]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

In [9]:
# Load the pre-trained GloVe vectors into a {word: coefficient array} lookup.
embeds = {}
# Explicit UTF-8: without it open() falls back to the platform's locale encoding,
# which can fail or mis-decode non-ASCII tokens in the embedding file.
with open('./glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split on a single ASCII space: first field is the word,
        # the remaining fields are its coefficients.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        # A word that appears on more than one line keeps its last vector.
        embeds[word] = coefs

print(f"There are {len(embeds)} word embeddings")

2196017it [01:25, 25567.41it/s]

There are 2196016 word embeddings





In [10]:
## Build the embedding matrix: row i holds the pre-trained vector of the word
## with tokenizer index i; words without a pre-trained vector stay all-zero.

mat = np.zeros((len(word_index) + 1, 300))
for word, row in tqdm(word_index.items()):
    if word in embeds:
        mat[row] = embeds[word]

100%|██████████████████████████████| 43496/43496 [00:00<00:00, 445527.39it/s]


In [11]:
# LSTM classifier on top of the frozen (trainable=False) pre-trained embedding matrix
model = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[mat],
        input_length=max_len,
        trainable=False,
    ),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

2023-10-15 20:56:35.109644: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355




2023-10-15 20:56:35.142183: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-15 20:56:35.142394: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-15 20:56:35.144339: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 lstm (LSTM)                 (None, 128)               219648    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 13,268,877
Trainable params: 219,777
Non-trainable params: 13,049,100
_________________________________________________________________


In [12]:
# Train the LSTM classifier for three epochs on the padded training sequences
model.fit(x=x_train_pad, y=y_train, batch_size=16, epochs=3)

Epoch 1/3


2023-10-15 20:56:37.282884: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-10-15 20:56:37.894945: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f974d00d3d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-10-15 20:56:37.894971: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-10-15 20:56:37.900868: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-15 20:56:37.912715: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
2023-10-15 20:56:38.011664: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifeti

Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f99b5c3f490>

In [13]:
# Score the held-out split with the LSTM model and report its ROC AUC.
scores = model.predict(x_valid_pad)
# roc_auc returns a fraction (e.g. 0.97), not a percentage, so no '%' sign is printed.
print("AUC: %.2f" % (roc_auc(scores, y_valid)))

AUC: 0.97%


In [16]:
## Training the LSTM took almost 40 minutes -- why did I decide to do this...
## Let's also test out GRUs!

In [21]:
from keras.layers import SpatialDropout1D, GRU

In [24]:
# GRU classifier: same frozen embedding layer, spatial dropout, then a GRU
model_gru = Sequential([
    Embedding(
        len(word_index) + 1,
        300,
        weights=[mat],
        input_length=max_len,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    GRU(300, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid'),
])

model_gru.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_gru.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1500, 300)         13049100  
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 1500, 300)        0         
 lDropout1D)                                                     
                                                                 
 gru_2 (GRU)                 (None, 300)               541800    
                                                                 
 dense_1 (Dense)             (None, 1)                 301       
                                                                 
Total params: 13,591,201
Trainable params: 542,101
Non-trainable params: 13,049,100
_________________________________________________________________


In [25]:
# Train the GRU classifier for three epochs on the padded training sequences
model_gru.fit(x=x_train_pad, y=y_train, batch_size=32, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f98de59f290>

In [26]:
## So why did batch_size=32 work with the GRU but not with the LSTM?
## Does a GRU take up less VRAM than an LSTM?

In [27]:
# Score the held-out split with the GRU model and report its ROC AUC.
scores_gru = model_gru.predict(x_valid_pad)
# roc_auc returns a fraction (e.g. 0.98), not a percentage, so no '%' sign is printed.
print("AUC: %.2f" % (roc_auc(scores_gru, y_valid)))

AUC: 0.98%


In [None]:
## WOW! Can I push this even further?