# Loading and splitting the data

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Read the training CSV from the project folder under /content/drive/MyDrive.
df = pd.read_csv(r"/content/drive/MyDrive/University/NLP/project/kaggle data/train.csv")

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train, test = train_test_split(df, test_size=0.2, random_state=1)

# The comment text column is the only model input taken from each split.
train_bert, test_bert = train['comment_text'], test['comment_text']


In [None]:
# Install third-party packages quietly (-q). Versions are not pinned, so the
# resolved environment can differ between runs.
# NOTE(review): texthero, scikit-multilearn and transformers are not imported in
# the cells visible here -- confirm they are still needed.
!pip install -q texthero
!pip install -q scikit-multilearn
! pip install -q transformers
# tensorflow-text is imported below as `tensorflow_text`.
!pip install -q tensorflow-text
# Presumably provides the `official.nlp` package imported below -- TODO confirm.
!pip install -q tf-models-official

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m909.3 kB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for gensim (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.5.0 requires spacy<3.6.0,>=3.5.0, but you have spacy 2.3.9 which is incompati

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input,Dense, Bidirectional, Embedding, LSTM, BatchNormalization, Dropout
import tensorflow_text as text
from official.nlp import optimization 
import tensorflow_hub as hub
import pandas as pd


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Names of the target columns, one per toxicity category.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Slice the target columns out of each split.
y_train, y_test = train[labels], test[labels]

In [None]:
batch_size = 32
seed = 42

# Training pipeline: shuffle with the fixed seed (previously `seed` was defined
# but never used, so the shuffle order was not reproducible), then batch.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_bert.values, y_train.values))
    .shuffle(50000, seed=seed)
    .batch(batch_size)
)

# Evaluation pipeline: shuffling adds nothing here, so the held-out rows are
# simply batched in their original order.
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_bert.values, y_test.values))
    .batch(batch_size)
)

## Loading the encoders

In [None]:
# TF-Hub handle of the BERT encoder (per the handle name: small uncased English
# BERT with L=2, H=512, A=8).
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1'
# TF-Hub handle of the matching uncased-English preprocessing model, which turns
# raw strings into the encoder's input dictionary.
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

## Building the model

In [None]:
# Keras model factory
def build_classifier_model():
  """Assemble the text classifier: raw strings -> preprocessing -> BERT -> dense head.

  Reads the module-level ``tfhub_handle_preprocess`` and ``tfhub_handle_encoder``
  handles to load the two TF-Hub layers. The encoder is loaded with
  ``trainable=True``, so it is fine-tuned together with the head.

  Returns:
    A ``tf.keras.Model`` whose input is a batch of strings (input name ``text``)
    and whose output is the 6-unit sigmoid layer named ``classifier``.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # Tokenise/pack the raw strings into the dictionary the encoder expects.
  to_bert_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  bert_inputs = to_bert_inputs(raw_text)

  # Run the encoder and keep only its pooled output.
  bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  pooled = bert(bert_inputs)['pooled_output']

  # Classification head: dropout -> 500-unit ReLU layer -> 6 sigmoid outputs.
  hidden = tf.keras.layers.Dropout(0.1)(pooled)
  hidden = tf.keras.layers.Dense(500, activation='relu')(hidden)
  scores = tf.keras.layers.Dense(6, activation="sigmoid", name='classifier')(hidden)

  return tf.keras.Model(raw_text, scores)

In [None]:
# Instantiate the classifier and print its layer-by-layer summary.
classifier_model = build_classifier_model()
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [None]:
# The model's final `classifier` layer applies a sigmoid, so it outputs
# probabilities rather than logits. The loss must therefore use
# from_logits=False; from_logits=True would apply a second sigmoid inside the
# loss and distort both training and the reported log-loss.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# Per-element binary accuracy plus AUC computed separately for each label.
metrics = [tf.metrics.BinaryAccuracy(), tf.metrics.AUC(multi_label=True)]

In [None]:
epochs = 10

# Schedule length: batches per epoch times epochs, with the first 10% of the
# steps used for warm-up.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = epochs * steps_per_epoch
num_warmup_steps = int(num_train_steps * 0.1)

# AdamW optimizer built by the imported `optimization` helper.
init_lr = 3e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [None]:
# Attach the optimizer, loss and metrics defined in the previous cells.
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Checkpoint the best model (lowest val_loss) to the Drive project folder, which
# is the same path the later `load_model` cell reads from. Previously the
# checkpoint went to a relative 'best_bert_model' directory, so a fresh run
# could not find it at the Drive path. Training stops early after 3 epochs
# without val_loss improvement.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        r'/content/drive/MyDrive/University/NLP/project/best_bert_model',
        monitor='val_loss',
        save_best_only=True,
    ),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
]

In [None]:
# train the model
bert_history = classifier_model.fit(train_ds, 
                               validation_data=test_ds, 
                               epochs = epochs, 
                               callbacks = callbacks)

Epoch 1/10



Epoch 2/10



Epoch 3/10
Epoch 4/10
Epoch 5/10


In [None]:
# Reload the saved model from the Drive project folder. compile=False skips
# restoring the saved training configuration; the model is compiled again in
# the next cell.
bert_model = tf.keras.models.load_model(r'/content/drive/MyDrive/University/NLP/project/best_bert_model', compile = False)

In [None]:
# Re-compile the reloaded model for evaluation. The classifier layer already
# applies a sigmoid, so the loss is given probabilities and must use
# from_logits=False (from_logits=True triggers the `_get_logits` warning and
# yields a distorted log-loss).
bert_model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=metrics,
)

In [None]:
# Evaluate on the held-out split. The three values are unpacked as the loss
# followed by the compiled metrics in order (binary accuracy, then AUC).
# NOTE(review): this rebinds `loss` from the loss object defined earlier to a
# float, so re-running the earlier compile cell afterwards would pass a float
# as the loss.
loss, accuracy,auc = bert_model.evaluate(test_ds)

  output, from_logits = _get_logits(




In [None]:
import numpy as np

# Report each evaluation result rounded to four decimal places.
for metric_name, metric_value in (("Accuracy", accuracy), ("AUC", auc), ("Logloss", loss)):
    print(f"{metric_name}: {np.round(metric_value,4)}")

Accuracy: 0.983
AUC: 0.9742
Logloss: 0.0423


# Submission

In [None]:
# Load the test CSV whose comments will be scored for the submission, then
# preview the first rows.
sub_test = pd.read_csv(r'/content/drive/MyDrive/University/NLP/project/data/test.csv')
sub_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
# Load the sample submission to inspect the expected column layout (id followed
# by one column per label), then preview the first rows.
sample_sub = pd.read_csv(r'/content/drive/MyDrive/University/NLP/project/data/sample_submission.csv')
sample_sub.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [None]:
# Score every comment in the submission set with the reloaded model.
# Presumably yields one row per comment with six values (one per label), since
# the result is later used as the six label columns -- TODO confirm shape.
pred_test = bert_model.predict(sub_test['comment_text'])



In [None]:
!mkdir .kaggle

In [None]:
# Copy kaggle.json from the working directory into ~/.kaggle/.
# Presumably this is the Kaggle API credentials file -- keep it out of shared
# or committed copies of the notebook.
!cp kaggle.json ~/.kaggle/

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive with the Kaggle CLI into the working directory.
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
 97% 51.0M/52.6M [00:03<00:00, 23.2MB/s]
100% 52.6M/52.6M [00:03<00:00, 15.2MB/s]


In [None]:
# Label columns, in the order they appear in the submission file.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


def create_submission_file(test_id, predictions, labels, filename):
  """Write a submission CSV with the ids followed by one column per label.

  Args:
    test_id: ids of the scored rows, in the same order as ``predictions``.
      Usually a pandas Series; other sequences are wrapped in a Series named
      ``id``.
    predictions: 2-D array-like with one row per id and one column per label.
    labels: column names for ``predictions``.
    filename: path of the CSV file to write (written without the index).

  Raises:
    ValueError: if the number of ids differs from the number of prediction rows.
  """
  pred_df = pd.DataFrame(predictions, columns=labels)

  if not isinstance(test_id, (pd.Series, pd.DataFrame)):
    test_id = pd.Series(test_id, name='id')
  # pd.concat(axis=1) aligns rows by index label, not by position. Dropping the
  # incoming index pairs each id with its prediction row even when the ids come
  # from a filtered or shuffled frame; otherwise rows would be misaligned and
  # padded with NaN.
  ids = test_id.reset_index(drop=True)

  if len(ids) != len(pred_df):
    raise ValueError(
        f"Got {len(ids)} ids but {len(pred_df)} prediction rows")

  final_sub = pd.concat([ids, pred_df], axis=1)
  final_sub.to_csv(filename, index=False)
  print("Submission file created")

In [None]:
# Write the predictions for the submission set to a CSV in the working directory.
create_submission_file(
    test_id=sub_test['id'],
    predictions=pred_test,
    labels=labels,
    filename="bert_results.csv",
)

Submission file created


In [None]:
!kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f /content/drive/MyDrive/University/NLP/project/bert_results.csv -m " BERT Submission"

100% 13.8M/13.8M [00:05<00:00, 2.60MB/s]
Successfully submitted to Toxic Comment Classification Challenge

## Result: this solution scored 0.98296 on the public leaderboard