In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/test.csv
/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip


In [2]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer
from sklearn.model_selection import train_test_split




In [3]:
#Tokenize the data and separate them in chunks of 256 units

maxlen=512
chunk_size=256
def fast_encode(texts, tokenizer, chunk_size=chunk_size, maxlen=maxlen):
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    #sliding window methodology
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [4]:
# Create the model

def build_model(transformer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    #Replaced from the Embedding+LSTM/CoNN layers
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [5]:
#Detect and deploy

try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [6]:
#allow experimental tf
AUTO = tf.data.experimental.AUTOTUNE

# Data access
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration of hyperparameters
EPOCHS = 4
#batch size denotes the partitioning amongst the cluster replicas.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192

In [7]:
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [8]:
train_df=pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
test_df=pd.read_csv('../input/quora-insincere-questions-classification/test.csv')
train_set,test_set=train_test_split(train_df,test_size=0.2,random_state=2017)
print(train_set.shape)
print(test_set.shape)

(1044897, 3)
(261225, 3)


In [9]:
train_set['question_text'].shape

(1044897,)

In [10]:
train_df['question_text']

0          How did Quebec nationalists see their province...
1          Do you have an adopted dog, how would you enco...
2          Why does velocity affect time? Does velocity a...
3          How did Otto von Guericke used the Magdeburg h...
4          Can I convert montra helicon D to a mountain b...
                                 ...                        
1306117    What other technical skills do you need as a c...
1306118    Does MS in ECE have good job prospects in USA ...
1306119                            Is foam insulation toxic?
1306120    How can one start a research project based on ...
1306121    Who wins in a battle between a Wolverine and a...
Name: question_text, Length: 1306122, dtype: object

In [11]:
train_x = fast_encode(train_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
val_x = fast_encode(test_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
train_y=train_set['target'].values
val_y=test_set['target'].values

HBox(children=(FloatProgress(value=0.0, max=4082.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1021.0), HTML(value='')))




In [12]:
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)

(1044897, 192)
(1044897,)
(261225, 192)
(261225,)


In [13]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_x, val_y))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)



In [14]:
print(train_dataset)
print(valid_dataset)

<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>
<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>


In [15]:

with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 192, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_______________________________

In [None]:
n_steps = train_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:
n_steps = val_x.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS*2
)

In [None]:

tokenizer = transformers.XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

tokenizer.save_pretrained('.')
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

In [None]:

train_df=pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
test_df=pd.read_csv('../input/quora-insincere-questions-classification/test.csv')
train_set,test_set=train_test_split(train_df,test_size=0.2,random_state=2017)
print(train_set.shape)
print(test_set.shape)

In [None]:

train_x = fast_encode(train_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
val_x = fast_encode(test_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
train_y=train_set['target'].values
val_y=test_set['target'].values

In [None]:

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_x, val_y))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

print(train_dataset)
print(valid_dataset)


In [None]:

with strategy.scope():
    transformer_layer = (
        transformers.TFRobertaModel
        .from_pretrained('roberta-base')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:


n_steps = train_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:


n_steps = val_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [16]:
!pip install transformerS

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [17]:

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')

tokenizer.save_pretrained('.')

fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [18]:

train_df=pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
test_df=pd.read_csv('../input/quora-insincere-questions-classification/test.csv')
train_set,test_set=train_test_split(train_df,test_size=0.2,random_state=2017)
print(train_set.shape)
print(test_set.shape)

(1044897, 3)
(261225, 3)


In [19]:

train_x = fast_encode(train_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
val_x = fast_encode(test_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
train_y=train_set['target'].values
val_y=test_set['target'].values
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)


HBox(children=(FloatProgress(value=0.0, max=4082.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1021.0), HTML(value='')))


(1044897, 192)
(1044897,)
(261225, 192)
(261225,)


In [20]:

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_x, val_y))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

print(train_dataset)
print(valid_dataset)

<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>
<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>


In [21]:

with strategy.scope():
    transformer_layer = (
        transformers.TFBertModel
        .from_pretrained('bert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 192, 768), (None, 177853440 
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 769       
Total params: 177,854,209
Trainable params: 177,854,209
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
___________________________

In [24]:


n_steps = train_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [25]:


n_steps = val_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [26]:

tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v1')

tokenizer.save_pretrained('.')

fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [27]:

train_df=pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
test_df=pd.read_csv('../input/quora-insincere-questions-classification/test.csv')
train_set,test_set=train_test_split(train_df,test_size=0.2,random_state=2017)
print(train_set.shape)
print(test_set.shape)

(1044897, 3)
(261225, 3)


In [28]:

train_x = fast_encode(train_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
val_x = fast_encode(test_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
train_y=train_set['target'].values
val_y=test_set['target'].values
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)


HBox(children=(FloatProgress(value=0.0, max=4082.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1021.0), HTML(value='')))


(1044897, 192)
(1044897,)
(261225, 192)
(261225,)


In [29]:

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_x, val_y))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

print(train_dataset)
print(valid_dataset)

<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>
<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>


In [30]:

with strategy.scope():
    transformer_layer = (
        transformers.TFAlbertModel
        .from_pretrained('albert-base-v1')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_albert_model_1 (TFAlbertM ((None, 192, 768), (None, 11683584  
_________________________________________________________________
tf_op_layer_strided_slice_3  [(None, 768)]             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 769       
Total params: 11,684,353
Trainable params: 11,684,353
Non-trainable params: 0
_________________________________________________________________
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
______________________________

In [31]:


n_steps = train_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [32]:

n_steps = val_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [33]:

tokenizer = transformers.BartTokenizer.from_pretrained('facebook/bart-base')

tokenizer.save_pretrained('.')
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False, wordpieces_prefix=##)

In [34]:

train_df=pd.read_csv('../input/quora-insincere-questions-classification/train.csv')
test_df=pd.read_csv('../input/quora-insincere-questions-classification/test.csv')
train_set,test_set=train_test_split(train_df,test_size=0.2,random_state=2017)
print(train_set.shape)
print(test_set.shape)

(1044897, 3)
(261225, 3)


In [35]:

train_x = fast_encode(train_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
val_x = fast_encode(test_set['question_text'].astype(str), fast_tokenizer, maxlen=MAX_LEN)
train_y=train_set['target'].values
val_y=test_set['target'].values
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)


HBox(children=(FloatProgress(value=0.0, max=4082.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1021.0), HTML(value='')))


(1044897, 192)
(1044897,)
(261225, 192)
(261225,)


In [36]:

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_x, val_y))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

print(train_dataset)
print(valid_dataset)

<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>
<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>


In [37]:

with strategy.scope():
    transformer_layer = (
        transformers.TFAlbertModel
        .from_pretrained('albert-base-v1')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_albert_model_2 (TFAlbertM ((None, 192, 768), (None, 11683584  
_________________________________________________________________
tf_op_layer_strided_slice_4  [(None, 768)]             0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 769       
Total params: 11,684,353
Trainable params: 11,684,353
Non-trainable params: 0
_________________________________________________________________
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
______________________________

In [38]:

n_steps = train_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [39]:


n_steps = val_x.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
