In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import gc
import sys
import math 
import tensorflow.keras as keras
from tensorflow.keras.layers.experimental import preprocessing
%matplotlib inline
import os
import pathlib
import transformers 
import datasets 
import time 
import tqdm 
from tqdm import tqdm 
import random
from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedKFold 
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification


In [None]:
# Run-wide hyperparameters, kept together in one dictionary.
config = {
    'nfolds': 10,
    # This key used to appear twice in the literal (32 first, 8 later). Python
    # keeps only the last value of a repeated key, so 8 was always the batch
    # size in effect; the dead 32 entry is dropped so the literal says so.
    'batch_size': 8,
    'learning_rate': 1e-4,
    'num_epochs': 3,
}
# Alias for tf.data's autotune constant; nothing else in the visible cells references it.
AUTOTUNE = tf.data.experimental.AUTOTUNE
def seed_all(s):
    """Seed every random number source used by this notebook with one value.

    Args:
        s: Seed handed to Python's ``random`` module, NumPy's global RNG and
            TensorFlow's global seed. Its string form is also stored in the
            ``PYTHONHASHSEED`` environment variable.

    Side effects:
        Sets ``TF_CUDNN_DETERMINISTIC`` to ``'1'`` and ``PYTHONHASHSEED`` to
        ``str(s)`` in ``os.environ``.
    """
    # Same order as before: Python, then NumPy, then TensorFlow.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(s)

    # NOTE(review): PYTHONHASHSEED is presumably only honoured by interpreters
    # started after this point, not the already-running kernel -- confirm.
    os.environ.update({
        'TF_CUDNN_DETERMINISTIC': '1',
        'PYTHONHASHSEED': str(s),
    })
# Seed all RNGs before any sampling happens further down.
global_seed = 42
seed_all(global_seed)

# Read the raw comments and collapse the six toxicity flags into one binary
# target: y is 1 when the flags sum to more than zero, otherwise 0. Only the
# comment text (renamed to 'text') and y are kept.
df = (
    pd.read_csv('train.csv')
    .assign(
        y=lambda raw: raw[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
        .sum(axis=1)
        .gt(0)
        .astype(int)
    )
    .rename(columns={'comment_text': 'text'})
    [['text', 'y']]
)
# Share of each class in y; the value is not assigned, so it is only shown
# when this is the last expression of its cell.
df['y'].value_counts(normalize=True)

# Balance the classes by undersampling: keep every y == 1 row and draw the
# same number of y == 0 rows, reproducibly via global_seed.
min_len = df['y'].eq(1).sum()
df_y0_undersample = df.loc[df['y'].eq(0)].sample(n=min_len, random_state=global_seed)
# Positives first, sampled negatives after; the result gets a fresh 0..n-1 index.
train_df = pd.concat([df.loc[df['y'].eq(1)], df_y0_undersample], ignore_index=True)

# Take the fold count from the shared config (10) rather than repeating the
# literal here, so there is a single place to change it.
n_folds = config['nfolds']
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=global_seed)

# Create the column up front as integers. Assigning into a not-yet-existing
# column through .loc first fills it with NaN, which turns fold ids into floats.
train_df['fold'] = -1

# Each row is labelled with the fold in which it serves as validation data.
# skf.split yields positional indices; these match .loc labels because
# train_df carries a fresh 0..n-1 index.
for nfold, (train_index, val_index) in enumerate(skf.split(X=train_df.index, y=train_df.y)):
    train_df.loc[val_index, 'fold'] = nfold

# Fold held out for validation; every other fold is used for training.
p_fold = 0
p_train = train_df[train_df['fold'].ne(p_fold)].reset_index(drop=True)
p_valid = train_df[train_df['fold'].eq(p_fold)].reset_index(drop=True)

# Name of the pretrained checkpoint the tokenizer is loaded from.
checkpoint = "bert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)

# Wrap both pandas splits as `datasets.Dataset` objects (train first, then validation).
train_ds, valid_ds = (
    datasets.Dataset.from_pandas(split_frame) for split_frame in (p_train, p_valid)
)

def tokenize_function(example):
    """Tokenize the ``"text"`` entry of a dataset example or batch.

    Args:
        example: Mapping with a ``"text"`` key, as passed in by ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with that
        text and ``truncation=True``. No padding is requested here.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)

# Tokenize both splits in batches; this adds the tokenizer's output columns to each dataset.
tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_valid_ds = valid_ds.map(tokenize_function, batched=True)

# Pads each batch to its longest sequence at collation time. return_tensors is
# set explicitly: the collator's default is PyTorch tensors, which would make
# this TensorFlow pipeline depend on torch being installed.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Build the tf.data pipelines for both splits with identical settings; only
# the training split is shuffled. Train is built first, then validation.
tf_train_ds, tf_valid_ds = (
    tokenized_split.to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["y"],
        shuffle=shuffle_split,
        collate_fn=data_collator,
        batch_size=config['batch_size'],
    )
    for tokenized_split, shuffle_split in (
        (tokenized_train_ds, True),
        (tokenized_valid_ds, False),
    )
)
# Two-label sequence classifier loaded from the same checkpoint as the tokenizer.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# NOTE(review): config['num_epochs'] (3) and config['learning_rate'] (1e-4) are
# not read here; the literals 2 and 5e-5 below are what takes effect. Confirm
# which values are intended.
num_epochs = 2
# Total optimizer steps: batches per epoch times epochs.
num_train_steps = num_epochs * len(tf_train_ds)

# Learning rate decays from 5e-5 to 0 over the whole training run.
lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=num_train_steps,
    end_learning_rate=0.0,
)

# from_logits=True: the loss is told that the model output is unnormalised logits.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_scheduler),
    metrics=['accuracy'],
)

fit_history = model.fit(
    tf_train_ds,
    validation_data=tf_valid_ds,
    epochs=num_epochs,
    verbose=1,
)

# Comments to be scored, run through the same tokenization as the training data.
test_df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
test_ds = datasets.Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(tokenize_function, batched=True)

# Unshuffled and without label columns.
# NOTE(review): nothing in the visible cells consumes tf_test_ds afterwards.
tf_test_ds = tokenized_test_ds.to_tf_dataset(
    batch_size=config['batch_size'],
    collate_fn=data_collator,
    shuffle=False,
    columns=["attention_mask", "input_ids", "token_type_ids"],
)

In [None]:
!pip install detoxify
from detoxify import Detoxify
model = Detoxify('original')
model.predict(['example text','you are a fucker'])
model.predict('you are a fucker')