In [None]:
!pip install --upgrade pip

In [None]:
!pip install -q watermark

In [None]:
!pip install --ignore-installed PyYAML

In [None]:
!pip install transformers

In [None]:
%load_ext watermark
%watermark -p torch,pandas

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.callbacks import Callback
import warnings
import os

# Session-wide setup: set the OpenMP thread-count variable to 4 and silence
# every Python warning.
# NOTE(review): this runs after numpy/keras have already been imported above;
# presumably the variable must be set before those libraries start their
# thread pools to have any effect -- confirm.
os.environ['OMP_NUM_THREADS'] = '4'
warnings.simplefilter('ignore')

# Load data: the three competition files are read straight from their zip
# archives.
train = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
)
test = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
)
submission = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
)


from transformers import AutoTokenizer

# Fixed sequence length. The network's Input layer is declared with shape
# (MAX_LEN,), so every encoded comment must contain exactly MAX_LEN token ids.
MAX_LEN = 250
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# padding='max_length' pads every row up to MAX_LEN, and truncation=True cuts
# longer rows down to it, so both arrays below always have MAX_LEN columns.
# The previous padding=True only padded to the longest sequence of each call,
# which ties the array width to the data and lets train/test widths differ.
# Missing comments are replaced by the literal placeholder "fillna".
train_encodings = tokenizer(
    train["comment_text"].fillna("fillna").tolist(),
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
)
test_encodings = tokenizer(
    test["comment_text"].fillna("fillna").tolist(),
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
)

# Only the token ids are used as model input; the other tokenizer outputs
# stay available in train_encodings / test_encodings.
x_train = np.array(train_encodings['input_ids'])
x_test = np.array(test_encodings['input_ids'])

# One binary target column per label, in this fixed order.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values


# Location of the pre-trained word vectors; the file is read as
# space-separated text, one entry per line.
EMBEDDING_FILE = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'


def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: The token; returned unchanged.
        *arr: The vector components. Numeric strings are accepted and are
            converted to float32.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        holding the values of ``arr``.
    """
    vector = np.array(arr, dtype='float32')
    return word, vector
# Parse the vector file into {token: float32 vector}. The `with` block closes
# the file handle once the dict is built, and the explicit encoding keeps the
# parse independent of the platform's default locale.
# NOTE(review): if the file starts with a "<count> <dim>" header line, that
# line is stored like any other entry (keyed by its first field) -- confirm
# whether it should be skipped.
with open(EMBEDDING_FILE, encoding='utf-8') as vec_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in vec_file
    )

# Build the initial weight table for the Embedding layer: row k holds the
# pre-trained vector of the tokenizer entry whose id is k. Entries that have
# no pre-trained vector keep an all-zero row.
# NOTE(review): the vocabulary comes from the transformer tokenizer while the
# vectors are keyed by whole words, so presumably many entries stay zero --
# confirm the coverage is acceptable.
word_index = tokenizer.get_vocab()
embed_size = 300
max_features = min(150000, len(word_index) + 1)
embedding_matrix = np.zeros((max_features, embed_size))
for token, token_id in word_index.items():
    # Ids beyond the table size are ignored.
    if token_id < max_features:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[token_id] = vector

class RocAucEvaluation(Callback):
    """Keras callback that prints ROC-AUC and macro-F1 after training epochs.

    Both metrics are computed with the current model on the validation data
    and on the training data, then printed; nothing is written to ``logs``.

    Args:
        training_data: ``(X, y)`` pair used for the training metrics.
        validation_data: ``(X, y)`` pair used for the validation metrics.
        interval: Metrics are reported on epochs whose zero-based index is a
            multiple of this value.

    Raises:
        ValueError: If a data argument does not unpack into exactly two
            items (this includes the empty-tuple defaults).
    """

    def __init__(self, training_data=(), validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__; the previous
        # super(Callback, self) started the lookup *after* Callback and so
        # skipped the base-class initialisation entirely.
        super().__init__()
        self.interval = interval
        self.X_train, self.y_train = training_data
        self.X_val, self.y_val = validation_data

    def _scores(self, features, targets):
        """Return ``(roc_auc, macro_f1)`` of the current model on one dataset."""
        predictions = self.model.predict(features, verbose=0)
        roc_auc = roc_auc_score(targets, predictions)
        # F1 needs hard labels: threshold the predicted probabilities at 0.5.
        f1 = f1_score(targets, (predictions > 0.5).astype(int), average='macro')
        return roc_auc, f1

    def on_epoch_end(self, epoch, logs=None):
        """Print train/validation ROC-AUC and macro-F1 for a reporting epoch.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metric dict supplied by Keras; unused here.
        """
        if epoch % self.interval != 0:
            return

        # Validation metrics
        roc_auc, f1 = self._scores(self.X_val, self.y_val)

        # Training metrics
        roc_auc_train, f1_train = self._scores(self.X_train, self.y_train)

        print(f"\nEpoch: {epoch+1}")
        print(f"Training ROC-AUC: {roc_auc_train:.6f}, Training F1: {f1_train:.6f}")
        print(f"Validation ROC-AUC: {roc_auc:.6f}, Validation F1: {f1:.6f}\n")


# Heights (in tokens) of the convolution windows; one Conv2D branch is built
# per entry, each with `num_filters` output channels.
filter_sizes = [1, 2, 3, 5, 7]
num_filters = 64


def get_model():
    """Build and compile the multi-branch text CNN.

    The token ids are embedded (initial weights taken from
    ``embedding_matrix``), regularised with spatial dropout and reshaped to a
    single-channel image of size ``MAX_LEN x embed_size``. One convolution
    branch per entry of ``filter_sizes`` slides a full-width window over that
    image and is max-pooled down to a single value per filter. The pooled
    branches are concatenated and passed through a 1024-unit dense layer to
    six sigmoid outputs.

    Reads the module-level ``MAX_LEN``, ``max_features``, ``embed_size``,
    ``embedding_matrix``, ``filter_sizes`` and ``num_filters``.

    Returns:
        The model, compiled with binary cross-entropy loss, the Adam
        optimizer and an accuracy metric.
    """
    inp = Input(shape=(MAX_LEN, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], input_length=MAX_LEN)(inp)
    x = SpatialDropout1D(0.2)(x)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    x = Reshape((MAX_LEN, embed_size, 1))(x)

    # One branch per window size. Driving this from `filter_sizes` (instead of
    # five hand-written branches indexed 0..4) keeps the model in step with
    # the list when sizes are added or removed.
    pooled_branches = []
    for window in filter_sizes:
        conv = Conv2D(
            num_filters,
            kernel_size=(window, embed_size),
            kernel_initializer='normal',
            activation='elu',
        )(x)
        # With the default 'valid' padding the convolution output has
        # MAX_LEN - window + 1 rows, so this pool spans all of them.
        pooled = MaxPool2D(pool_size=(MAX_LEN - window + 1, 1))(conv)
        pooled_branches.append(pooled)

    z = Concatenate(axis=1)(pooled_branches)
    z = Flatten()(z)
    z = Dropout(0.2)(z)
    z = Dense(1024, activation='relu')(z)
    z = Dropout(0.2)(z)

    outp = Dense(6, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Training hyper-parameters.
batch_size = 512
epochs = 5

model = get_model()

# Keep 80% of the labelled rows for fitting and hold the rest out for
# validation; the fixed seed makes the split reproducible.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.8,
    random_state=233,
)

# Report ROC-AUC / F1 on both splits after every epoch.
RocAuc = RocAucEvaluation(
    training_data=(X_tra, y_tra),
    validation_data=(X_val, y_val),
    interval=1,
)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2,
)

# Predicted label probabilities for the test comments.
y_pred = model.predict(x_test, batch_size=1024)
