In [None]:
# Install the notebook's extra dependencies, then load the watermark extension
# and print the installed torch / pandas versions.
!pip install -q watermark
!pip install --ignore-installed PyYAML
!pip install transformers
%load_ext watermark
%watermark -p torch,pandas

In [None]:
# Clear pip's local cache.
!pip cache purge

In [None]:
# Upgrade transformers and huggingface_hub in the same pip invocation.
!pip install --upgrade transformers huggingface_hub


In [None]:
# NOTE(review): every `!` line runs in its own subshell, so the
# `source myenv/bin/activate` below cannot persist to later cells or change the
# interpreter the kernel is running on — confirm whether this virtualenv is
# actually needed.
!pip install virtualenv
!virtualenv myenv
!source myenv/bin/activate

In [None]:
# NOTE(review): pip is upgraded only after all the installs above have run.
!pip install --upgrade pip

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import os
from keras.models import Model
from keras.layers import Input, Embedding, Dense, LSTM, GRU, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, add, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

# Hyperparameters
MAX_LEN = 250
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 2 * TRAIN_BATCH_SIZE
EPOCHS = 5
LEARNING_RATE = 1e-05
EMBEDDING_FILE = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

# Prefer the first CUDA device when torch can see one.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
print("Device:", DEVICE)

# Load the Jigsaw train / test tables straight from the zipped CSVs.
train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

# Hugging Face tokenizer (its vocabulary is also read further down the file).
hf_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Missing comments are replaced by the literal token "fillna" before encoding.
train_comments = train["comment_text"].fillna("fillna").tolist()
test_comments = test["comment_text"].fillna("fillna").tolist()

# Same encoding options for both splits: truncate / pad, at most MAX_LEN tokens.
encode_options = dict(truncation=True, padding=True, max_length=MAX_LEN)
train_encodings = hf_tokenizer(train_comments, **encode_options)
test_encodings = hf_tokenizer(test_comments, **encode_options)

# NOTE(review): x_train / x_test are reassigned later in this file from the
# Keras tokenizer, so these Hugging Face id arrays are not what the model sees.
x_train = np.array(train_encodings['input_ids'])
x_test = np.array(test_encodings['input_ids'])

# One binary target column per toxicity label.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train[label_columns].values

# Load FastText embeddings
def get_coefs(word, *arr):
    """Pair an embedding-file token with its coefficients.

    Args:
        word: The token (first field of an embedding-file row).
        *arr: The remaining fields of the row; numeric values or numeric strings.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        built from ``arr``.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

embed_size = 300

# Parse the FastText text file into {token: float32 vector}. The file is opened
# in a `with` block so the handle is closed, and decoded explicitly as UTF-8
# rather than with the platform default encoding.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        word, vector = get_coefs(*line.rstrip().rsplit(' '))
        # Skip rows that do not carry exactly embed_size coefficients. This drops
        # the "<count> <dim>" header line of .vec files, which would otherwise be
        # stored as a 1-element vector and broadcast across a whole matrix row.
        if vector.shape[0] != embed_size:
            continue
        embeddings_index[word] = vector

# Rows of this matrix are indexed by the Hugging Face vocabulary ids; tokens
# without a pretrained vector keep an all-zero row.
word_index = hf_tokenizer.get_vocab()
max_features = min(150000, len(word_index) + 1)
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

def build_model(embedding_matrix):
    """Build and compile the recurrent multi-label classifier.

    Args:
        embedding_matrix: 2-D array whose shape is used as
            (input_dim, output_dim) of the frozen Embedding layer and whose
            values are its initial weights.

    Returns:
        A compiled Keras ``Model`` mapping a sequence of token ids to six
        sigmoid outputs, trained with binary cross-entropy and Adam.
    """
    vocab_size, embedding_dim = embedding_matrix.shape

    token_ids = Input(shape=(None,))
    embedding_layer = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,  # pretrained vectors are not updated during training
    )
    sequence_features = embedding_layer(token_ids)
    sequence_features = SpatialDropout1D(0.2)(sequence_features)
    sequence_features = Bidirectional(LSTM(512, return_sequences=True))(sequence_features)
    sequence_features = Bidirectional(GRU(512, return_sequences=True))(sequence_features)

    # Collapse the time axis two ways and concatenate: 2 poolings x 2 directions
    # x 512 units = 2048 features, matching the Dense widths below.
    max_pooled = GlobalMaxPooling1D()(sequence_features)
    avg_pooled = GlobalAveragePooling1D()(sequence_features)
    pooled = concatenate([max_pooled, avg_pooled])

    # Two residual dense blocks: each adds its ReLU projection back to its input.
    for _ in range(2):
        projected = Dense(2048, activation='relu')(pooled)
        pooled = add([pooled, projected])

    outputs = Dense(6, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=outputs)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier


# Word-level Keras tokenization. These ids are what the model is trained on, so
# the embedding matrix must be indexed by this tokenizer's vocabulary.
VOCAB_CAP = 150000
train_texts = train["comment_text"].fillna("fillna").tolist()
test_texts = test["comment_text"].fillna("fillna").tolist()

# num_words makes texts_to_sequences drop every id >= VOCAB_CAP, so no id can
# fall outside the Embedding layer's input range.
tokenizer = text.Tokenizer(filters='', num_words=VOCAB_CAP)
tokenizer.fit_on_texts(train_texts + test_texts)

# Rebuild the embedding matrix from the Keras vocabulary. A matrix indexed by a
# different tokenizer's ids would pair tokens with unrelated vectors and be too
# small for the ids produced here.
word_index = tokenizer.word_index
max_features = min(VOCAB_CAP, len(word_index) + 1)  # +1: id 0 is the padding id
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    # Ignore missing tokens and any entry of the wrong width (e.g. a parsed
    # header row), which would otherwise broadcast over the whole row.
    if embedding_vector is not None and len(embedding_vector) == embed_size:
        embedding_matrix[i] = embedding_vector

# Convert the comments to fixed-length id sequences (padded / truncated to MAX_LEN).
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_LEN)

# Hold out 10% of the training rows for validation.
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train, y_train, test_size=0.1)

# Training loop: train SEEDS independent models and average their test
# predictions into `pred`.
SEEDS = 10
pred = 0

for seed in range(SEEDS):
    # `seed` only numbers the ensemble member and its weights file; no random
    # generator is seeded here.
    model = build_model(embedding_matrix)
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")

        # One Keras epoch per fit() call so the learning rate can be halved
        # after every pass: 1e-3, 5e-4, 2.5e-4, ...
        history = model.fit(
            X_train,
            Y_train,
            validation_data=(X_valid, Y_valid),
            batch_size=128,
            epochs=1,
            verbose=2,
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.5 ** epoch))
            ]
        )

        # metrics=['accuracy'] is logged as 'acc' by older Keras releases and as
        # 'accuracy' by newer ones; resolve the key instead of hard-coding it.
        epoch_log = history.history
        acc_key = 'accuracy' if 'accuracy' in epoch_log else 'acc'

        train_loss = epoch_log['loss'][0]
        train_accuracy = epoch_log[acc_key][0]
        val_loss = epoch_log['val_loss'][0]
        val_accuracy = epoch_log['val_' + acc_key][0]

        print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        # Mean per-label ROC AUC on the training split.
        train_preds = model.predict(X_train)
        train_auc = np.mean([roc_auc_score(Y_train[:, i], train_preds[:, i]) for i in range(Y_train.shape[1])])
        print(f"Training ROC AUC: {train_auc:.4f}")

        val_preds = model.predict(X_valid)
        val_preds_binary = (val_preds > 0.5).astype(int)

        # Calculate F1 scores, avoiding issues with no positive predictions:
        # a label with zero predicted positives is scored 0.0.
        f1_scores = []
        for i in range(Y_valid.shape[1]):
            if np.sum(val_preds_binary[:, i]) == 0:
                f1_scores.append(0.0)
            else:
                f1_scores.append(f1_score(Y_valid[:, i], val_preds_binary[:, i], zero_division=1))
        avg_f1_score = np.mean(f1_scores)

        print(f"Validation F1 Score: {avg_f1_score:.4f}")

        # Mean per-label ROC AUC on the validation split.
        AUC = np.mean([roc_auc_score(Y_valid[:, i], val_preds[:, i]) for i in range(Y_valid.shape[1])])
        print(f"Validation ROC AUC: {AUC:.4f}")

    # Accumulate this model's share of the ensemble average and save the
    # running total after every model.
    pred += model.predict(x_test, batch_size=1024, verbose=1) / SEEDS
    np.save('pred', pred)

    # Persist the weights, then compress the file with the external gzip tool.
    model.save_weights(f'model_weights_{seed}.h5')
    os.system(f'gzip model_weights_{seed}.h5')
