In [None]:
import numpy as np
import pandas as pd
import os
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNGRU, Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Show the TensorFlow version in use, then ask TensorFlow whether a GPU is
# available; the call is the cell's last expression so its result is displayed.
print(tf.__version__)
tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None)

In [None]:
import re

# Stop-word list; extend as needed.
stopwords = ["the", "and", "is", "on", "in", "if", "for", "a", "an", "of", "or", "to", "it", "you", "your"]

def clean_text(text):
    """Normalise a raw comment into lower-case, space-separated alphabetic words.

    Steps, in order: drop HTML tags, turn newlines into spaces, drop web links,
    drop every character that is neither an ASCII letter nor whitespace, then
    remove the words listed in the module-level ``stopwords``.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned, lower-cased string with single spaces between words and
        no leading or trailing whitespace.
    """
    # Tags and newlines are replaced by a space (not by nothing) so that the
    # words on either side are not glued together ("a<br>b" -> "a b").
    text = re.sub(r'<[^>]+>', ' ', text)
    text = text.replace('\n', ' ')

    # Remove web links ("https..." is already matched by the "http" branch).
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove digits, punctuation and every other non-letter character.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # split() also collapses runs of whitespace, so no separate pass is needed.
    kept_words = [word for word in text.lower().split() if word not in stopwords]
    return ' '.join(kept_words)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import plotly.express as px
# import plotly.offline as pyo
# import plotly.graph_objects as go
import re
from wordcloud import WordCloud, STOPWORDS

import os
import zipfile
import warnings
import random
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
# Helper function to perform synonym replacement
stopwords = ["the", "and", "is", "on", "in", "if", "for", "a", "an", "of", "or", "to", "it", "you", "your"]
def synonym_replacement(text, n=5):
    """Replace up to ``n`` distinct non-stop-words of ``text`` with a synonym.

    Candidate words are shuffled, and each one that ``get_synonyms`` can
    resolve is replaced (in every position where it occurs) by one randomly
    chosen synonym, until ``n`` distinct words have been replaced.

    Args:
        text: Whitespace-separated input text.
        n: Maximum number of distinct words to replace; ``n <= 0`` replaces
            nothing.

    Returns:
        The augmented text, with words joined by single spaces.
    """
    words = text.split()
    if n <= 0 or not words:
        return ' '.join(words)

    # Compare case-insensitively so capitalised stop words ("A", "In", "It")
    # are not treated as replaceable content words.
    stopword_set = {stopword.lower() for stopword in stopwords}
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # shuffle below is reproducible once ``random`` has been seeded.
    candidates = [word for word in dict.fromkeys(words) if word.lower() not in stopword_set]
    random.shuffle(candidates)

    new_words = words.copy()
    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of strings.

    Every lemma name of every synset returned by ``wordnet.synsets(word)`` is
    lower-cased, has ``_`` and ``-`` turned into spaces, and is stripped of
    all characters other than letters and spaces.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct, non-empty synonyms that differ from
        ``word`` (compared case-insensitively). Sorted so that a seeded
        ``random.choice`` over the result is reproducible.
    """
    target = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            # Keep the spaces so multi-word lemmas stay readable
            # ("ice_cream" -> "ice cream", not "icecream").
            candidate = "".join(char for char in candidate if char.isalpha() or char == " ")
            candidate = " ".join(candidate.split())
            # Skip lemmas that reduce to nothing (e.g. purely numeric names)
            # and the looked-up word itself.
            if candidate and candidate != target:
                synonyms.add(candidate)
    return sorted(synonyms)

# Helper function to perform random insertion
def random_insertion(text, n=3):
    """Insert up to ``n`` extra words at random positions of ``text``.

    Each inserted word comes from ``get_random_word`` applied to the words of
    the original text.

    Args:
        text: Whitespace-separated input text.
        n: Number of insertions to attempt.

    Returns:
        The augmented text joined by single spaces, or ``text`` unchanged when
        it contains no words.
    """
    words = text.split()
    if not words:
        # Nothing to sample from; avoids inserting None into the word list.
        return text
    new_words = words.copy()
    for _ in range(n):
        # Pass the token list, not the raw string: random.choice on a string
        # would pick a single character instead of a word.
        word_to_insert = get_random_word(words)
        if not word_to_insert:
            continue
        random_index = random.randint(0, len(new_words))
        new_words.insert(random_index, word_to_insert)
    return ' '.join(new_words)

def get_random_word(word_source):
    """Pick a random word from ``word_source`` and return one of its synonyms.

    Placeholder strategy: replace with your own method to get random words.

    Args:
        word_source: A sequence of words, or a string, which is split on
            whitespace first so that a word (not a character) is chosen.

    Returns:
        A random synonym of the chosen word; the chosen word itself when
        ``get_synonyms`` finds none; ``None`` when ``word_source`` holds no
        words.
    """
    if isinstance(word_source, str):
        word_source = word_source.split()
    if not word_source:
        return None
    word = random.choice(word_source)
    if word:
        synonyms = get_synonyms(word)
        if synonyms:
            return random.choice(synonyms)
    return word  # Return the original word if no synonyms are found

# Helper function to perform random deletion
def random_deletion(text, p=0.2):
    """Drop each word of ``text`` independently with probability ``p``.

    Args:
        text: Whitespace-separated input text.
        p: Per-word deletion probability.

    Returns:
        The surviving words joined by single spaces. Text with zero or one
        word is returned unchanged, and if every word was dropped a single
        randomly chosen original word is returned instead.
    """
    words = text.split()
    # "<= 1" also covers empty / whitespace-only text, which would otherwise
    # reach random.choice([]) below and raise IndexError.
    if len(words) <= 1:
        return text
    kept_words = [word for word in words if random.uniform(0, 1) > p]
    if not kept_words:
        return random.choice(words)
    return ' '.join(kept_words)

In [None]:
# Pre-trained word-vector files; build_matrix is called once per entry and the
# resulting matrices are concatenated column-wise.
EMBEDDING_FILES = [
    '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec',
    '../input/glove840b300dtxt/glove.840B.300d.txt',
]

# NOTE(review): the training cell passes batch_size=128 literally, so
# BATCH_SIZE is not what model.fit actually uses - confirm which is intended.
BATCH_SIZE = 512
LSTM_UNITS = 128  # units per direction of the recurrent layers in build_model
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS  # not referenced by build_model in this notebook
EPOCHS = 4  # re-assigned (to the same value) in the training cell
MAX_LEN = 220  # pad/truncate length passed to sequence.pad_sequences


TEXT_COLUMN = 'comment_text'
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Characters the Keras tokenizer filters out. The backslash before "×" is
# written as "\\" because "\×" is an invalid escape sequence (it only works
# through a deprecated fallback); the resulting string value is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

In [None]:
# Read the Jigsaw toxic-comment training CSV from the Kaggle input directory.
train_df = pd.read_csv(
    '/kaggle/input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv'
)
train_df  # last expression of the cell: rendered as its output

In [None]:
# Split first: 80% of the rows go to training, and the remaining 20% are cut
# in half into the test and validation parts. The same frame is passed as both
# X and y, so every X_* / Y_* pair here holds identical rows; the label and
# text columns are separated in a later cell.
X_train, X_valid_test, Y_train, Y_valid_test = train_test_split(
    train_df, train_df, test_size=0.2, random_state=42
)
X_test, X_valid, Y_test, Y_valid = train_test_split(
    X_valid_test, Y_valid_test, test_size=0.5, random_state=42
)

In [None]:
# After the split, pick out the training rows that carry at least one label;
# these are the comments later fed to the WordNet-based augmentation.
# `assign` builds a new frame instead of writing a column into the object
# returned by train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_train = X_train.assign(total=X_train[list_classes].sum(axis=1))
X_toxic_train = X_train[X_train['total'] != 0]  # roughly 13k rows, per the original author's note

In [None]:
def _text_and_labels(frame):
    """Split one data frame into (comment text as a str Series, label array)."""
    labels = frame[list_classes].values
    texts = frame[TEXT_COLUMN].astype(str)
    return texts, labels


# Each X_* name is rebound from a full data frame to its text column only,
# and the matching Y_* name to the array of label columns.
X_train, Y_train = _text_and_labels(X_train)
X_toxic_train, Y_toxic_train = _text_and_labels(X_toxic_train)
X_test, Y_test = _text_and_labels(X_test)
X_valid, Y_valid = _text_and_labels(X_valid)

# The scratch name `y` is left bound to the validation labels, as before.
y = Y_valid

In [None]:
# Synonym replacement: rewrite every toxic training comment (up to 3 distinct
# words swapped each), then prepend the rewritten comments and their labels to
# the training set. Series.apply replaces the per-label Python loop and
# forwards n=3 to synonym_replacement.
X_toxic_train = X_toxic_train.apply(synonym_replacement, n=3)
X_train = pd.concat([X_toxic_train, X_train])
Y_train = np.concatenate([Y_toxic_train, Y_train], axis=0)

In [None]:
# Synonym replacement, second pass: the already-rewritten toxic comments are
# rewritten once more (up to 3 distinct words each) and prepended again.
# NOTE(review): this cell repeats the previous one verbatim - confirm that a
# second augmentation round is intended and not an accidental duplicate.
X_toxic_train = X_toxic_train.apply(synonym_replacement, n=3)
X_train = pd.concat([X_toxic_train, X_train])
Y_train = np.concatenate([Y_toxic_train, Y_train], axis=0)

In [None]:
# Random insertion (disabled)
# for index in X_toxic_train.index:
#     X_toxic_train[index]=random_insertion(X_toxic_train[index], n=3)
# X_train = pd.concat([X_toxic_train, X_train])
# Y_train = np.concatenate([Y_toxic_train, Y_train], axis=0)

In [None]:
# Random deletion (disabled)
# for index in X_toxic_train.index:
#     X_toxic_train[index]=random_deletion(X_toxic_train[index], p=0.2)
# X_train = pd.concat([X_toxic_train, X_train])
# Y_train = np.concatenate([Y_toxic_train, Y_train], axis=0)

In [None]:
# # Further cleaning of the text data (disabled)
# for index in X_train.index:
#     X_train[index]=clean_text(X_train[index])
# for index in X_test.index:
#     X_test[index]=clean_text(X_test[index])
# for index in X_valid.index:
#     X_valid[index]=clean_text(X_valid[index])

In [None]:
import tensorflow as tf
from tensorflow.keras import backend as K

# def focal_loss(gamma=3., alpha=0.1): # old version; this one really works
#     def focal_loss_fixed(y_true, y_pred):
#         pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
#         pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

#         epsilon = K.epsilon()
#         pt_1 = K.clip(pt_1, epsilon, 1. - epsilon)
#         pt_0 = K.clip(pt_0, epsilon, 1. - epsilon)
#         print("y_true: ",y_true)
#         print("y_pred: ",y_pred)
#         return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))

#     return focal_loss_fixed
import tensorflow as tf
from tensorflow.keras import backend as K

def focal_loss(alpha=0.75, gamma=4):
    """Create a focal-loss function with the given weighting parameters.

    Args:
        alpha: Weight of the positive-label term; the negative-label term is
            weighted by ``1 - alpha``.
        gamma: Exponent of the modulating factor.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` that returns the loss
        summed over every element of its inputs.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep predictions strictly inside (0, 1) so both logarithms are finite.
        eps = K.epsilon()
        probs = K.clip(y_pred, eps, 1. - eps)

        # Term used where the label is 1, and term used where the label is 0.
        positive_term = -alpha * K.pow(1. - probs, gamma) * K.log(probs)
        negative_term = -(1. - alpha) * K.pow(probs, gamma) * K.log(1. - probs)

        return K.sum(y_true * positive_term + (1. - y_true) * negative_term)
    return focal_loss_fixed

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its remaining fields converted to a float32 vector.

    Args:
        word: The token.
        *arr: The vector components (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector


def load_embeddings(path):
    """Read a text embedding file into a ``{word: vector}`` dictionary.

    Each line is expected to hold a token followed by its space-separated
    vector components; parsing is delegated to ``get_coefs``.

    Args:
        path: Path of the embedding file (read as UTF-8).

    Returns:
        A dict mapping each token to its vector. When a token occurs more than
        once, the last occurrence wins.
    """
    embedding_index = {}
    # Explicit encoding: the result must not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            fields = line.strip().split(' ')
            # word2vec/fastText ".vec" files start with a "<count> <dim>"
            # header; skip it so it is not stored as a word with a
            # one-element vector.
            if line_number == 0 and len(fields) == 2 and all(field.isdigit() for field in fields):
                continue
            word, vector = get_coefs(*fields)
            embedding_index[word] = vector
    return embedding_index


def build_matrix(word_index, path, dim=300):
    """Build the embedding matrix for a tokenizer vocabulary.

    Args:
        word_index: Mapping of word -> integer row index (row 0 is left as
            zeros; the matrix has ``len(word_index) + 1`` rows).
        path: Embedding file, loaded through ``load_embeddings``.
        dim: Expected vector length; defaults to 300 as before.

    Returns:
        A ``(len(word_index) + 1, dim)`` array whose row ``i`` holds the
        vector of the word indexed ``i``. Rows of words without a usable
        vector stay zero.
    """
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, dim))
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        # Ignore entries of the wrong length: a one-element vector (such as a
        # parsed "<count> <dim>" file header) would otherwise be broadcast
        # across the whole row.
        if vector is not None and np.shape(vector) == (dim,):
            embedding_matrix[i] = vector
    return embedding_matrix

def build_model(embedding_matrix):
    """Build and compile the two-branch recurrent classifier.

    The token ids are embedded with the frozen ``embedding_matrix`` and passed
    through spatial dropout, then fed to two parallel branches (bidirectional
    GRU + Conv1D, bidirectional LSTM + Conv1D). Each branch is average- and
    max-pooled over time, the four pooled vectors are concatenated, and a
    6-unit sigmoid layer produces the outputs.

    Args:
        embedding_matrix: 2-D array used as the non-trainable embedding
            weights; its shape gives the embedding layer's input and output
            dimensions.

    Returns:
        The Keras model, compiled with binary cross-entropy, Adam and an
        accuracy metric.
    """
    vocab_size, embedding_dim = embedding_matrix.shape

    token_ids = Input(shape=(None,))
    embedded = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)

    # GRU branch.
    gru_features = Bidirectional(CuDNNGRU(LSTM_UNITS, return_sequences=True))(embedded)
    gru_features = Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform")(gru_features)

    # LSTM branch, fed from the same dropped-out embeddings.
    lstm_features = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(embedded)
    lstm_features = Conv1D(64, kernel_size=2, padding="valid", kernel_initializer="he_uniform")(lstm_features)

    pooled = [
        GlobalAveragePooling1D()(gru_features),
        GlobalMaxPooling1D()(gru_features),
        GlobalAveragePooling1D()(lstm_features),
        GlobalMaxPooling1D()(lstm_features),
    ]
    merged = concatenate(pooled)

    predictions = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=token_ids, outputs=predictions)

    # Alternative that was tried: loss=focal_loss() in place of binary cross-entropy.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    return model

In [None]:
%%time
# Fit one tokenizer on the text of all three splits, filtering out the
# characters listed in CHARS_TO_REMOVE.
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(list(X_train) + list(X_test) + list(X_valid))

# Replace each split's text by its integer sequences, padded/truncated to
# MAX_LEN; the splits are processed in the order train, test, valid.
X_train, X_test, X_valid = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(split_texts), maxlen=MAX_LEN)
    for split_texts in (X_train, X_test, X_valid)
)

In [None]:
%%time
# One matrix per embedding file, joined along the last axis so that every
# vocabulary row holds the vectors from all files side by side.
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, embedding_path) for embedding_path in EMBEDDING_FILES],
    axis=-1,
)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score
from keras.callbacks import LearningRateScheduler
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score

EPOCHS = 4
# Number of independent training runs whose test metrics are averaged.
# NOTE(review): no random seed is set anywhere in this cell, so the runs
# differ only through unseeded initialisation/shuffling - confirm intent.
SEEDS = 4


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


n_classes = len(list_classes)

overall_accuracy = 0  # never updated below; kept so the name stays defined
# all_paramater[class][k] accumulates, over the runs, metric k of that class:
# 0 true-negative rate, 1 precision, 2 recall, 3 specificity, 4 F1, 5 accuracy.
all_paramater = [[0.0] * 6 for _ in range(n_classes)]  # 2D-list
for ii in range(SEEDS):
    model = build_model(embedding_matrix)
    for global_epoch in range(EPOCHS):
        print(global_epoch)
        # One Keras epoch per call, so the learning rate can be set from the
        # outer epoch counter: 1e-3 * 0.55 ** global_epoch.
        model.fit(
            X_train,
            Y_train,
            validation_data=(X_valid, Y_valid),
            batch_size=128,
            epochs=1,
            verbose=2,
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.55 ** global_epoch))
            ]
        )
        val_preds = model.predict(X_valid)
        # Mean of the per-class ROC AUC values on the validation split.
        AUC = 0
        for i in range(n_classes):
            AUC += roc_auc_score(Y_valid[:, i], val_preds[:, i]) / n_classes
        print("Validation AUC:", AUC)

    test_preds = model.predict(X_test)
    for i in range(n_classes):

        y_true = Y_test[:, i]
        y_pred = (test_preds[:, i] > 0.5).astype(int)

        # labels=[0, 1] forces a 2x2 matrix even if only one label value
        # occurs, so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

        # The ratios fall back to 0.0 on an empty denominator (e.g. a rare
        # class with no predicted positives) instead of producing NaN, which
        # would otherwise turn the averaged value into NaN as well.
        tnr = _safe_ratio(tn, tn + fp)
        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)
        # Specificity is the same quantity as the true-negative rate; both
        # are kept because both appear in the printed report.
        specificity = tnr
        f1 = f1_score(y_true, y_pred)
        accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)

        for slot, value in enumerate((tnr, precision, recall, specificity, f1, accuracy)):
            all_paramater[i][slot] += value

        print(f"Class {i + 1} - True Negative Rate: {tnr}, Precision: {precision}, Recall: {recall}, Specificity: {specificity}, F1 Score: {f1}, Accuracy: {accuracy}")

print("\n\n")
for i in range(n_classes):
    avg_tnr, avg_precision, avg_recall, avg_specificity, avg_f1, avg_accuracy = (
        total / SEEDS for total in all_paramater[i]
    )
    print(f"AVG: Class {i + 1} - True Negative Rate:{avg_tnr},Precision:{avg_precision},Recall:{avg_recall},Specificity:{avg_specificity},F1 Score:{avg_f1},Accuracy:{avg_accuracy}")
