In [1]:
# Basic prerequisites
import numpy as np
import pandas as pd
import os, re, unidecode, random, math
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Stuff related to model and its training
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [2]:
def data_cleaner(data):
    data = data.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
    
    soup = BeautifulSoup(data, 'html.parser')
    data = soup.get_text(separator=' ')
    
    remove_http = re.sub(r'http\S+', '', data)
    data = re.sub(r"\ [A-Za-z]*\.com", " ", remove_http)
    
    data = unidecode.unidecode(data)
    data = data.lower()
    data = re.sub(r"[^a-zA-Z0-9:$-,()%.?!]+", ' ', data) 
    data = re.sub(r"[:$-,()%.?!]+", ' ',data)
    
    stoplist = stopwords.words("english")
    data = [word for word in word_tokenize(data) if word not in stoplist]
    data = " ".join(data)
    
    return data

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

def seed_everything(SEED = 13):
    np.random.seed(SEED)
    random.seed(SEED)
    tf.random.set_seed(SEED)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    os.environ['PYTHONHASHSEED'] = str(SEED)
    
seed_everything()
print('seeded everything to get same output')

seeded everything to get same output


In [3]:
train_df = pd.read_csv('../input/toxic-comment/jigsaw-toxic-comment-train.csv')
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
train_df["y"] = (train_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].sum(axis=1) > 0).astype(int)
train_df.drop(["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"], axis=1, inplace = True)
train_df.head()

Unnamed: 0,comment_text,y
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
train_df.shape

(223549, 2)

In [6]:
train_df.drop_duplicates(subset=['comment_text'],keep='first',inplace=True)
train_df.shape

(223549, 2)

In [7]:
X = np.array(train_df["comment_text"].values)
X = X.reshape(-1,1)
y = np.array(train_df["y"].values)
rus = RandomUnderSampler(random_state=0)
train, target = rus.fit_resample(X, y)

train = train.flatten()
train_df = pd.DataFrame()
train_df["text"] = train
train_df["target"] = target

# Now its balanced
train_df["target"].value_counts()

0    22468
1    22468
Name: target, dtype: int64

In [8]:
train_df['text'] = [data_cleaner(train_df['text'][i]) for i in tqdm(range(train_df.shape[0]))]
train_df.head()

100%|██████████| 44936/44936 [00:47<00:00, 942.22it/s]


Unnamed: 0,text,target
0,tyrrell head nutcase,0
1,sockpuppet template give comment made laugh pu...,0
2,similar arguments made local cultural traditio...,0
3,invitation take part study wikipedian studying...,0
4,hier hast du du bloder affe,0


In [9]:
from tensorflow.keras import backend as K
def ALReLU(x):
    alpha = 0.01
    return K.minimum(K.abs(alpha * x), x)

from tensorflow.keras.utils import get_custom_objects
get_custom_objects().update({'ALReLU': tf.keras.layers.Activation(ALReLU)})

In [10]:
model = Sequential()
model.add(Embedding(50000, 128, input_length = 300))
model.add(LSTM(512, return_sequences = True))
model.add(Dropout(0.1))
model.add(LSTM(256, return_sequences = True))
model.add(Dropout(0.1))
model.add(LSTM(128, return_sequences = True))
model.add(Dropout(0.1))
model.add(LSTM(64, return_sequences = True))
model.add(Dropout(0.1))
model.add(LSTM(32, return_sequences = False))
model.add(Dropout(0.2))
model.add(Dense(16, activation = ALReLU))
model.add(Dropout(0.25))
model.add(Dense(1, activation = 'sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 128)          6400000   
_________________________________________________________________
lstm (LSTM)                  (None, 300, 512)          1312768   
_________________________________________________________________
dropout (Dropout)            (None, 300, 512)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 300, 256)          787456    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 256)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 300, 128)          197120    
_________________________________________________________________
dropout_2 (Dropout)          (None, 300, 128)          0

In [11]:
# Preprocessing
text = train_df['text']
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(text.values)
x_train = tokenizer.texts_to_sequences(text.values)
x_train = pad_sequences(x_train, maxlen=300)
print('generated pad sequences')

generated pad sequences


In [12]:
y_train = train_df['target']
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['AUC']
)

lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.25,
    patience=3,
    verbose=0,
    mode='min'
)

chk_point = ModelCheckpoint(
    '/kaggle/working/best_model.h5',
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    mode='min'
)

es = EarlyStopping(
    patience=5,
    min_delta=0,
    monitor='val_loss',
    restore_best_weights=True,
    verbose=0,
    mode='min',
    baseline=None
)

history = model.fit(
    x_train, y_train,
    validation_split=0.2,
    batch_size=1024,
    epochs = 100,
    callbacks=[es,lr,chk_point],
    shuffle=True,
    verbose=1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


In [13]:
test_cur_comp = '../input/jigsaw-toxic-severity-rating/comments_to_score.csv'
test = pd.read_csv(test_cur_comp)
test["text"] = [data_cleaner(test["text"][i]) for i in tqdm(range(test.shape[0]))]
x_test = tokenizer.texts_to_sequences(test["text"].values)
x_test = pad_sequences(x_test, maxlen = 300)
model = load_model('./best_model.h5')
pred = model.predict(x_test)
pred = [sigmoid(x) * 100 for x in pred]

100%|██████████| 7537/7537 [01:02<00:00, 120.96it/s]


In [14]:
final = pd.DataFrame()
final["comment_id"] = test["comment_id"]
final["score"] = pred
final.to_csv("submission.csv", index=False)
final.head()

Unnamed: 0,comment_id,score
0,114890,50.549551
1,732895,53.680058
2,1139051,50.828817
3,1434512,71.383711
4,2084821,71.864304
