# || Overview
- Objective: In this competition, Kagglers will develop models that identify and flag insincere toxic and misleading content (questions).
-  You must predict whether the corresponding question_text is insincere (1) or not (0). 
- We have a seriuos disbalance - only ~6% of data are positive. So, No wonder the metric for the competition is f1-score.
- This is being run as a Kernels Only Competition, requiring that all submissions be made via a Kernel output. 
- This competition does not allow external data.
- Both your training and prediction should fit in a single Kernel.
- CPU Kernel <= 6 hours run-time, GPU Kernel <= 2 hours run-time
- Submission file must be named "submission.csv"
- Following the final submission deadline for the competition, your kernel code will be re-run on a privately-held test set that is not provided to you.
- Stage 2 files will only be available in Kernels and not available for download.
- In **Stage 2**: ( test.csv from 56k  to ~376k rows )

# || References
- 

# || Loading Packages

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
pd.set_option('max_colwidth', 400)

from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.util import ngrams

import datetime, time, os, re, random, gc
from collections import Counter
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
# import torch.utils.data


import warnings
warnings.filterwarnings("ignore", message="F-score is ill-defined and being set to 0.0 due to no predicted samples.")
np.seterr(divide='ignore')
t_start = time.time()

Using TensorFlow backend.


In [2]:
!ls ../input/

embeddings  sample_submission.csv  test.csv  train.csv


# || Configuration

In [3]:
embed_size = 300

# || Data overview

In [4]:
path = '../input/'
train = pd.read_csv(os.path.join(path, "train.csv"))
test = pd.read_csv(os.path.join(path, "test.csv"))
sub = pd.read_csv(os.path.join(path, "sample_submission.csv"))

In [5]:
# print('Available Embeddings:', os.listdir(os.path.join(path, "embeddings/")))

In [6]:
# train["target"].value_counts()

In [7]:
# train.head()

In [8]:
# for s, df in zip(["Train", "Test"], [train, test]):
#     print("# Questions in", s)
#     print('\t Average length: {0:.0f}'. format(np.mean(df['question_text'].apply(lambda x: len(x.split())))))
#     print('\t Average char length: {0:.0f}'. format(np.mean(df['question_text'].apply(lambda x: len(x)))))
#     print('\t Max length: {0:.0f}\n'. format(np.max(df['question_text'].apply(lambda x: len(x.split())))))
# # There are quite long questions in train dataset

In [9]:
puncts = {',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '~', '•',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√'}
mispell_dict = {"aren't" : "are not", "can't" : "cannot", "couldn't" : "could not", "didn't" : "did not", "doesn't" : "does not",
"don't" : "do not", "hadn't" : "had not", "hasn't" : "has not", "haven't" : "have not", "he'd" : "he would", "he'll" : "he will",
"he's" : "he is", "i'd" : "I would", "i'd" : "I had", "i'll" : "I will", "i'm" : "I am", "isn't" : "is not", "it's" : "it is",
"it'll":"it will", "i've" : "I have", "let's" : "let us", "mightn't" : "might not", "mustn't" : "must not", "shan't" : "shall not",
"she'd" : "she would", "she'll" : "she will", "she's" : "she is", "shouldn't" : "should not", "that's" : "that is", "there's" : "there is",
"they'd" : "they would", "they'll" : "they will", "they're" : "they are", "they've" : "they have", "we'd" : "we would", "we're" : "we are",
"weren't" : "were not", "we've" : "we have", "what'll" : "what will", "what're" : "what are", "what's" : "what is", "what've" : "what have",
"where's" : "where is", "who'd" : "who would", "who'll" : "who will", "who're" : "who are", "who's" : "who is", "who've" : "who have",
"won't" : "will not", "wouldn't" : "would not", "you'd" : "you would", "you'll" : "you will", "you're" : "you are", "you've" : "you have",
"'re": " are", "wasn't": "was not", "we'll":" will", "didn't": "did not", "tryin'":"trying"}

########  Clean Text  ########
# Clean speelings
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

def preprocess(x):
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    return x

def clean_text(x):
    x = str(x)
    for punct in puncts:
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

def stemming(x):
    x = x.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in x]
    return  " ".join(stemmed_words)

def clean_numbers(x):
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{2,}', ' ## ', x)
    return x

for df in [train, test]:
    df["question_text"] = df["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
    df["question_text"] = df["question_text"].progress_apply(lambda x: preprocess(x))
    # df["question_text"] = df["question_text"].progress_apply(lambda x: clean_text(x))
    # df["question_text"] = df["question_text"].progress_apply(lambda x: stemming(x))
    # df["question_text"] = df["question_text"].progress_apply(lambda x: clean_numbers(x))

max_features = 120000
# tk = Tokenizer(lower = True, filters = ''.join(puncts), num_words=max_features)
tk = Tokenizer(lower = True, num_words=max_features)
full_text = list(train['question_text'].values) + list(test['question_text'].values)
tk.fit_on_texts(full_text)

word_index = tk.word_index

100%|██████████| 1306122/1306122 [00:12<00:00, 107417.78it/s]
100%|██████████| 1306122/1306122 [00:20<00:00, 63914.89it/s]
100%|██████████| 375806/375806 [00:03<00:00, 108102.46it/s]
100%|██████████| 375806/375806 [00:05<00:00, 64276.43it/s]


In [10]:
# len(word_index) # 200627  # 158007 #-> Contains number: 14980
# t = []
# for w in word_index.keys(): 
#     if bool(re.search(r'\d', w)): t.append(w)
# len(t)   

In [11]:
train_tokenizerd = tk.texts_to_sequences(train['question_text'].fillna('missing'))
test_tokenizerd = tk.texts_to_sequences(test['question_text'].fillna('missing'))

In [12]:
# train['question_text'].apply(lambda x: len(x.split())).plot(kind='hist'); plt.yscale('log');
# plt.title('Distribution of question text length in characters');

In [13]:
# Let's try having sequence length equal to 70 for now
max_len = 72
X_train = pad_sequences(train_tokenizerd, max_len)
X_test = pad_sequences(test_tokenizerd, max_len)

def sigmoid(x): return 1 / (1 + np.exp(-x))

In [14]:
## Preparing Data for pytorch:
# One of main differences from Keras is preparing data. 
# Pytorch requires special dataloaders. I'll write a class for it

y_train = train['target'].values
splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=10).split(X_train, y_train))

# || Load Embeddings

In [15]:
## GloVe: Global Vectors for Word Representation
# Form : "Word 300_float32 "
# *arr for variable number of arrays

# load_glove_fast
def load_glove(word_index, max_words=200000, embed_size=300):
    s = ""
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    emb_mean, emb_std = -0.005838499, 0.48782197

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="utf8") as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:300]
            if len(embedding_vector) == 300:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix, 

def load_para(word_index, max_words=200000, embed_size=300):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

    emb_mean, emb_std = -0.0053247833, 0.49346462

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(EMBEDDING_FILE, 'r', encoding="latin-1") as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            embedding_vector = np.asarray(vec.split(' '), dtype='float32')[:embed_size]
            if len(embedding_vector) == embed_size:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

def load_fasttext(word_index):    
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [16]:
%%time
embedding_matrix_1 = load_glove(word_index, max_words=max_features)
embedding_matrix_2 = load_para(word_index, max_words=max_features)
#embedding_matrix_3 = load_fasttext(word_index)

CPU times: user 33.5 s, sys: 5.96 s, total: 39.5 s
Wall time: 40.2 s


In [17]:
# np.save("word_index.npy",word_index)
# word_index = np.load("word_index.npy").item()
np.shape(embedding_matrix_1[0])

(120000, 300)

In [18]:
### Simple average: http://aclweb.org/anthology/N18-2031 
embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2], axis = 0)[0]

del embedding_matrix_1, embedding_matrix_2
gc.collect()
np.shape(embedding_matrix)

(120000, 300)

# || Model

In [19]:

class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True
        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias: self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias: eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None: a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10
        weighted_input = x * torch.unsqueeze(a, -1)
        
        return torch.sum(weighted_input, 1)
    
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        
        hidden_size = 128
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size*2, hidden_size, bidirectional=True, batch_first=True)
        
        self.lstm_attention = Attention(hidden_size*2, max_len)
        self.gru_attention = Attention(hidden_size*2, max_len)
        
        self.linear = nn.Linear(1536, 256)
        self.linear1 = nn.Linear(256, 32)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(32, 1)
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(self.embedding_dropout(torch.unsqueeze(h_embedding, 0)))
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)
        
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        avg_pool_g = torch.mean(h_gru, 1)
        max_pool_g, _ = torch.max(h_gru, 1)
        
        avg_pool_l = torch.mean(h_lstm, 1)
        max_pool_l, _ = torch.max(h_lstm, 1)
        
        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool_g, max_pool_g, avg_pool_l, max_pool_l), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        conc = self.relu(self.linear1(conc))
        out = self.out(conc)
        
        return out


In [20]:
m = NeuralNet()
x_test_cuda = torch.tensor(X_test, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
batch_size = 512
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
seed=1029
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed)

# || Training

In [21]:
def train_model_full(X_train=X_train, y_train=y_train, splits=splits, n_epochs=5, batch_size=batch_size, validate=False):
    train_preds = np.zeros(len(X_train))
    test_preds = np.zeros((len(test), len(splits)))
    scores = []
    for i, (train_idx, valid_idx) in enumerate(splits):
        print(f'Fold {i + 1}. {time.ctime()}')
        x_train_fold = torch.tensor(X_train[train_idx], dtype=torch.long).cuda()
        y_train_fold = torch.tensor(y_train[train_idx, np.newaxis], dtype=torch.float32).cuda()
        x_val_fold = torch.tensor(X_train[valid_idx], dtype=torch.long).cuda()
        y_val_fold = torch.tensor(y_train[valid_idx, np.newaxis], dtype=torch.float32).cuda()
        
        seed_everything(seed + i)
        model = NeuralNet()
        model.cuda()
        optimizer = torch.optim.Adam(model.parameters())
        # scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
        
        loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean').cuda()
        
        train_dataset = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
        valid_dataset = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        
        best_f1 = 0
        best_model_name = ''
        
        for epoch in range(n_epochs):
            print()
            print(f'Epoch {epoch}. {time.ctime()}')
            model.train()
            avg_loss = 0.

            for x_batch, y_batch in train_loader:
                # print(x_batch.shape)
                y_pred = model(x_batch)
                loss = loss_fn(y_pred, y_batch)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            model.eval()

            valid_preds = np.zeros((x_val_fold.size(0)))

            if validate:
                avg_val_loss = 0.
                for j, (x_batch, y_batch) in enumerate(valid_loader):
                    y_pred = model(x_batch).detach()

                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                    valid_preds[j * batch_size:(j+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

                best_th, score = scoring(y_val_fold.cpu().numpy(), valid_preds, verbose=True)

#                 if score > best_f1:
#                     best_f1 = score
#                     torch.save(model.state_dict(), f'model_{epoch}.pt')
#                     best_model_name = f'model_{epoch}.pt'
#                 else:
#                     print('Stopping training on this fold')
#                     break
        
#         if score < best_f1:
#             checkpoint = torch.load(best_model_name)
#             model.load_state_dict(checkpoint)
#             model.eval()

        valid_preds = np.zeros((x_val_fold.size(0)))

        avg_val_loss = 0.
        for j, (x_batch, y_batch) in enumerate(valid_loader):
            y_pred = model(x_batch).detach()

            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds[j * batch_size:(j+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        best_th, score = scoring(y_val_fold.cpu().numpy(), valid_preds, verbose=True)

        scores.append(score)

        test_preds_fold = np.zeros((len(test_loader.dataset)))

        for j, (x_batch,) in enumerate(test_loader):
            y_pred = model(x_batch).detach()

            test_preds_fold[j * batch_size:(j+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

        train_preds[valid_idx] = valid_preds
        test_preds[:, i] = test_preds_fold
    print(f'Finished training at {time.ctime()}')
    print(f'Mean validation f1-score: {np.mean(scores)}. Std: {np.std(scores)}')
    
    return train_preds, test_preds

# || Searching for optimal threshold

In [22]:
def scoring(y_true, y_proba, verbose=True):
    # https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/76391
    
    def threshold_search1(y_true, y_proba):
        precision , recall, thresholds = precision_recall_curve(y_true, y_proba)
        thresholds = np.append(thresholds, 1.001) 
        F = 2 / (1/precision + 1/recall)
        best_score = np.max(F)
        best_th = thresholds[np.argmax(F)]
        return best_th 

    # rkf = RepeatedStratifiedKFold(n_splits=4, n_repeats=10)
    rkf = StratifiedKFold(n_splits=4)

    scores = []
    ths = []
    for train_index, test_index in rkf.split(y_true, y_true):
        y_prob_train, y_prob_test = y_proba[train_index], y_proba[test_index]
        y_true_train, y_true_test = y_true[train_index], y_true[test_index]

        # determine best threshold on 'train' part 
        best_threshold = threshold_search1(y_true_train, y_prob_train)

        # use this threshold on 'test' part for score 
        sc = f1_score(y_true_test, (y_prob_test >= best_threshold).astype(int))
        scores.append(sc)
        ths.append(best_threshold)

    best_th = np.mean(ths)
    score = np.mean(scores)

    if verbose: print(f'Best threshold: {np.round(best_th, 4)}, Score: {np.round(score,5)}')

    return best_th, score

# || Training

In [23]:
train_preds, test_preds = train_model_full(X_train=X_train, y_train=y_train, splits=splits, n_epochs=5, batch_size=batch_size, validate=True)

Fold 1. Mon Feb 11 12:29:59 2019

Epoch 0. Mon Feb 11 12:30:01 2019
Best threshold: 0.325, Score: 0.66123

Epoch 1. Mon Feb 11 12:35:16 2019
Best threshold: 0.3348, Score: 0.68037

Epoch 2. Mon Feb 11 12:40:31 2019
Best threshold: 0.2672, Score: 0.68415

Epoch 3. Mon Feb 11 12:45:45 2019
Best threshold: 0.2751, Score: 0.67929

Epoch 4. Mon Feb 11 12:50:59 2019
Best threshold: 0.4261, Score: 0.67654
Best threshold: 0.4261, Score: 0.67654
Fold 2. Mon Feb 11 12:57:16 2019

Epoch 0. Mon Feb 11 12:57:18 2019
Best threshold: 0.2546, Score: 0.66098

Epoch 1. Mon Feb 11 13:02:30 2019
Best threshold: 0.3037, Score: 0.67513

Epoch 2. Mon Feb 11 13:07:43 2019
Best threshold: 0.3081, Score: 0.68489

Epoch 3. Mon Feb 11 13:12:54 2019
Best threshold: 0.3229, Score: 0.68437

Epoch 4. Mon Feb 11 13:18:06 2019
Best threshold: 0.3966, Score: 0.68083
Best threshold: 0.3966, Score: 0.68083
Fold 3. Mon Feb 11 13:24:22 2019

Epoch 0. Mon Feb 11 13:24:24 2019
Best threshold: 0.2383, Score: 0.6607

Epoch 1. M

In [24]:
best_th, score = scoring(y_train, train_preds)
sub['prediction'] = (test_preds.mean(1) > best_th).astype(int)
sub.to_csv("submission.csv", index=False)

Best threshold: 0.333, Score: 0.67296


In [25]:
t_finish = time.time()
print(f"Kernel run time = {(t_finish-t_start)/3600} hours")

Kernel run time = 1.8620194762945175 hours
