In [1]:
import time
import gc
import random
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.utils.data

from utils import Attention, CyclicLR
from rnn import NeuralNet_2
from rcnn import NeuralNet_3

In [2]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [3]:
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 55 # max number of words in a question to use

batch_size = 512
train_epochs = 3

SEED = 42

In [4]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


def clean_text(x):
    x = str(x)
    for punct in puncts:
        # allows python to do it faster
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

In [5]:
def get_meta_features(df, text_column):    
    df['length'] = df[text_column].apply(lambda x: len(x))
    df['num_words'] = df[text_column].apply(lambda x: len(x.split()))
    df['num_unique_words'] = df[text_column].apply(lambda x: len(set(w for w in x.split())))
    df['capitals'] = df[text_column].apply(lambda x: sum(1 for c in x if c.isupper()))
    #df['num_puncts'] = df[text_column].apply(lambda x: sum(x.count(p) for p in puncts if p in x))
    
    #df['num_smilies'] = df[text_column].apply(lambda x: sum(x.count(w) for w in (':-)', ':)', ';-)', ';)')))
    #df['num_sad'] = df[text_column].apply(lambda x: sum(x.count(w) for w in (':-<', ':()', ';-()', ';(')))
    
    df['caps_vs_length'] = df['capitals'] / df['length']
    df['words_vs_unique_words'] = df['num_words'] / df['num_unique_words']
    
    df.fillna(0, inplace=True)
    
    return df[['caps_vs_length', 'words_vs_unique_words']]

In [6]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [7]:
start = time.time()

# Get meta features
train_features = get_meta_features(train_df, 'question_text')
test_features = get_meta_features(test_df, 'question_text')

ss = StandardScaler()
ss.fit(train_features.values)
train_features = ss.transform(train_features)
test_features = ss.transform(test_features)

finish = time.time()
print('Elapsed time:', finish - start)

Elapsed time: 15.297364711761475


In [8]:
# Clean the text
train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

In [9]:
start = time.time()

## Tokenize the sentences
train_X = train_df["question_text"].values
test_X = test_df["question_text"].values

tokenizer = Tokenizer(num_words=max_features, filters=[])
tokenizer.fit_on_texts(list(train_X) + list(test_X))
word_index = tokenizer.word_index
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values

#shuffling the data
np.random.seed(SEED)
trn_idx = np.random.permutation(len(train_X))

train_X = train_X[trn_idx]
train_y = train_y[trn_idx]

finish = time.time()
print('Elapsed time:', finish - start)

Elapsed time: 50.44481563568115


In [10]:
def load_glove(word_index):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 

In [11]:
def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

In [12]:
start = time.time()

embedding_matrix_1 = load_glove(word_index)
embedding_matrix_2 = load_para(word_index)

embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2], axis=0)
print(np.shape(embedding_matrix))

del embedding_matrix_1, embedding_matrix_2
gc.collect()

finish = time.time()
print('Elapsed time:', finish - start)

  


(95000, 300)
Elapsed time: 351.6205518245697


# Training

In [13]:
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_X, train_y))

In [14]:
def scoring(y_true, y_proba, verbose=True):
    from sklearn.metrics import roc_curve, precision_recall_curve, f1_score
    from sklearn.model_selection import RepeatedStratifiedKFold

    def threshold_search(y_true, y_proba):
        precision , recall, thresholds = precision_recall_curve(y_true, y_proba)
        thresholds = np.append(thresholds, 1.001)
        with np.errstate(divide='ignore'):
            F = 2 / (1/precision + 1/recall)
        best_score = np.max(F)
        best_th = thresholds[np.argmax(F)]
        return best_th 


    rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    scores = []
    ths = []
    for train_index, test_index in rkf.split(y_true, y_true):
        y_prob_train, y_prob_test = y_proba[train_index], y_proba[test_index]
        y_true_train, y_true_test = y_true[train_index], y_true[test_index]

        # determine best threshold on 'train' part 
        best_threshold = threshold_search(y_true_train, y_prob_train)

        # use this threshold on 'test' part for score 
        sc = f1_score(y_true_test, (y_prob_test >= best_threshold).astype(int))
        scores.append(sc)
        ths.append(best_threshold)

    best_th = np.mean(ths)
    score = np.mean(scores)

    if verbose: print(f'Best threshold: {np.round(best_th, 4)}, Score: {np.round(score,5)}')

    return best_th, score

In [15]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [16]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset

    def __getitem__(self, index):
        data, target = self.dataset[index]

        return data, target, index
    def __len__(self):
        return len(self.dataset)

## Training 1st model

In [23]:
train_preds_2 = np.zeros((len(train_X)))
test_preds_2 = np.zeros((len(test_X)))

# always call this before training for deterministic results
seed_torch(SEED)

x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

for i, (train_idx, valid_idx) in enumerate(splits):
    # split data in train / validation according to the KFold indeces
    # also, convert them to a torch tensor and store them on the GPU (done with .cuda())
    features_train_fold = train_features[train_idx]
    x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).cuda()
    
    features_val_fold = train_features[valid_idx]
    x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).cuda()
    
    model = NeuralNet_2()
    model.cuda()
    
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean").cuda()
    
    step_size = 300
    base_lr, max_lr = 0.0005, 0.003
    
    optimizer = torch.optim.Adam(model.parameters(), lr=max_lr)
    
    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
                         step_size=step_size, mode='exp_range')
    
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    train = MyDataset(train)
    valid = MyDataset(valid)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    
    print(f'Fold {i + 1}')
    
    for epoch in range(train_epochs):
        start_time = time.time()
        
        model.train()
        avg_loss = 0.
        for i, (x_batch, y_batch, index) in enumerate(train_loader):            
            # Forward pass: compute predicted y by passing x to the model
            f = features_train_fold[index]
            y_pred = model([x_batch, f])
            
            if scheduler:
                scheduler.batch_step()
            
            # Compute and print loss
            loss = loss_fn(y_pred, y_batch)
            
            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the Tensors it will update (which are the learnable weights of the model)
            optimizer.zero_grad()
            
            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            
            # Calling the step function on an Optimizer makes an update to its parameters
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        # set evaluation mode of the model. This disabled operations which are only applied during training like dropout
        model.eval()
        
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros(len(test_X))
        avg_val_loss = 0.
        
        for i, (x_batch, y_batch, index) in enumerate(valid_loader):
            f = features_val_fold[index]
            y_pred = model([x_batch, f]).detach()
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        
        val_th, val_f1 = scoring(y_val_fold.cpu().numpy(), valid_preds_fold, verbose=False)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f} val_th={:.2f} \t time={:.2f}s'.format(
            epoch + 1, train_epochs, avg_loss, avg_val_loss, val_f1, val_th, elapsed_time))
        
    for i, (x_batch,) in enumerate(test_loader):
        f = test_features[i * batch_size:(i+1) * batch_size]
        y_pred = model([x_batch, f]).detach()

        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds_2[valid_idx] = valid_preds_fold
    test_preds_2 += test_preds_fold / len(splits)

Fold 1
Epoch 1/3 	 loss=0.1425 	 val_loss=0.1042 	 val_f1=0.6626 val_th=0.42 	 time=144.59s
Epoch 2/3 	 loss=0.1130 	 val_loss=0.0978 	 val_f1=0.6856 val_th=0.32 	 time=144.72s
Epoch 3/3 	 loss=0.1064 	 val_loss=0.0965 	 val_f1=0.6920 val_th=0.29 	 time=144.47s
Fold 2
Epoch 1/3 	 loss=0.1358 	 val_loss=0.1036 	 val_f1=0.6690 val_th=0.24 	 time=144.34s
Epoch 2/3 	 loss=0.1123 	 val_loss=0.0977 	 val_f1=0.6803 val_th=0.31 	 time=144.61s
Epoch 3/3 	 loss=0.1062 	 val_loss=0.0973 	 val_f1=0.6876 val_th=0.29 	 time=144.56s
Fold 3
Epoch 1/3 	 loss=0.1390 	 val_loss=0.1054 	 val_f1=0.6627 val_th=0.30 	 time=144.61s
Epoch 2/3 	 loss=0.1131 	 val_loss=0.1018 	 val_f1=0.6746 val_th=0.29 	 time=144.73s
Epoch 3/3 	 loss=0.1064 	 val_loss=0.0998 	 val_f1=0.6829 val_th=0.29 	 time=144.64s
Fold 4
Epoch 1/3 	 loss=0.1329 	 val_loss=0.1027 	 val_f1=0.6691 val_th=0.29 	 time=144.55s
Epoch 2/3 	 loss=0.1123 	 val_loss=0.0991 	 val_f1=0.6823 val_th=0.27 	 time=144.95s
Epoch 3/3 	 loss=0.1061 	 val_loss=0.

In [24]:
%%time
th_result, sc = scoring(train_y, train_preds_2)

Best threshold: 0.3135, Score: 0.68677
CPU times: user 18.2 s, sys: 4 ms, total: 18.2 s
Wall time: 18.2 s


## Training 3rd model

In [26]:
train_preds_3 = np.zeros((len(train_X)))
test_preds_3 = np.zeros((len(test_X)))

# always call this before training for deterministic results
seed_torch(SEED)

x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

for i, (train_idx, valid_idx) in enumerate(splits):
    # split data in train / validation according to the KFold indeces
    # also, convert them to a torch tensor and store them on the GPU (done with .cuda())
    #features_train_fold = train_features[train_idx]
    x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).cuda()
    
    #features_val_fold = train_features[valid_idx]
    x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).cuda()
    
    model = NeuralNet_3()
    model.cuda()
    
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean").cuda()
    
    step_size = 300
    base_lr, max_lr = 0.0005, 0.003
    
    optimizer = torch.optim.Adam(model.parameters(), lr=max_lr)
    
    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
                         step_size=step_size, mode='exp_range')
    
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    train = MyDataset(train)
    valid = MyDataset(valid)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    
    print(f'Fold {i + 1}')
    
    for epoch in range(train_epochs):
        start_time = time.time()
        
        model.train()
        avg_loss = 0.
        for i, (x_batch, y_batch, index) in enumerate(train_loader):            
            # Forward pass: compute predicted y by passing x to the model
            #f = features_train_fold[index]
            #y_pred = model([x_batch, f])
            y_pred = model(x_batch)
            
            if scheduler:
                scheduler.batch_step()
            
            # Compute and print loss
            loss = loss_fn(y_pred, y_batch)
            
            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the Tensors it will update (which are the learnable weights of the model)
            optimizer.zero_grad()
            
            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            
            # Calling the step function on an Optimizer makes an update to its parameters
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        # set evaluation mode of the model. This disabled operations which are only applied during training like dropout
        model.eval()
        
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros(len(test_X))
        avg_val_loss = 0.
        
        for i, (x_batch, y_batch, index) in enumerate(valid_loader):
            #f = features_val_fold[index]
            #y_pred = model([x_batch, f]).detach()
            y_pred = model(x_batch).detach()
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        
        val_th, val_f1 = scoring(y_val_fold.cpu().numpy(), valid_preds_fold, verbose=False)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f} val_th={:.2f} \t time={:.2f}s'.format(
            epoch + 1, train_epochs, avg_loss, avg_val_loss, val_f1, val_th, elapsed_time))
        
    for i, (x_batch,) in enumerate(test_loader):
        #f = test_features[i * batch_size:(i+1) * batch_size]
        #y_pred = model([x_batch, f]).detach()
        y_pred = model(x_batch).detach()

        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds_3[valid_idx] = valid_preds_fold
    test_preds_3 += test_preds_fold / len(splits)

Fold 1
Epoch 1/3 	 loss=0.1277 	 val_loss=0.1043 	 val_f1=0.6646 val_th=0.31 	 time=180.83s
Epoch 2/3 	 loss=0.1133 	 val_loss=0.0991 	 val_f1=0.6826 val_th=0.39 	 time=181.43s
Epoch 3/3 	 loss=0.1070 	 val_loss=0.0973 	 val_f1=0.6899 val_th=0.29 	 time=181.43s
Fold 2
Epoch 1/3 	 loss=0.1284 	 val_loss=0.1058 	 val_f1=0.6600 val_th=0.33 	 time=181.05s
Epoch 2/3 	 loss=0.1130 	 val_loss=0.0995 	 val_f1=0.6774 val_th=0.28 	 time=181.39s
Epoch 3/3 	 loss=0.1065 	 val_loss=0.0975 	 val_f1=0.6836 val_th=0.34 	 time=181.36s
Fold 3
Epoch 1/3 	 loss=0.1283 	 val_loss=0.1092 	 val_f1=0.6576 val_th=0.24 	 time=181.34s
Epoch 2/3 	 loss=0.1128 	 val_loss=0.1009 	 val_f1=0.6744 val_th=0.34 	 time=181.44s
Epoch 3/3 	 loss=0.1065 	 val_loss=0.0988 	 val_f1=0.6827 val_th=0.42 	 time=181.45s
Fold 4
Epoch 1/3 	 loss=0.1282 	 val_loss=0.1046 	 val_f1=0.6649 val_th=0.28 	 time=181.28s
Epoch 2/3 	 loss=0.1132 	 val_loss=0.0992 	 val_f1=0.6784 val_th=0.32 	 time=181.79s
Epoch 3/3 	 loss=0.1068 	 val_loss=0.

In [27]:
%%time
th_result, sc = scoring(train_y, train_preds_3)

Best threshold: 0.3345, Score: 0.68259
CPU times: user 18.5 s, sys: 12 ms, total: 18.5 s
Wall time: 18.5 s


## simple blend

In [36]:
%%time
th_result, sc = scoring(train_y, train_preds_3 * 0.5 + train_preds_2 * 0.5)

Best threshold: 0.3322, Score: 0.69315
CPU times: user 18.6 s, sys: 4 ms, total: 18.6 s
Wall time: 18.6 s


## submit

In [37]:
test_preds = test_preds_3 * 0.5 + test_preds_2 * 0.5

In [38]:
sub = pd.read_csv('../input/sample_submission.csv')
sub.prediction = test_preds > th_result
sub.to_csv("submission.csv", index=False)