In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
data_path = Path("../data/quora/")

In [4]:
data_path.ls()

[PosixPath('../data/quora/train.csv'),
 PosixPath('../data/quora/sample_submission.csv'),
 PosixPath('../data/quora/embeddings'),
 PosixPath('../data/quora/test.csv')]

In [5]:
train_df = pd.read_csv(data_path/"train.csv")
test_df = pd.read_csv(data_path/"test.csv")

In [6]:
train_df.head(2)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0


In [7]:
test_df.head(2)

Unnamed: 0,qid,question_text
0,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
1,000156468431f09b3cae,How much does a tutor earn in Bangalore?


In [8]:
list((data_path/'embeddings').glob("*/**"))

[PosixPath('../data/quora/embeddings/paragram_300_sl999'),
 PosixPath('../data/quora/embeddings/GoogleNews-vectors-negative300'),
 PosixPath('../data/quora/embeddings/wiki-news-300d-1M'),
 PosixPath('../data/quora/embeddings/glove.840B.300d')]

### Classification Model

In [9]:
class fbeta_binary(Callback):
    "Computes the f_beta between preds and targets for binary text classification"

    def __init__(self, beta2 = 1, eps=1e-9, sigmoid = True):      
        self.beta2=beta2**2
        self.eps = eps
        self.sigmoid = sigmoid
    
    def on_epoch_begin(self, **kwargs):
        self.TP = 0
        self.total_y_pred = 0   
        self.total_y_true = 0
    
    def on_batch_end(self, last_output, last_target, **kwargs):
        y_pred = last_output
        y_pred = y_pred.softmax(dim = 1)        
        y_pred = y_pred.argmax(dim=1)
        y_true = last_target.float()
        
        self.TP += ((y_pred==1) * (y_true==1)).float().sum()
        self.total_y_pred += (y_pred==1).float().sum()
        self.total_y_true += (y_true==1).float().sum()
    
    def on_epoch_end(self, **kwargs):
        prec = self.TP/(self.total_y_pred+self.eps)
        rec = self.TP/(self.total_y_true+self.eps)
        res = (prec*rec)/(prec*self.beta2+rec+self.eps)*(1+self.beta2)        
        self.metric = res     

In [10]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, eps=1e-7, c=2):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.eps = eps
        self.c = c
        
    def forward(self, input, target):       
        mask = torch.eye(self.c)[target]
        probas = torch.softmax(input, dim=1)
        probas = probas.clamp(self.eps, 1. - self.eps)
        pt = torch.masked_select(probas, mask.byte().cuda())
        return torch.mean(-torch.pow((1-pt), self.gamma)*torch.log(pt))

### Load Embeddings

In [11]:
glove_file = '../data/quora/embeddings/glove.840B.300d/glove.840B.300d.txt'
wiki_file = '../data/quora/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
paragram_file = '../data/quora/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

In [12]:
def load_glove():
    EMBEDDING_FILE = '../data/quora/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    return embeddings_index,emb_mean,emb_std
    
def load_fasttext():    
    EMBEDDING_FILE = '../data/quora/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    return embeddings_index,emb_mean,emb_std

def load_para():
    EMBEDDING_FILE = '../data/quora/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.0053247833,0.49346462
    return embeddings_index,emb_mean,emb_std

In [13]:
%%time
emb1, emb1_mean, emb1_std  = load_glove()
emb2, emb2_mean, emb2_std = load_fasttext()
emb3, emb3_mean, emb3_std = load_para()

CPU times: user 5min 55s, sys: 10.8 s, total: 6min 6s
Wall time: 6min 10s


### Training

In [20]:
test_df_large = pd.concat([test_df]*7).reset_index(drop=True)

In [14]:
from sklearn.model_selection import StratifiedKFold
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits)
splits = kfold.split(train_df, train_df['target'])
trn_val_idxs = [idxs for idxs in splits]

In [22]:
%%time
for (trn_idxs, val_idxs) in trn_val_idxs:
    break

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs


In [23]:
%%time
trn_df = train_df.iloc[trn_idxs].reset_index(drop=True)
val_df = train_df.iloc[val_idxs].reset_index(drop=True)

data_clas = TextClasDataBunch.from_df(".", train_df=trn_df, valid_df=val_df, test_df=test_df_large,
                                      text_cols="question_text", label_cols="target", bs=512)

CPU times: user 59.9 s, sys: 2min 32s, total: 3min 32s
Wall time: 7min 42s


In [26]:
%%time
#create learner
learn = text_classifier_learner(data_clas, emb_sz=900, drop_mult=0.5, nh=1150, nl=3, pad_token=1, 
                                qrnn=False, max_len=70*20, lin_ftrs=[1024], ps=[0.5])
fbeta = fbeta_binary() 
learn.metrics = [accuracy, fbeta]
focal_loss = FocalLoss(gamma=2)
learn.loss_func = focal_loss

# concat and copy (glove, fasttext, paragram)
for i, w in enumerate(learn.data.train_ds.x.vocab.itos):
    if (w in emb1):
        vec1 = emb1[w]
    else: 
        vec1 = np.random.normal(emb1_mean, emb1_std, size=(300,))
    if (w in emb2):
        vec2 = emb2[w]
    else: 
        vec2 = np.random.normal(emb2_mean, emb2_std, size=(300,))
    if (w in emb3):
        vec3 = emb3[w]
    else: 
        vec3 = np.random.normal(emb3_mean, emb3_std, size=(300,))
    vec_cat = torch.from_numpy(np.concatenate((vec1, vec2, vec3)))
    learn.model[0].encoder.weight.data[i].copy_(vec_cat)

CPU times: user 11 s, sys: 1.77 s, total: 12.8 s
Wall time: 12.8 s


In [28]:
%%time
# training
learn.freeze_to(1)
learn.fit_one_cycle(1, 1e-2)
learn.unfreeze()
learn.fit_one_cycle(1, slice(5e-3))
learn.fit_one_cycle(1, 1e-3)
learn.fit_one_cycle(1, 1e-4)

epoch,train_loss,valid_loss,accuracy,fbeta_binary
1,0.033615,0.030937,0.955039,0.540619


epoch,train_loss,valid_loss,accuracy,fbeta_binary
1,0.032484,0.028710,0.958216,0.611828


epoch,train_loss,valid_loss,accuracy,fbeta_binary
1,0.030529,0.028491,0.958561,0.609107


epoch,train_loss,valid_loss,accuracy,fbeta_binary
1,0.025373,0.028267,0.958986,0.622534


CPU times: user 26min 54s, sys: 9min 34s, total: 36min 29s
Wall time: 34min 58s


### Tune Threshold 

Since `SortSampler` is used validation and test data is also doesn't have the original order

In [29]:
from sklearn.metrics import f1_score

lengths = [len(t) for t in learn.data.valid_ds.x.items]
sampler = SortSampler(range(len(learn.data.valid_ds.x)), key=lengths.__getitem__)
idxs = [i for i in sampler]

pred_out = learn.get_preds(ds_type=DatasetType.Valid)

preds = to_np(torch.softmax(pred_out[0], dim=1)[:, 1])

sorted_preds = preds[np.argsort(idxs)]

sorted_preds.min(), sorted_preds.max()

actual = learn.data.valid_ds.y.items

best_score, best_thresh = 0,0
for t in np.linspace(0.01, 0.99, 50):
    score = f1_score(actual, sorted_preds > t)
    if score > best_score:
        best_score = score
        best_thresh = t
    print(score, t)
print()
print(best_score, best_thresh)

0.1165303348751023 0.01
0.1165303348751023 0.03
0.116564372930986 0.05
0.11836352051092376 0.06999999999999999
0.16038642531683392 0.09
0.20953327362234322 0.11
0.26626500983222456 0.13
0.32022426261477205 0.15000000000000002
0.3661459372542576 0.17
0.4084149245832231 0.19
0.4466780138513911 0.21000000000000002
0.48566108007448794 0.23
0.5196395889060961 0.25
0.5475463015352118 0.27
0.5699689472102575 0.29000000000000004
0.5904737689634714 0.31
0.6123165571261366 0.33
0.6304834484401048 0.35000000000000003
0.6432148222333499 0.37
0.652303508863672 0.39
0.658919169599258 0.41000000000000003
0.6632174708438143 0.43
0.6605576963922687 0.45
0.6507244973698647 0.47000000000000003
0.6328828058169376 0.49
0.6088030214991283 0.51
0.5722016079158937 0.53
0.5247214197276104 0.55
0.46147708130153375 0.5700000000000001
0.3795480658751436 0.59
0.29495591552183714 0.61
0.20791268758526602 0.63
0.1397867852459951 0.65
0.086889974686525 0.67
0.05448544854485448 0.6900000000000001
0.03127091318367099 0

  'precision', 'predicted', average, warn_for)


### Prepare Submission

Since `SortSampler` is used validation and test data is also doesn't have the original order

In [30]:
lengths = [len(t) for t in learn.data.test_ds.x.items]
sampler = SortSampler(range(len(learn.data.test_ds.x)), key=lengths.__getitem__)
idxs = [i for i in sampler]

In [37]:
pred_out = learn.get_preds(ds_type=DatasetType.Test)
preds = to_np(torch.softmax(pred_out[0], dim=1)[:, 1] > 0.41)
sorted_preds = preds[np.argsort(idxs)]
pred_str = ''.join([str(i) for i in sorted_preds.tolist()])

In [38]:
pred_str[:len(test_df)]

'000000000000000000000000000000000000000100000010000000100000000000000000000001000000001000000000001000110100000010000100000100000000101000000000000001000000000000001010000100100000000000000000000000100000000000100000000100001001000000000100000001000111000000001000000000000000000000000000001000010100010000000000000000000001000000000000000100000100000000000000000000000000000000001000000000001000000000100000000001000100000000000001010000000010000000010000000000000100000000000000000000000000000000000000000000010000000000000000110000000000000000001000000010010000000000000010001000000001000000000000000000000000010000000000000000000000000000100000000000000000000000000000100000000000000000000000000000001000000000000000000000001001000010000000000010000100000000000000000000100100001000000000000000000000000010000000000000000000000000000000000000000000100000001000010100000000000000000000000000000000000000000000000100000000000000000010000000000000001000000000000100000000000000000000001000000000000

### CV-LB

In [66]:
CV_LB_T = [0.6604046242774567, 0.654, 0.45]