In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.text import *
from fastai import *

In [3]:
from pathlib import Path

In [4]:
torch.cuda.set_device(0)

In [5]:
DATA_PATH = Path('data/comments/')
TEST_CSV = DATA_PATH/'test.csv'
TRAIN_CSV = DATA_PATH/'train.csv'
TEST_LABELS_CSV = DATA_PATH/'test_labels.csv'

## Loading Data

In [6]:
chunksize = 24000

In [7]:
CLAS_PATH=Path('data/comment_clas/')
CLAS_PATH.mkdir(exist_ok=True)

LM_PATH=Path('data/comment_lm/')
LM_PATH.mkdir(exist_ok=True)

In [8]:
test = pd.read_csv(TEST_CSV)
test_labels = pd.read_csv(TEST_LABELS_CSV)
train = pd.read_csv(TRAIN_CSV, chunksize=chunksize, header='infer')

In [9]:
test_combined = pd.merge(
    test, test_labels
)

In [10]:
test_combined.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [11]:
# input - df: a Dataframe, chunkSize: the chunk size
# output - a list of DataFrame
# purpose - splits the DataFrame into smaller of max size chunkSize (last is smaller)
def split_to_chunks(df, chunkSize): 
    listOfDf = list()
    numberChunks = len(df) // chunkSize + 1
    for i in range(numberChunks):
        listOfDf.append(df[i*chunkSize:(i+1)*chunkSize])
    return listOfDf

In [12]:
test_combined = split_to_chunks(test_combined, chunksize)

In [13]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

## Configuring Training Data

In [14]:
rel = re.compile(r'  +')
def fixup(x: str):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return rel.sub(' ', x)

In [15]:
BOS = 'xbos'
FLD = 'xfld'

In [16]:
def get_texts(row):
    labels = row[list_classes].values
    texts = f'\n{BOS} {FLD} 1 ' + row['comment_text'].astype(str)
    texts = texts.apply(fixup).values.astype(str)
    tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
    return list(tok), list(labels)

In [17]:
def tokenize_texts(df):
    full_text = []
    labels = []
    print(f"TOKENIZING comments")
    total = 0
    for i, row in enumerate(df):
        print(total)
        tok_, labels_ = get_texts(row)
        full_text += tok_
        labels += labels_
        total += chunksize
    print("DONE_TOKENIZING")
    return full_text, labels

In [18]:
full_text, labels = tokenize_texts(df=train)

TOKENIZING comments
0
24000
48000
72000
96000
120000
144000
DONE_TOKENIZING


In [19]:
test_text, test_labels = tokenize_texts(df=test_combined)

TOKENIZING comments
0
24000
48000
72000
96000
120000
144000
DONE_TOKENIZING


In [20]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [21]:
full_text = np.array(full_text)
labels = np.array(labels)

In [22]:
full_text, labels = unison_shuffled_copies(full_text, labels)

In [23]:
train_ratio = .9
num_train = int(len(full_text) * train_ratio)
train_text = full_text[:num_train]
train_label = labels[:num_train]

val_text = full_text[num_train:]
val_label = labels[num_train:]

In [24]:
freq = Counter(p for o in full_text for p in o)

In [25]:
freq.most_common(25)

[('.', 518563),
 ('the', 496093),
 (',', 470070),
 ('"', 379000),
 ('to', 297298),
 ('\n', 245545),
 ('i', 239089),
 ('of', 224411),
 ('and', 223817),
 ('you', 217497),
 ('a', 214903),
 ('t_up', 207139),
 ('is', 180721),
 ('1', 164517),
 ('that', 161499),
 ('xbos', 159571),
 ('xfld', 159571),
 ('it', 148152),
 ('in', 144913),
 ('\n\n', 115219),
 ('for', 102644),
 ('this', 97187),
 ('not', 96865),
 ('on', 89843),
 ('-', 88567)]

In [26]:
max_vocab = 60000
min_freq = 2

In [27]:
int_to_string = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
int_to_string.insert(0, '_pad_')
int_to_string.insert(0, '_unk_')

In [28]:
string_to_int = collections.defaultdict(
    lambda: 0,
    {v: k for k, v in enumerate(int_to_string) }
)

In [29]:
train_ratio = .9
num_train = int(len(full_text) * train_ratio)

train_text = full_text[:num_train]
train_label = labels[:num_train]

val_text = full_text[num_train:]
val_label = labels[num_train:]

## Loading WikiText103

In [30]:
# ! wget -nH -r -np -P {DATA_PATH} http://files.fast.ai/models/wt103/

In [31]:
PRE_PATH = Path('data/models/wt103')
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

In [32]:
embedding_size = 400
n_hidden = 1150
n_layers = 3

In [33]:
wgts = torch.load(
    PRE_LM_PATH, map_location = lambda storage, loc: storage
)

encoder_wgts = to_np(wgts['0.encoder.weight'])
row_m = encoder_wgts.mean(axis=0)

In [34]:
itos_wikitext = pickle.load((PRE_PATH/'itos_wt103.pkl').open('rb'))
stoi_wikitext = collections.defaultdict(
    lambda: -1,
    {v: k for k, v in enumerate(itos_wikitext)},
)

In [35]:
vs = len(int_to_string)

In [36]:
new_wgts = np.zeros((vs, embedding_size), dtype=np.float32)
for i, w in enumerate(int_to_string):
    r = stoi_wikitext[w]
    new_wgts = encoder_wgts[r] if r >= 0 else row_m

In [37]:
wgts['0.encoder.weight'] = T(new_wgts)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_wgts))
wgts['1.decoder.weight'] = T(np.copy(new_wgts))

## Language Model

In [38]:
def pretty_size(size):
    """Pretty prints a torch.Size object"""
    assert(isinstance(size, torch.Size))
    return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
    """Prints a list of the Tensors being tracked by the garbage collector."""
    import gc
    total_size = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj):
                if not gpu_only or obj.is_cuda:
                    print("%s:%s%s %s" % (type(obj).__name__, 
                                          " GPU" if obj.is_cuda else "",
                                          " pinned" if obj.is_pinned else "",
                                          pretty_size(obj.size())))
                    total_size += obj.numel()
            elif hasattr(obj, "data") and torch.is_tensor(obj.data):
                if not gpu_only or obj.is_cuda:
                    print("%s → %s:%s%s%s%s %s" % (type(obj).__name__, 
                                                   type(obj.data).__name__, 
                                                   " GPU" if obj.is_cuda else "",
                                                   " pinned" if obj.data.is_pinned else "",
                                                   " grad" if obj.requires_grad else "", 
                                                   " volatile" if obj.volatile else "",
                                                   pretty_size(obj.data.size())))
                    total_size += obj.data.numel()
        except Exception as e:
            pass        
    print("Total size:", total_size)

In [39]:
dump_tensors()

FloatTensor: GPU pinned 400
FloatTensor: GPU pinned 400
FloatTensor: GPU pinned 400




Total size: 1200


In [40]:
wd = 1e-7
bptt = 70
bs = 32
opt_fn = partial(optim.Adam, betas=(.8, .99))

In [41]:
train_lm = np.array([[string_to_int[o] for o in p] for p in train_text])
val_lm = np.array([[string_to_int[o] for o in p] for p in val_text])

In [42]:
trn_dl = LanguageModelLoader(np.concatenate(train_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)


md = LanguageModelData(
    DATA_PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt
)

In [43]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [44]:
learner= md.get_model(
    opt_fn, embedding_size, n_hidden, n_layers, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2],
    dropoute=drops[3], dropouth=drops[4]
)

learner.metrics = [accuracy]
learner.freeze_to(-1)

In [45]:
learner.model.load_state_dict(wgts)
learner.freeze_to(-1)

In [None]:
# learner.lr_find()
# learner.sched.plot()
# Above code suggested 1e-4

In [49]:
lr = 1e-4
lrs = lr

In [None]:
learner.fit(lr/2, 1, wds=wd, use_clr=(32, 2), cycle_len=1)

In [None]:
learner.unfreeze()

In [None]:
learner.fit(lrs, 1, wds=wd, use_clr=(20, 10), cycle_len=2)

In [None]:
learner.save('lm2')

In [None]:
learner.save_encoder('lm2_enc')

In [None]:
learner.sched.plot_loss()

In [None]:
learner.fit(
    lrs,
    1,
    wds=wd, use_clr=(20, 10),
    cycle_len=4,
    best_save_name='lm3'
)

In [46]:
learner.load('lm3')

In [None]:
learner.save_encoder('lm3_enc')

In [None]:
learner.sched.plot_loss()

In [52]:
learner.load('lm3')

In [53]:
learner.fit(
    lrs,
    1,
    wds=wd, use_clr=(20, 10),
    cycle_len=8,
    best_save_name='lm4'
)

HBox(children=(IntProgress(value=0, description='Epoch', max=8), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                     
    0      4.646636   4.523145   0.297126  
    1      4.540537   4.437536   0.302183                     
    2      4.51278    4.406138   0.304391                     
    3      4.512773   4.390575   0.305198                     
    4      4.486791   4.378878   0.306142                     
    5      4.499806   4.374559   0.306392                     
    6      4.487277   4.370751   0.306662                     
    7      4.506106   4.36635    0.307296                     


[array([4.36635]), 0.3072959093754521]

In [54]:
learner.load('lm4')

In [55]:
learner.save_encoder('lm4_encoder')

## Classifier Model

In [None]:
wd = 1e-7
bptt = 70
bs = 32
opt_fn = partial(optim.Adam, betas=(.8, .99))

In [None]:
trn_labels = np.matrix(train_label)
val_labels = np.matrix(val_label)
test_labels = np.matrix(test_labels)

In [None]:
val_label

In [None]:
c = len(list_classes)
min_lbl = trn_labels.min()

In [None]:
min_lbl

In [None]:
trn_clas = np.array([[string_to_int[o] for o in p] for p in train_text])
val_clas = np.array([[string_to_int[o] for o in p] for p in val_text])
test_clas = np.array([[string_to_int[o] for o in p] for p in test_text])

In [None]:
len(trn_clas), len(trn_labels), len(val_clas), len(val_labels), len(test_clas), len(test_labels)

In [None]:
trn_ds = TextDataset(trn_clas, trn_labels.astype('float'))
val_ds = TextDataset(val_clas, val_labels.astype('float'))
test_ds = TextDataset(test_clas, test_labels.astype('float'))

trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)
val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))

trn_dl = DataLoader(trn_ds, bs//4, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)
test_dl = DataLoader(test_ds, bs, transpose=True, num_workers=1, pad_idx=1)

md = ModelData(DATA_PATH, trn_dl, val_dl, test_dl)

In [None]:
dps = np.array([0.4,0.5,0.05,0.3,0.4])*.3

In [None]:
m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=embedding_size, n_hid=n_hidden, n_layers=n_layers, pad_token=1,
          layers=[embedding_size*3, 50, c], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

In [None]:
learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [None]:
from sklearn.metrics import log_loss

def cwm_loss(y_pred, y_true):  # column-wise-mean log loss
        y_true_np = y_true.float().cpu().numpy()
        y_pred_np = F.sigmoid(y_pred).cpu().numpy()
        return _cwm_loss(y_pred_np, y_true_np)

def _cwm_loss(y_pred_np, y_true_np, is_test=False):
        if is_test:
            y_pred_np = y_pred_np[y_true_np[:, 0] > -1]
            y_true_np = y_true_np[y_true_np[:, 0] > -1]
        return np.mean([1 - np.abs(y_true_np[:, i] - y_pred_np[:, i])
                        for i in range(6)])

In [None]:
learn.crit = nn.BCEWithLogitsLoss()

In [None]:
learn.metrics = [cwm_loss]

In [None]:
wd = 1e-7
lrs= 1e-4
learn.load_encoder('lm3_enc')

In [None]:
learn.freeze_to(-1)

In [None]:
learn.lr_find(lrs/1000)

In [None]:
learn.sched.plot()

In [None]:
learn.fit(lrs, 1, wds=wd, use_clr=(32, 2), cycle_len=1)

In [None]:
learn.unfreeze()

In [None]:
learn.fit(lrs, 1, wds=wd, use_clr=(20, 10), cycle_len=8, best_save_name='clas2')

## Predict on Test Set

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [None]:
learn.load('clas_2')

In [None]:
result = learn.predict(is_test=True)

In [None]:
result = sigmoid(result)
result

In [None]:
_cwm_loss(np.round(result), np.array(test_labels), is_test=True)