In [1]:
# Reload the autoreload extension and have it re-import modules before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

from fastai.text import *
from fastai import *

In [2]:
from pathlib import Path

In [3]:
# Locations of the raw CSV files, all under DATA_PATH.
DATA_PATH = Path('data/comments/')
TEST_CSV = DATA_PATH/'test.csv'
TRAIN_CSV = DATA_PATH/'train.csv'
# Label file that accompanies the test comments.
TEST_LABELS_CSV = DATA_PATH/'test_labels.csv'
# Backward-compatible alias: the original name suggested training labels,
# but the file it points at is test_labels.csv.
TRAIN_LABELS_CSV = TEST_LABELS_CSV

## Loading Data

In [4]:
# Number of rows per DataFrame chunk when train.csv is read with pd.read_csv.
chunksize = 24_000

In [5]:
# Directories named for the classifier and language-model stages.
# parents=True creates any missing parent directory, so a fresh checkout
# without a data/ folder does not fail here; exist_ok=True keeps re-runs harmless.
CLAS_PATH = Path('data/comment_clas/')
CLAS_PATH.mkdir(parents=True, exist_ok=True)

LM_PATH = Path('data/comment_lm/')
LM_PATH.mkdir(parents=True, exist_ok=True)

In [6]:
# The test comments and their label file are read eagerly into DataFrames.
test = pd.read_csv(TEST_CSV)
# NOTE: despite its name, TRAIN_LABELS_CSV points at test_labels.csv.
test_labels = pd.read_csv(TRAIN_LABELS_CSV)

# With chunksize set, read_csv returns a reader that yields DataFrames of up to
# `chunksize` rows. It can be iterated only once, so re-run this cell before
# tokenizing again.
train = pd.read_csv(TRAIN_CSV, header='infer', chunksize=chunksize)

In [7]:
# Join test comments with their labels on the columns both frames share
# (the default behaviour of merge when no key is given).
test_combined = test.merge(test_labels)

In [8]:
# Show the first rows of the merged test frame.
test_combined.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [9]:
# Names of the six label columns selected from each training chunk.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

## Configuring Training Data

In [10]:
# Matches runs of two or more spaces.
rel = re.compile(r'  +')

# Ordered (old, new) substitutions. Order matters: each rule sees the output
# of the rules before it (e.g. the bare-backslash rule must run last).
_FIXUP_RULES = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
)


def fixup(x: str):
    """Clean one raw text string.

    Applies the literal substitutions in ``_FIXUP_RULES`` in order, then
    collapses every run of two or more spaces into a single space.

    Args:
        x: the text to clean.

    Returns:
        The cleaned text.
    """
    for old, new in _FIXUP_RULES:
        x = x.replace(old, new)
    return rel.sub(' ', x)

In [11]:
# Marker tokens that get_texts prepends to every comment before tokenizing.
BOS, FLD = 'xbos', 'xfld'

In [12]:
def get_texts(row):
    """Tokenize one chunk of the training data and collect its labels.

    Args:
        row: a DataFrame chunk (despite the name) that has a 'comment_text'
            column and every column named in ``list_classes``.

    Returns:
        A ``(tokens, labels)`` pair: the tokenized comments as a list, and a
        list with one label row per comment (one value per ``list_classes``
        entry).
    """
    # Fixed: the original took `.values.shape`, which yields the array's
    # dimensions (n_rows, n_classes) rather than the label values themselves.
    labels = row[list_classes].values
    # Prefix every comment with the BOS / FLD markers, then clean it up.
    texts = f'\n{BOS} {FLD} 1 ' + row['comment_text'].astype(str)
    texts = texts.apply(fixup).values.astype(str)
    tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
    return list(tok), list(labels)

In [13]:
def tokenize_texts(df):
    """Tokenize every chunk yielded by ``df`` and accumulate the results.

    Args:
        df: an iterable of DataFrame chunks, e.g. the reader returned by
            ``pd.read_csv(..., chunksize=...)``.

    Returns:
        A ``(full_text, labels)`` pair holding the concatenated outputs of
        ``get_texts`` over all chunks.
    """
    full_text, labels = [], []
    print("TOKENIZING comments")
    total = 0
    for chunk in df:
        print(total)  # rows processed before this chunk
        tok_, labels_ = get_texts(chunk)
        full_text += tok_
        labels += labels_
        # Count the rows actually seen instead of assuming the global
        # `chunksize`, which need not match the reader (or the final chunk).
        total += len(chunk)
    print("DONE_TOKENIZING")
    return full_text, labels

In [14]:
# Consume the chunked training reader, collecting tokens and labels for every comment.
full_text, labels = tokenize_texts(train)

TOKENIZING comments
0
24000
48000
72000
96000
120000
144000
DONE_TOKENIZING


In [15]:
# Corpus-wide token frequencies over all tokenized comments.
freq = Counter(token for comment in full_text for token in comment)

In [16]:
# Vocabulary limits: consider at most `max_vocab` of the most common tokens,
# and keep a token only if it occurs more than `min_freq` times.
max_vocab, min_freq = 60_000, 2

In [17]:
# Index 0 is '_unk_' and index 1 is '_pad_'; corpus tokens follow in descending
# frequency, limited to the max_vocab most common ones that occur more than
# min_freq times.
int_to_string = ['_unk_', '_pad_'] + [
    token for token, count in freq.most_common(max_vocab) if count > min_freq
]

In [18]:
# Token -> vocabulary index. Tokens missing from the vocabulary fall back to 0,
# which is the '_unk_' slot.
string_to_int = collections.defaultdict(lambda: 0)
string_to_int.update((token, idx) for idx, token in enumerate(int_to_string))

## Loading WikiText103

In [None]:
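# One-time download of the pretrained WikiText-103 files (uncomment to run; define PATH as the download root first):
# ! wget -nH -r -np -P {PATH} http://files.fast.ai/models/wt103/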

In [30]:
# Folder holding the pretrained WikiText-103 files, and the weights file inside it.
PRE_PATH = Path('data') / 'models' / 'wt103'
PRE_LM_PATH = PRE_PATH.joinpath('fwd_wt103.h5')

In [31]:
# Model dimensions passed to the language-model constructor.
# NOTE(review): presumably these must match the pretrained wt103 weights — confirm.
embedding_size, n_hidden, n_layers = 400, 1150, 3

In [34]:
# Load the pretrained state dict. The map_location callback hands every storage
# back unchanged instead of moving it to another device.
# NOTE(review): torch.load unpickles the file — only load weights from a trusted source.
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

# Pretrained embedding matrix converted with to_np, and its mean taken over axis 0.
encoder_wgts = to_np(wgts['0.encoder.weight'])
row_m = np.mean(encoder_wgts, axis=0)

In [37]:
# Read the pretrained vocabulary (index -> token). The `with` block closes the
# file handle, which the original left open.
# NOTE(review): pickle.load can execute arbitrary code — only use a trusted itos_wt103.pkl.
with (PRE_PATH/'itos_wt103.pkl').open('rb') as itos_file:
    itos_wikitext = pickle.load(itos_file)

# Token -> pretrained row index; -1 marks tokens absent from the pretrained vocabulary.
stoi_wikitext = collections.defaultdict(
    lambda: -1,
    {v: k for k, v in enumerate(itos_wikitext)},
)

In [39]:
# Vocabulary size, counting the '_unk_' and '_pad_' entries at the front of int_to_string.
vs = len(int_to_string)

In [41]:
# Build an embedding matrix for our vocabulary: copy the pretrained row for
# tokens that exist in the WikiText-103 vocabulary, and use the mean pretrained
# row for every other token.
new_wgts = np.zeros((vs, embedding_size), dtype=np.float32)
for i, w in enumerate(int_to_string):
    r = stoi_wikitext[w]
    # Fixed: write into row i. The original rebound `new_wgts` itself, so the
    # matrix was replaced by a single row on every iteration.
    new_wgts[i] = encoder_wgts[r] if r >= 0 else row_m

In [43]:
# Replace the three embedding-sized tensors in the state dict with the remapped
# matrix. The second and third entries each get their own copy of the array.
wgts.update({
    '0.encoder.weight': T(new_wgts),
    '0.encoder_with_dropout.embed.weight': T(np.copy(new_wgts)),
    '1.decoder.weight': T(np.copy(new_wgts)),
})

## Language Model

In [45]:
# Hyperparameters for the language model; bs and bptt are passed to the
# language-model loader below.
wd, bptt, bs = 1e-7, 70, 52
# Adam with custom betas, wrapped so the learner can instantiate it later.
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [50]:
# Map every token of every comment to its vocabulary index (unknown tokens become 0).
trn_lm = np.array([[string_to_int[token] for token in comment] for comment in full_text])

In [53]:
# Join all encoded comments into one token stream and hand it to the language-model loader.
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
# No validation loader is supplied (val_dl=None).
# NOTE(review): the positional 1 is presumably the pad index ('_pad_' sits at index 1 of int_to_string) — confirm against the fastai signature.
md = LanguageModelData(DATA_PATH, 1, vs, trn_dl, bs=bs, bptt=bptt, val_dl=None)

In [54]:
# Base dropout rates, all scaled by 0.7; each entry feeds one dropout argument of the model.
drops = 0.7 * np.array([0.25, 0.1, 0.2, 0.02, 0.15])

In [57]:
# Build the language-model learner; each entry of `drops` feeds one dropout argument.
learner= md.get_model(opt_fn, embedding_size, n_hidden, n_layers, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

# Report accuracy as the training metric.
learner.metrics = [accuracy]
# NOTE(review): freeze_to(-1) presumably leaves only the last layer group trainable — confirm against the fastai docs.
learner.freeze_to(-1)

In [58]:
# Load the remapped pretrained weights into the model.
learner.model.load_state_dict(wgts)
# Same freeze_to(-1) call as in the cell that built the learner; repeated here after loading.
learner.freeze_to(-1)

In [None]:
# Run the learner's learning-rate finder, then plot from its scheduler.
learner.lr_find()
# NOTE(review): plot_lr presumably shows the learning-rate schedule itself; if a loss-vs-LR curve is wanted, confirm which sched plotting method gives it.
learner.sched.plot_lr()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

  0%|          | 0/3866 [00:00<?, ?it/s]

## Classifier Model

In [None]:
# Hyperparameters for the classifier (same values as the language-model stage).
wd = 1e-7
bptt = 70
bs = 52
# Fixed: `pratial` was a typo for functools.partial and raised NameError.
opt_fn = partial(optim.Adam, betas=(.8, .99))

In [None]:
# Integer-encode every tokenized comment with the same vocabulary used for the
# language model. The original wrapped the raw token lists in np.matrix, which
# cannot represent ragged rows, and never created the `trn_clas`, `val_clas`
# and `val_labels` names that later cells use.
all_clas = np.array([[string_to_int[token] for token in comment] for comment in full_text])
all_labels = np.array(labels)
assert len(all_clas) == len(all_labels), "tokenized comments and labels must line up one-to-one"

# Hold out a reproducible random fraction of the comments for validation.
VAL_FRAC = 0.1
shuffled_idx = np.random.RandomState(42).permutation(len(all_clas))
n_val = int(len(all_clas) * VAL_FRAC)
val_idx, trn_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]

trn_clas, val_clas = all_clas[trn_idx], all_clas[val_idx]
trn_labels, val_labels = all_labels[trn_idx], all_labels[val_idx]
# Original variable name kept as an alias of the encoded training comments.
trn_comments = trn_clas

In [None]:
# Shift the labels so that the smallest training label becomes 0.
min_lbl = trn_labels.min()
trn_labels -= min_lbl
# NOTE(review): val_labels must already exist here — confirm an earlier cell creates the validation split.
val_labels -= min_lbl
# Class count taken as the largest shifted training label plus one.
# NOTE(review): if the labels are several 0/1 columns this gives 2, not the number of entries in list_classes — confirm intent.
c = int(trn_labels.max()) + 1

In [None]:
# Wrap the encoded comments and labels as datasets.
trn_ds = TextDataset(trn_clas, trn_labels)
val_ds = TextDataset(val_clas, val_labels)
# Both samplers order examples by comment length (the key function);
# training batches are half the size of validation batches.
trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)
val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))
# pad_idx=1 matches the position of '_pad_' in int_to_string.
trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)
# Fixed: `PATH` is never defined in this notebook; use DATA_PATH, the same root
# the language-model data object was given.
md = ModelData(DATA_PATH, trn_dl, val_dl)

In [None]:
# Base dropout rates for the classifier, all scaled by 0.5.
dps = 0.5 * np.array([0.4, 0.5, 0.05, 0.3, 0.4])

In [None]:
# Build the RNN classifier on top of the encoder dimensions defined earlier.
# Fixed: `em_sz`, `nh` and `nl` are never defined in this notebook; the model
# dimensions are named embedding_size, n_hidden and n_layers.
# NOTE(review): `get_rnn_classifer` (sic) presumably matches the fastai function name — confirm before "correcting" the spelling.
m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=embedding_size, n_hid=n_hidden, n_layers=n_layers, pad_token=1,
          layers=[embedding_size*3, 50, c], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])