In [6]:
from fastai.text import *
import csv
import xml
import xml.sax
import numpy as np
import pandas as pd
import sklearn

In [3]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [7]:
groundTruth = {}
class GroundTruthHandler(xml.sax.ContentHandler):
    '''
    Read labels from xml and save in groudTruth as groundTruth[article_id] = label
    '''
    def __init__(self):
        xml.sax.ContentHandler.__init__(self)

    def startElement(self, name, attrs):
        if name == "article":
            articleId = attrs.getValue("id")
            hyperpartisan = attrs.getValue("hyperpartisan")
            groundTruth[articleId] = hyperpartisan

def get_texts(data_path, label_path, num):
    with open(label_path) as groundTruthDataFile:
        xml.sax.parse(groundTruthDataFile, GroundTruthHandler())
        
    texts, labels = [],[]
    with open(data_path, encoding = 'utf8') as file:
        for line_count, line in enumerate(file):
            if line_count < num:
                articleId = line.split('::')[0]
                text = line.split('::')[2]
                text = text.split()[:300]
                texts.append(' '.join(text))           
                if groundTruth[articleId] == 'true':
                    labels.append(int(1))
                else:
                    labels.append(int(0))

    return np.array(texts),np.array(labels)

In [8]:
train_set = "../data/articles-training-bypublisher.txt"
train_label = '../data/ground-truth-training-bypublisher.xml'
dev_set = "../data/articles-validation-bypublisher.txt"
dev_label = '../data/ground-truth-validation-bypublisher.xml'
trn_texts,trn_labels = get_texts(train_set, train_label, 20000)
val_texts,val_labels = get_texts(dev_set, dev_label, 6000)

In [9]:
len(trn_texts), len(val_texts)

(20000, 6000)

In [11]:
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_texts))
val_idx = np.random.permutation(len(val_texts))
trn_texts = trn_texts[trn_idx]
val_texts = val_texts[val_idx]
trn_labels = trn_labels[trn_idx]
val_labels = val_labels[val_idx]

col_names = ['labels','text']
df_trn = pd.DataFrame({'text':trn_texts, 'labels':trn_labels}, columns=col_names)
df_val = pd.DataFrame({'text':val_texts, 'labels':val_labels}, columns=col_names)

In [12]:
print(df_trn[df_trn['labels'] == 1].size, df_trn[df_trn['labels'] == 0].size)
print(df_val[df_val['labels'] == 1].size, df_val[df_val['labels'] == 0].size)

df_trn.to_csv('train.csv', header=False, index=False)
df_val.to_csv('test.csv', header=False, index=False)

20034 19966
6106 5894


In [34]:
trn_texts,val_texts = sklearn.model_selection.train_test_split(
    np.concatenate([trn_texts,val_texts]), test_size=0.1)
print(len(trn_texts), len(val_texts))

In [47]:
df_trn = pd.DataFrame({'text':trn_texts, 'labels':[0]*len(trn_texts)}, columns=col_names)
df_val = pd.DataFrame({'text':val_texts, 'labels':[0]*len(val_texts)}, columns=col_names)

23400 2600


In [76]:
import html
re1 = re.compile(r'  +')
def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

def get_texts(df, n_lbls=1):
    labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df.iloc[:, n_lbls].astype(str)
    for i in range(n_lbls+1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df.iloc[:,i].astype(str)
    texts = list(texts.apply(fixup).values)

    tok = Tokenizer(n_cpus=4).process_all(texts)
    return tok, list(labels)

def get_all(df, n_lbls):
    tok, labels = [], []
    for i, r in enumerate(df):
        print(i)
        tok_, labels_ = get_texts(r, n_lbls)
        tok += tok_;
        labels += labels_
    return tok, labels

In [45]:
##chunksize = 24000
#df_trn = pd.read_csv('./train.csv', header=None, chunksize=chunksize, encoding = 'utf8')
#df_val = pd.read_csv('./test.csv', header=None, chunksize=chunksize, encoding = 'utf8')

In [52]:
def get_one(df, n_lbls):
    tok, labels = [], []
    tok_, labels_ = get_texts(df, n_lbls)
    tok += tok_;
    labels += labels_
    return tok, labels

In [77]:
tok_trn, trn_labels = get_one(df_trn, 1)
tok_val, val_labels = get_one(df_val, 1)

In [78]:
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

[('xxmaj', 951876),
 ('the', 360860),
 (',', 310915),
 ('.', 261413),
 ('to', 164550),
 ('of', 157525),
 ('and', 143651),
 ('a', 142567),
 ('in', 122895),
 ('xxup', 78085),
 ('that', 76154),
 ('"', 75177),
 ('-', 72560),
 ("'s", 72232),
 ('for', 59446),
 ('is', 58539),
 ('on', 53133),
 ('it', 44515),
 ('”', 39773),
 ('with', 37946),
 ('was', 36733),
 ('as', 36358),
 ('at', 30100),
 ('by', 29838),
 ('he', 29690)]

In [79]:
max_vocab = 60000
min_freq = 2
#index to string
itos = [o for o, c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

#string to index
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

vs=len(itos)
print('total vocabulary:', vs)

47580

In [80]:
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [99]:
em_sz, nh, nl = 400, 1150, 3
PRE_LM_PATH = 'fwd_wt103.h5'
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [84]:
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0)
itos_wiki = pickle.load(open('itos_wt103.pkl','rb'))
stoi_wiki = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos_wiki)})

In [91]:
new_w = np.zeros((vs, em_sz), dtype=np.float32)

# fill the matrix with wiki embeddings if it exists, else fill with the mean of the embedding
for i,w in enumerate(itos):
    r = stoi_wiki[w]
    new_w[i] = enc_wgts[r] if r>=0 else row_m

wgts['0.encoder.weight'] = torch.tensor(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = torch.tensor(np.copy(new_w))
wgts['1.decoder.weight'] = torch.tensor(np.copy(new_w))

In [92]:
wd=1e-7
bptt=70
bs=52
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
md = LanguageModelData('.', 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

AttributeError: 'numpy.ndarray' object has no attribute 'x'

In [None]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.freeze_to(-1)
learner.model.load_state_dict(wgts)

In [None]:
lr=1e-3
lrs = lr
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)

In [None]:
learner.save('lm_last_ft')
learner.load('lm_last_ft')
learner.unfreeze()
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)

In [13]:
df_trn['text'] = df_trn['text'].str.replace("[^a-zA-Z]", " ")
df_val['text'] = df_val['text'].str.replace("[^a-zA-Z]", " ")

In [14]:
df_trn.shape, df_val.shape

((20000, 2), (6000, 2))

In [16]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

In [18]:
# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [19]:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)

In [21]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy


KeyboardInterrupt: 

In [None]:
learn.save_encoder('ft_enc')

In [None]:
learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')

In [None]:
learn.fit_one_cycle(1, 1e-2)