In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
%pip install pyonmttok fasttext

In [None]:
# Build and install the `fasttext` command-line binary that the training and
# test cells invoke via `!fasttext ...`.
# Both steps are guarded so the cell can be re-run: the clone is skipped when
# the checkout already exists, and `mkdir -p` tolerates an existing build dir.
![ -d fastText ] || git clone https://github.com/facebookresearch/fastText.git
!cd fastText && mkdir -p build && cd build && cmake .. && make && make install

In [None]:
# Download and unpack the training archive. Any earlier copy of the archive and
# of en_tg_train.json is removed first; the archive is deleted after extraction.
!rm -f en_tg_train.tar.gz
!wget https://www.dropbox.com/s/umd8tyx4wz1wquq/en_tg_train.tar.gz
!rm -f en_tg_train.json
!tar -xzvf en_tg_train.tar.gz
!rm en_tg_train.tar.gz

In [None]:
# Download and unpack the test archive. Any earlier copy of the archive and of
# en_tg_test.json is removed first; the archive is deleted after extraction.
!rm -f en_tg_test.tar.gz
!wget https://www.dropbox.com/s/rw674iic8x5udb3/en_tg_test.tar.gz
!rm -f en_tg_test.json
!tar -xzvf en_tg_test.tar.gz
!rm en_tg_test.tar.gz

In [None]:
# Raw annotation TSVs that convert_to_ft() reads.
# Remove earlier downloads first (as the archive cells do): otherwise a re-run
# makes wget save `.1`-suffixed copies and the stale files keep being used.
!rm -f en_cat_train_raw_markup.tsv en_cat_test_raw_markup.tsv
!wget https://www.dropbox.com/s/7qpfgf8bz77h2ss/en_cat_train_raw_markup.tsv
!wget https://www.dropbox.com/s/bszwshgwbrt328k/en_cat_test_raw_markup.tsv
!head -n 2 en_cat_train_raw_markup.tsv

In [None]:
# https://www.kaggle.com/rmisra/news-category-dataset

!rm -f news-category-dataset.zip
!wget https://www.dropbox.com/s/ua18htwqrkwnfpg/news-category-dataset.zip
# -o: overwrite already-extracted files without asking, so a re-run does not
# stall on unzip's interactive "replace?" prompt.
!unzip -o news-category-dataset.zip

In [None]:
import pyonmttok
# Shared pyonmttok tokenizer ("conservative" mode) used by preprocess().
tokenizer = pyonmttok.Tokenizer("conservative")

def preprocess(text):
    """Lowercase and tokenize ``text`` into one space-separated line.

    The value is coerced with ``str`` and stripped, newlines and non-breaking
    spaces become plain spaces, and the result is lowercased. It is then split
    by the module-level ``tokenizer`` and the tokens are joined with single
    spaces.
    """
    cleaned = str(text).strip()
    for whitespace_char in ("\n", "\xa0"):
        cleaned = cleaned.replace(whitespace_char, " ")
    pieces, _features = tokenizer.tokenize(cleaned.lower())
    return " ".join(pieces)

In [None]:
import random

def save_to_ft(records, output_file_name, use_preprocess=True):
    """Write ``records`` to ``output_file_name`` in fastText supervised format.

    Each record becomes one line ``__label__<res> <title> <text>``; the lines
    are written in random order.

    Args:
        records: sequence of dicts with "title", "text" and "res" keys. The
            caller's sequence keeps its original order.
        output_file_name: path of the file to create or overwrite.
        use_preprocess: when true, title and text go through ``preprocess``;
            otherwise they are written as given, except that line breaks are
            replaced by spaces so every example stays on a single line.
    """
    def as_field(value):
        if use_preprocess:
            return preprocess(value)
        # A raw newline would split the example into two lines, the second of
        # which would have no label.
        return str(value).replace("\r", " ").replace("\n", " ")

    # Shuffle a copy so the caller's list is not reordered as a side effect.
    shuffled = list(records)
    random.shuffle(shuffled)

    # Explicit UTF-8: the output must not depend on the platform's locale.
    with open(output_file_name, "w", encoding="utf-8") as w:
        for d in shuffled:
            w.write("__label__{} {} {}\n".format(d["res"], as_field(d["title"]), as_field(d["text"])))

In [None]:
import json
import random
from collections import Counter, defaultdict
from sklearn.metrics import cohen_kappa_score

def normalize(text):
    """Return ``text`` with tabs, newlines and non-breaking spaces turned into
    spaces and double quotes removed."""
    for old, new in (("\t", " "), ("\n", " "), ('"', ""), ("\xa0", " ")):
        text = text.replace(old, new)
    return text

def _read_markup_rows(answers_file_name):
    """Read the tab-separated answers file into a list of ``{column: value}`` dicts."""
    with open(answers_file_name, "r", encoding="utf-8") as tsv:
        # Strip only the line terminator: a plain strip() would also remove
        # trailing tabs, dropping empty trailing columns and failing the
        # length check below.
        header = tuple(next(tsv).rstrip("\r\n").split("\t"))
        rows = []
        for line in tsv:
            fields = line.rstrip("\r\n").split("\t")
            assert len(fields) == len(header), fields
            rows.append(dict(zip(header, fields)))
    return rows


def _average_pairwise_kappa(records):
    """Return the mean Cohen's kappa over annotator pairs, or None if no pair is comparable.

    Two annotators are compared on the urls both of them labelled.
    """
    labels_by_worker = defaultdict(dict)
    for record in records:
        labels_by_worker[record["worker_id"]][record["url"]] = record["res"]

    workers = list(labels_by_worker.values())
    scores = []
    for i, first in enumerate(workers):
        for second in workers[i + 1:]:
            shared = [url for url in first if url in second]
            if not shared:
                continue
            score = cohen_kappa_score([first[u] for u in shared], [second[u] for u in shared])
            # The range check also rejects NaN (every comparison with NaN is false).
            if -1.0 <= score <= 1.0:
                scores.append(score)
    return sum(scores) / len(scores) if scores else None


def convert_to_ft(answers_file_name, original_json, output_file_name, min_votes=3, use_preprocess=True):
    """Aggregate raw annotator answers and write them as a fastText training file.

    Steps: read the TSV, drop honeypot rows (non-empty "GOLDEN:res"), strip the
    "PREFIX:" part of column names, restore urls from ``original_json`` by
    matching normalized titles, print the average pairwise Cohen's kappa, keep
    urls whose most common label has at least ``min_votes`` votes, print the
    label distribution and write the result with ``save_to_ft``.

    Args:
        answers_file_name: TSV with a header row. After prefix stripping the
            columns must include "title", "text", "url", "worker_id" and "res".
        original_json: JSON file holding a list of objects with "title" and "url".
        output_file_name: destination path passed to ``save_to_ft``.
        min_votes: minimum number of annotators that must agree on the winning label.
        use_preprocess: forwarded to ``save_to_ft``.
    """
    rows = _read_markup_rows(answers_file_name)

    # Filter honeypots out and normalize fields. A new dict is built per row:
    # renaming keys of a dict while iterating over it is undefined behaviour
    # and raises RuntimeError on current Python versions.
    records = []
    for row in rows:
        if row.get("GOLDEN:res"):
            continue
        records.append({
            key.split(":")[-1]: value
            for key, value in row.items()
            if key not in ("GOLDEN:res", "HINT:text")
        })

    # Restore original urls (to fix a bug)
    with open(original_json, "r", encoding="utf-8") as f:
        title2url = {normalize(d["title"]): d["url"] for d in json.load(f)}
    for record in records:
        title = normalize(record["title"])
        if title in title2url:
            record["url"] = title2url[title]

    # Calc inter-annotator agreement
    avg_kappa = _average_pairwise_kappa(records)
    if avg_kappa is None:
        # Happens e.g. with a single annotator; previously a ZeroDivisionError.
        print("Avg kappa score: n/a (no comparable annotator pairs)")
    else:
        print("Avg kappa score: {}".format(avg_kappa))

    # Majority vote per url.
    votes_by_url = defaultdict(list)
    last_record_by_url = {}
    for record in records:
        votes_by_url[record["url"]].append(record["res"])
        last_record_by_url[record["url"]] = record

    data = {}
    for url, votes in votes_by_url.items():
        label, count = Counter(votes).most_common(1)[0]
        if count < min_votes:
            continue
        # Store the winning label explicitly. Keeping the record as-is would
        # export whatever the last annotator answered for this url, which can
        # differ from the majority the filter above was based on.
        data[url] = dict(last_record_by_url[url], res=label)

    rub_cnt = Counter(d["res"] for d in data.values())
    print(rub_cnt.most_common())

    save_to_ft(list(data.values()), output_file_name, use_preprocess)

# Build the fastText train and test files from the raw markup. The test set
# requires stronger annotator agreement (min_votes=4) than the train set
# (min_votes=2).
convert_to_ft("en_cat_train_raw_markup.tsv", "en_tg_train.json", "en_cat_train_markup.txt", min_votes=2, use_preprocess=True)
convert_to_ft("en_cat_test_raw_markup.tsv", "en_tg_test.json", "en_cat_test_markup.txt", min_votes=4, use_preprocess=True)

In [None]:
# Line counts of the generated markup files (save_to_ft writes one example per record).
!cat en_cat_train_markup.txt | wc -l
!cat en_cat_test_markup.txt | wc -l

In [None]:
import os
import random

def read_news_category_dataset(input_file, output_file, use_preprocess=True, seed=None):
    """Sample the News Category dataset and write it in fastText format.

    ``input_file`` is read as JSON lines. Records whose "category" appears in
    the mapping below get "title" (from "headline"), "text" (from
    "short_description") and "res" (the mapped label), and are kept with the
    category's sampling probability. The label distribution of the kept
    records is printed and they are written with ``save_to_ft``.

    Args:
        input_file: path to the JSON-lines dataset; must exist.
        output_file: destination path passed to ``save_to_ft``.
        use_preprocess: forwarded to ``save_to_ft``.
        seed: optional seed for the sampling. With the default ``None`` the
            global ``random`` state is used, exactly as before.

    Returns:
        The list of kept records (dicts).
    """
    assert os.path.exists(input_file)

    # Source category -> (target label, keep probability).
    # NOTE(review): the denominators look like per-category article counts, so
    # each ratio would keep roughly <numerator> examples — confirm. THE
    # WORLDPOST reuses CRIME's denominator; verify that this is intended.
    cat2res = {
        "POLITICS": ("society", 100.0/32739),
        "ENTERTAINMENT": ("entertainment", 100.0/16058),
        "BUSINESS": ("economy", 300.0/5937),
        "CRIME": ("society", 100.0/3405),
        "ARTS & CULTURE": ("entertainment", 100.0/700),
        "CULTURE & ARTS": ("entertainment", 100.0/700),
        "TECH": ("technology", 300.0/2082),
        "SCIENCE": ("science", 300.0/2178),
        "SPORTS": ("sports", 300.0/4884),
        "HEALTHY LIVING": ("not_news", 300.0/6694),
        "THE WORLDPOST": ("society", 100.0/3405),
        "FOOD & DRINK": ("other", 150.0/6226),
        "STYLE & BEAUTY": ("other", 150.0/9649)
    }

    # A private generator is used only when a seed is requested, so unseeded
    # calls still draw from (and respect any seeding of) the global state.
    rng = random if seed is None else random.Random(seed)

    records = []
    # Explicit UTF-8: decoding must not depend on the platform's locale.
    with open(input_file, "r", encoding="utf-8") as r:
        for line in r:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            category = data["category"]
            if category not in cat2res:
                continue
            res, prob = cat2res[category]
            data["title"] = data["headline"]
            data["text"] = data["short_description"]
            data["res"] = res
            if rng.random() < prob:
                records.append(data)

    rub_cnt = Counter(d["res"] for d in records)
    print(rub_cnt.most_common())

    save_to_ft(records, output_file, use_preprocess)
    return records

# Sample the News Category dataset into nc_markup.txt, then show its first
# lines and its line count.
read_news_category_dataset("News_Category_Dataset_v2.json", "nc_markup.txt")
!head nc_markup.txt
!cat nc_markup.txt | wc -l

In [None]:
# Pretrained vectors (binary file) that the next cell converts to .vec text format.
!wget https://www.dropbox.com/s/no7x1n8acl5ykif/en_vectors_v2.bin

In [None]:
# Convert the binary vectors to a text .vec file with fastText's bin_to_vec.py
# example script; the .vec file is what the training cell passes to
# -pretrainedVectors.
!wget https://raw.githubusercontent.com/facebookresearch/fastText/master/python/doc/examples/bin_to_vec.py
!python bin_to_vec.py en_vectors_v2.bin > en_vectors_v2.vec

In [None]:
# Concatenate the annotated train set with the sampled News Category examples,
# then shuffle the combined lines.
!cat en_cat_train_markup.txt nc_markup.txt > en_cat_train_all.txt
!shuf en_cat_train_all.txt > en_cat_train_shuf.txt

In [None]:
import random
with open("en_cat_train_shuf.txt", "r") as r, open("en_cat_train_train.txt", "w") as train, open("en_cat_train_val.txt", "w") as val:
    for line in r:
        if random.random() < 0.1:
            val.write(line)
        else:
            train.write(line)
!cat en_cat_train_val.txt | wc -l

In [None]:
# Train a supervised classifier initialised from the .vec vectors. Autotune
# searches hyper-parameters against the validation split under a 10M model-size
# constraint; the cells below load the resulting en_cat_v2.ftz.
# NOTE(review): -dim presumably has to equal the dimensionality of the
# pretrained vectors (50 here) — confirm against en_vectors_v2.vec.
!fasttext supervised -input en_cat_train_train.txt -pretrainedVectors en_vectors_v2.vec -dim 50 -autotune-validation en_cat_train_val.txt -output en_cat_v2 -autotune-modelsize 10M

In [None]:
# Evaluate the trained .ftz model on the test markup with fastText's built-in `test` command.
!fasttext test en_cat_v2.ftz en_cat_test_markup.txt

In [None]:
import fasttext

# Run the trained model over the test file, collecting gold labels, predicted
# labels and the misclassified examples (text truncated to 100 characters),
# then print every error.
model = fasttext.load_model("en_cat_v2.ftz")
true_labels, predicted_labels, errors = [], [], []
with open("en_cat_test_markup.txt", "r") as test_file:
    for line in test_file:
        # Each line is "__label__<res> <text>"; [9:] drops the "__label__" prefix.
        parts = line.strip().split(" ", 1)
        label = parts[0][9:]
        text = parts[1] if len(parts) > 1 else ""
        true_labels.append(label)
        predicted_label = model.predict([text])[0][0][0][9:]
        predicted_labels.append(predicted_label)
        if predicted_label != label:
            errors.append((label, predicted_label, text[:100]))
for label, predicted_label, text in errors:
    print("T: {} P: {} | {}".format(label, predicted_label, text))