In [None]:
!pip install pyonmttok fasttext

In [None]:
# Drop any archive left over from a previous run, then download it again.
!rm -f en_tg_train.tar.gz
!wget https://www.dropbox.com/s/umd8tyx4wz1wquq/en_tg_train.tar.gz
# Remove a stale extracted copy before unpacking. The archive presumably
# yields en_tg_train.json, which a later cell reads -- verify if the archive changes.
!rm -f en_tg_train.json
!tar -xzvf en_tg_train.tar.gz
# The archive is no longer needed once it has been extracted.
!rm en_tg_train.tar.gz

In [None]:
# https://www.kaggle.com/pariza/bbc-news-summary/data

!rm -f bbc-news-summary.zip
!wget https://www.dropbox.com/s/gq76b24q3x5n1ku/bbc-news-summary.zip
!unzip bbc-news-summary.zip

In [None]:
# https://www.kaggle.com/rmisra/news-category-dataset

!rm -f news-category-dataset.zip
!wget https://www.dropbox.com/s/ua18htwqrkwnfpg/news-category-dataset.zip
!unzip news-category-dataset.zip

In [None]:
# https://www.kaggle.com/snapcrack/all-the-news

!rm -f all-the-news.zip
!wget https://www.dropbox.com/s/bacg3cxckeqw6a9/all-the-news.zip
!unzip all-the-news.zip

In [None]:
import json

# Load the training records. The encoding is given explicitly so decoding does
# not depend on the platform's locale (JSON text is UTF-8).
with open("en_tg_train.json", "r", encoding="utf-8") as r:
    tg_train_data = json.load(r)

# Every record is expected to expose "title" and "text" keys.
tg_titles = [record["title"] for record in tg_train_data]
tg_texts = [record["text"] for record in tg_train_data]

# Sanity check: show the first sample and the number of records.
print(tg_titles[0])
print(tg_texts[0])
print(len(tg_titles))

In [None]:
import os

def get_bbc_texts(input_directory):
    """Collect every article under ``input_directory`` as a single-line string.

    The directory is expected to hold one sub-directory per rubric, each
    containing plain-text article files. Entries are visited in sorted order
    so the result does not depend on the file system's listing order.

    Args:
        input_directory: Path of the directory holding the rubric sub-directories.

    Returns:
        A list with one string per readable article, where every newline has
        been replaced by a space. Top-level entries that are not directories,
        nested directories, and files that are not valid UTF-8 are skipped.

    Raises:
        AssertionError: If ``input_directory`` does not exist.
    """
    assert os.path.exists(input_directory)
    records = []
    for rubric_name in sorted(os.listdir(input_directory)):
        rubric_dir = os.path.join(input_directory, rubric_name)
        if not os.path.isdir(rubric_dir):
            continue
        for entry_name in sorted(os.listdir(rubric_dir)):
            file_name = os.path.join(rubric_dir, entry_name)
            if not os.path.isfile(file_name):
                # A nested directory cannot be opened as a text file; skip it.
                continue
            try:
                with open(file_name, "r", encoding="utf-8") as r:
                    content = r.read()
            except UnicodeDecodeError:
                # Best effort: drop articles that are not valid UTF-8 instead
                # of aborting the whole load. Other errors are not hidden.
                continue
            records.append(content.replace("\n", " "))
    return records

# Read every BBC article, then show the first one followed by the total count.
bbc_texts = get_bbc_texts("BBC News Summary/News Articles")
print(bbc_texts[0], len(bbc_texts), sep="\n")

In [None]:
import json

# The file holds one JSON object per line. Each sample is the headline followed
# by the short description, separated by a single space.
nc_texts = []
# Explicit encoding so decoding does not depend on the platform's locale.
with open("News_Category_Dataset_v2.json", "r", encoding="utf-8") as r:
    for line in r:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        data = json.loads(line)
        title = data["headline"]
        text = data["short_description"]
        nc_texts.append(title + " " + text)

# Sanity check: show the first sample and the number of samples.
print(nc_texts[0])
print(len(nc_texts))

In [None]:
import csv
import sys
# Raise the csv module's per-field size limit (presumably because article
# bodies exceed the default). The limit is stored as a C long, which is only
# 32 bits wide on some platforms (e.g. Windows); there sys.maxsize overflows,
# so fall back to the largest value that always fits.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

all_the_news_files = ("articles1.csv", "articles2.csv", "articles3.csv")
atn_titles = []
atn_texts = []
for file_name in all_the_news_files:
    # Explicit encoding so decoding does not depend on the platform's locale.
    with open(file_name, "r", encoding="utf-8") as r:
        reader = csv.reader(r, delimiter=",")
        # Skip the header through the reader itself so it is parsed as a CSV
        # record rather than as a raw physical line.
        next(reader)
        for row in reader:
            # Every row must have exactly ten columns; the title is the third
            # and the article text is the last.
            _, _, title, _, _, _, _, _, _, text = row
            atn_titles.append(title)
            atn_texts.append(text)

# Sanity check: show the first sample and the number of articles.
print(atn_titles[0])
print(atn_texts[0])
print(len(atn_titles))

In [None]:
import pyonmttok
import random
# Shared tokenizer instance, created once and reused by preprocess().
tokenizer = pyonmttok.Tokenizer("conservative")


def preprocess(text):
    """Turn raw text into a lower-cased string of space-separated tokens.

    The input is converted to ``str``, stripped, has newlines and
    non-breaking spaces replaced by plain spaces, is lower-cased, and is
    then tokenized; the resulting tokens are joined with single spaces.
    """
    normalized = str(text).strip()
    for unwanted in ("\n", "\xa0"):
        normalized = normalized.replace(unwanted, " ")
    tokens, _ = tokenizer.tokenize(normalized.lower())
    return " ".join(tokens)

# Pool every text source into a single corpus.
all_samples = tg_titles + tg_texts + bbc_texts + nc_texts + atn_titles + atn_texts

# Shuffle with a dedicated, seeded generator: for the same input order the
# corpus comes out in the same order on every run, and the global random
# state is left untouched.
random.Random(42).shuffle(all_samples)

# Tokenize and drop empty results in one pass, so no second full-size
# intermediate list has to be kept in memory.
processed_all_samples = [
    processed
    for processed in map(preprocess, all_samples)
    if processed.strip()
]

# Sanity check: show the first processed sample and the corpus size.
print(processed_all_samples[0])
print(len(processed_all_samples))

In [None]:
# Clear RAM: drop the references to the raw corpora, which the cells below
# (writing and training on the processed samples) do not use.
del tg_titles, tg_texts, bbc_texts, nc_texts, all_samples, atn_titles, atn_texts

In [None]:
# Clear Disk
!rm -rf "BBC News Summary"
!rm -rf "bbc news summary"
!rm News_Category_Dataset_v2.json
!rm en_tg_train.json
!rm articles1.csv
!rm articles2.csv
!rm articles3.csv

In [None]:
# Write the corpus as UTF-8 plain text, one stripped sample per line.
with open("train.txt", "w", encoding="utf-8") as w:
    w.writelines(sample.strip() + "\n" for sample in processed_all_samples)

In [None]:
# Pack the training corpus into a gzip-compressed tarball. train.txt itself
# stays in place, since the training cell below reads it.
!tar -czvf en_unsupervised_train.tar.gz train.txt

In [None]:
from fasttext import train_unsupervised

# Train an unsupervised fastText skip-gram model on the corpus in train.txt.
model = train_unsupervised(
    "train.txt",
    model="skipgram",
    dim=50,
    epoch=10,
    minCount=50,
    bucket=200000,
    verbose=2,
)