In [None]:
# Install the third-party packages used by this notebook. The %pip magic is
# used instead of a `!pip` shell escape so that the packages are installed into
# the environment of the running kernel.
# NOTE(review): versions are unpinned — pin them once a known-good set is confirmed.
%pip install pyonmttok fasttext bs4 pandas

In [None]:
# Download the ru_tg_train archive; any archive left over from an earlier run
# is removed first.
!rm -f ru_tg_train.tar.gz
!wget https://www.dropbox.com/s/1ecl9orr2tagcgi/ru_tg_train.tar.gz
# Remove a previously extracted ru_tg_train.json before unpacking
# (presumably the file the archive contains — it is the one loaded later on).
!rm -f ru_tg_train.json
!tar -xzvf ru_tg_train.tar.gz
# The archive itself is not needed after extraction.
!rm ru_tg_train.tar.gz

In [None]:
# Lenta.Ru news dataset, v1.0 release.
# Source: https://github.com/yutkin/Lenta.Ru-News-Dataset

# Remove any compressed copy left over from an earlier run, then download.
!rm -f lenta-ru-news.csv.gz
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
# Remove a previously decompressed CSV before decompressing the fresh download.
!rm -f lenta-ru-news.csv
!gzip -d lenta-ru-news.csv.gz

In [None]:
# RIA news dataset.
# Source: https://github.com/RossiyaSegodnya/ria_news_dataset

# Remove any compressed copy left over from an earlier run, then download.
!rm -f ria.json.gz
!wget https://github.com/RossiyaSegodnya/ria_news_dataset/raw/master/ria.json.gz
# Remove a previously decompressed ria.json before decompressing the fresh download.
!rm -f ria.json
!gzip -d ria.json.gz

In [None]:
import json

# Load the ru_tg_train records. The encoding is given explicitly so the file is
# decoded as UTF-8 instead of the platform's locale default, matching how the
# other text files in this notebook are opened.
with open('ru_tg_train.json', "r", encoding="utf-8") as r:
    tg_train_data = json.load(r)

# Titles and article bodies are collected into two separate lists.
tg_titles = [record["title"] for record in tg_train_data]
tg_texts = [record["text"] for record in tg_train_data]

# Sanity check: first title, first text and the number of records.
print(tg_titles[0])
print(tg_texts[0])
print(len(tg_titles))

In [None]:
import pandas as pd

# Read the Lenta CSV: comma-separated, double-quoted fields, backslash escapes,
# first row is the header.
dataset = pd.read_csv("lenta-ru-news.csv", sep=',', quotechar='\"', escapechar='\\', encoding='utf-8', header=0)

# Empty cells are read as float NaN. They are dropped here because the later
# preprocessing stringifies every value, which would turn each NaN into a
# literal "nan" line in the training corpus.
lenta_titles = dataset["title"].dropna().tolist()
lenta_texts = dataset["text"].dropna().tolist()

# Sanity check: first title, first text and the number of titles.
print(lenta_titles[0])
print(lenta_texts[0])
print(len(lenta_titles))

In [None]:
from bs4 import BeautifulSoup

# ria.json is read as one JSON object per line; the "text" field is run through
# an HTML parser to strip markup. Only records that have both a title and a
# non-empty cleaned text are kept.
ria_titles = []
ria_texts = []
with open("ria.json", "r", encoding="utf-8") as r:
    for line in r:
        line = line.strip()
        if not line:
            # A blank line is not a JSON document; skip it instead of failing.
            continue
        data = json.loads(line)
        title = data["title"]
        text = data["text"]
        # Check the raw fields before parsing: an empty or null text must not
        # reach the HTML parser, and records without a title would be discarded
        # anyway, so parsing them is wasted work.
        if not title or not text:
            continue
        clean_text = str(BeautifulSoup(text, 'html.parser').text)
        if not clean_text:
            # The text consisted of markup only.
            continue
        ria_titles.append(title)
        ria_texts.append(clean_text)

# Sanity check: first title, first text and the number of kept records.
print(ria_titles[0])
print(ria_texts[0])
print(len(ria_titles))

In [None]:
import pyonmttok
import random

# One shared tokenizer instance ("conservative" mode, no joiner annotation).
tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=False)


def preprocess(text):
    """Normalize a value into a lowercased, space-separated token string.

    The value is converted with ``str``, stripped, has newlines and
    non-breaking spaces replaced by plain spaces, and is lowercased. The
    result is tokenized with the module-level ``tokenizer`` and the tokens are
    joined back with single spaces.
    """
    cleaned = str(text).strip()
    for separator in ("\n", "\xa0"):
        cleaned = cleaned.replace(separator, " ")
    tokens, _features = tokenizer.tokenize(cleaned.lower())
    return " ".join(tokens)

# Seed the generator so the subsampling and the shuffle below give the same
# corpus on every Restart & Run All.
SAMPLING_SEED = 42
random.seed(SAMPLING_SEED)


def _bernoulli_sample(items, keep_probability):
    """Return the items that survive one independent random draw each.

    Every element of ``items`` is kept when ``random.random()`` falls below
    ``keep_probability``, so roughly ``keep_probability * len(items)`` elements
    remain (all of them when the probability is 1 or more). Order is preserved.
    """
    return [item for item in items if random.random() < keep_probability]


# Each dataset should be equally represented: Lenta and RIA are thinned down to
# roughly the size of the tg_train lists.
lenta_sample_rate = float(len(tg_titles)) / len(lenta_titles)
sampled_lenta_titles = _bernoulli_sample(lenta_titles, lenta_sample_rate)
sampled_lenta_texts = _bernoulli_sample(lenta_texts, lenta_sample_rate)

ria_sample_rate = float(len(tg_titles)) / len(ria_titles)
sampled_ria_titles = _bernoulli_sample(ria_titles, ria_sample_rate)
sampled_ria_texts = _bernoulli_sample(ria_texts, ria_sample_rate)

# Titles and texts are pooled into a single list of independent samples.
all_samples = tg_titles + tg_texts + sampled_lenta_titles + sampled_lenta_texts + sampled_ria_titles + sampled_ria_texts

random.shuffle(all_samples)
processed_all_samples = [preprocess(text) for text in all_samples]
# Drop samples that became empty after preprocessing.
processed_all_samples = [text for text in processed_all_samples if text.strip()]
print(processed_all_samples[0])

In [None]:
# Free memory: drop the raw and subsampled lists. The write-out cell below only
# reads processed_all_samples.
del (
    lenta_titles,
    lenta_texts,
    ria_titles,
    ria_texts,
    tg_titles,
    tg_texts,
    all_samples,
    sampled_ria_titles,
    sampled_ria_texts,
    sampled_lenta_titles,
    sampled_lenta_texts,
)

In [None]:
# Free disk space: the three source files have already been loaded into memory.
!rm lenta-ru-news.csv ria.json ru_tg_train.json

In [None]:
# Write the corpus to train.txt, one stripped sample per line.
with open("train.txt", "w", encoding="utf-8") as w:
    w.writelines(f"{entry.strip()}\n" for entry in processed_all_samples)

In [None]:
# Pack train.txt into a gzip-compressed tar archive.
!tar -czvf ru_unsupervised_train.tar.gz train.txt

In [None]:
# Author's note: run this cell on a machine with many threads — in Colab it
# takes an extremely long time to finish.

from fasttext import train_unsupervised

# Train unsupervised skip-gram vectors on the corpus written to train.txt.
model = train_unsupervised(
    'train.txt',
    model='skipgram',
    dim=50,
    epoch=10,
    minCount=100,
    bucket=200000,
    verbose=2,
)