#  Классификация документов с помощью библиотеки fastText

### Установка fastText

Выполните следующие команды в папке, где находится этот ноутбук:
1. git clone https://github.com/facebookresearch/fastText.git
2. cd fastText
3. make

In [1]:
from sklearn.datasets import fetch_20newsgroups
import re
import nltk

## Загрузка датасета 20 newsgroups

In [2]:
# Newsgroups used for the classification task.
categories = [
    'talk.politics.misc',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
    'rec.autos',
]
# Parts of each message that fetch_20newsgroups is asked to remove.
remove = ('headers', 'footers', 'quotes')

print("Loading 20 newsgroups dataset for categories:")
print(categories or "all")

# Arguments shared by the train and test fetches; only `subset` differs.
_fetch_kwargs = dict(categories=categories, shuffle=True,
                     random_state=42, remove=remove)
data_train = fetch_20newsgroups(subset='train', **_fetch_kwargs)
data_test = fetch_20newsgroups(subset='test', **_fetch_kwargs)
print('data loaded')

Loading 20 newsgroups dataset for categories:
['talk.politics.misc', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.autos']
data loaded


## Предобработка текста

In [5]:
# Download the 'punkt' tokenizer data (presumably what word_tokenize relies on in preprocess_text).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/annakupriyanova/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# Download the stop-word lists read via nltk.corpus.stopwords in preprocess_text.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annakupriyanova/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [30]:
# Download the WordNet corpus (presumably for nltk's WordNetLemmatizer, which is imported later in the notebook).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annakupriyanova/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [39]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

def preprocess_text(text):
    """Turn raw English text into a space-separated string of word stems.

    Processing order: tokenize with ``word_tokenize``, lower-case each token,
    delete every ``string.punctuation`` character from it, keep the token only
    if ``str.isalpha()`` is true, drop NLTK English stop words, and reduce the
    remaining words with the Porter stemmer.

    Args:
        text: The raw document text.

    Returns:
        The stems joined by single spaces.
    """
    raw_tokens = word_tokenize(text)

    punctuation_table = str.maketrans('', '', string.punctuation)
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Alternative to stemming (not enabled): WordNetLemmatizer().lemmatize(word)

    stems = []
    for raw_token in raw_tokens:
        cleaned = raw_token.lower().translate(punctuation_table)
        # isalpha() is False for '', so tokens that consisted only of
        # punctuation are discarded here together with numbers and mixed tokens.
        if not cleaned.isalpha():
            continue
        if cleaned in english_stop_words:
            continue
        stems.append(stemmer.stem(cleaned))
    return ' '.join(stems)

In [35]:
# NOTE(review): the saved output of this cell comes from an earlier run (In [35]),
# before the current preprocess_text definition (In [39]) was executed: its words
# are not stemmed ("looked", "driving"). Re-run the cell to refresh the output.
preprocess_text(data_train.data[0])

'wife looked drove one last fall model wayyyyyyyyy underpowered could imagine driving mountain colorado anything approaching highway speed read new model newer improved hp engine quite serious laughed salesman face said broken feel powerful used driving jeep engine believe land cruiser land yacht same also underpowered personal opinion big car roomy nothing spectacular'

## Структурирование датасета в формате библиотеки fasttext

Формат библиотеки fastText: каждая строка содержит запись вида `__label__<label_id> text`, то есть метку класса с префиксом `__label__` и через пробел текст документа. Например: `__label__1 computers will never die`.

In [13]:
def strip_formatting(string):
    """Apply lightweight regex-based cleanup to a raw text.

    The text is lower-cased, all digits are removed, every newline or tab is
    replaced by a space, and each of the characters ``. ! ? , ' / ( )`` is
    surrounded by spaces so that it becomes a separate whitespace-delimited
    token.

    Args:
        string: The raw text.

    Returns:
        The cleaned text, containing no newline or tab characters.
    """
    string = string.lower()
    string = re.sub(r"\d", "", string)
    # Character class on purpose: the previous pattern r"\n\t" matched only a
    # newline immediately followed by a tab, so ordinary line breaks survived.
    string = re.sub(r"[\n\t]", " ", string)
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string

In [22]:
def fasttext_format_dataset(data, target, filename, preprocess=None):
    """Write a labelled text dataset to a file in fastText's supervised format.

    One line is written per document: ``__label__<label> <text>``.

    Args:
        data: Iterable of raw document texts.
        target: Iterable of labels, paired with ``data`` by position.
        filename: Path of the output file; it is overwritten if it exists.
        preprocess: Optional callable mapping a raw text to the text that is
            written. Defaults to ``preprocess_text``, so existing
            three-argument calls behave as before.
    """
    if preprocess is None:
        preprocess = preprocess_text
    # Explicit UTF-8: without it the platform default encoding is used, which
    # can fail on non-ASCII words.
    with open(filename, 'w', encoding='utf-8') as fout:
        for raw_text, label in zip(data, target):
            text = preprocess(raw_text)
            # Collapse all whitespace so a record can never span several
            # lines; this is a no-op for preprocess_text output.
            text = ' '.join(text.split())
            fout.write('__label__{} {}\n'.format(label, text))

In [40]:
# Export both splits in fastText format: (split, output file) pairs,
# train first and test second.
for split, out_file in ((data_train, 'ft_train.txt'), (data_test, 'ft_test.txt')):
    fasttext_format_dataset(split.data, split.target, out_file)

print('Train and test datasets are ready.')

Train and test datasets are ready.


## Обучение и тестирование модели fasttext

In [41]:
%%bash
# Train a supervised classifier on ft_train.txt with fastText's default
# hyperparameters; the evaluation cell that follows loads ft_model.bin.
./fastText/fasttext supervised -input ft_train.txt -output ft_model

Read 0M words
Number of words:  21309
Number of labels: 5
Progress:  94.2% words/sec/thread:  870869 lr:  0.005762 loss:  1.601482 ETA:   0h 0mProgress: 100.9% words/sec/thread:  495533 lr: -0.000880 loss:  1.595745 ETA:   0h 0mProgress: 100.0% words/sec/thread:  494845 lr:  0.000000 loss:  1.595745 ETA:   0h 0m


In [42]:
%%bash
# Evaluate ft_model.bin on the test file. Prints N (number of examples),
# P@1 (precision at 1) and R@1 (recall at 1).
./fastText/fasttext test ft_model.bin ft_test.txt

N	1740
P@1	0.329
R@1	0.329


### Эксперименты с параметрами обучения

In [92]:
%%bash
# Retrain with 100 epochs (-epoch), learning rate 0.8 (-lr) and
# 100-dimensional word vectors (-dim). The output prefix is again ft_model,
# so the model from the default-parameter run is overwritten.
./fastText/fasttext supervised -input ft_train.txt -output ft_model -epoch 100 -lr 0.8 -dim 100

Read 0M words
Number of words:  21309
Number of labels: 5
Progress:   3.6% words/sec/thread:  722810 lr:  0.771072 loss:  1.296156 ETA:   0h 0mProgress:   7.8% words/sec/thread:  796602 lr:  0.737761 loss:  1.071087 ETA:   0h 0mProgress:  12.0% words/sec/thread:  823038 lr:  0.704335 loss:  0.900524 ETA:   0h 0mProgress:  16.2% words/sec/thread:  840714 lr:  0.670246 loss:  0.827891 ETA:   0h 0mProgress:  20.2% words/sec/thread:  847481 lr:  0.638446 loss:  0.765375 ETA:   0h 0mProgress:  23.5% words/sec/thread:  827399 lr:  0.611915 loss:  0.719913 ETA:   0h 0mProgress:  27.0% words/sec/thread:  814817 lr:  0.583731 loss:  0.678697 ETA:   0h 0mProgress:  31.0% words/sec/thread:  821143 lr:  0.551736 loss:  0.662111 ETA:   0h 0mProgress:  34.7% words/sec/thread:  817851 lr:  0.522371 loss:  0.633675 ETA:   0h 0mProgress:  36.7% words/sec/thread:  780933 lr:  0.506001 loss:  0.605237 ETA:   0h 0mProgress:  39.0% words/sec/thread:  755488 lr:  0.488123 loss:  0.591354 ETA:   

In [93]:
%%bash
# Evaluate the retrained ft_model.bin on the same test file.
./fastText/fasttext test ft_model.bin ft_test.txt

N	1740
P@1	0.781
R@1	0.781


### Обучим модель заново, добавив биграммы

In [94]:
%%bash
# Same hyperparameters as the previous training run, plus word bigrams
# (-wordNgrams 2). Saved under a separate prefix, ft_model_2grams, so
# ft_model.bin is left untouched.
./fastText/fasttext supervised -input ft_train.txt -output ft_model_2grams -epoch 100 -lr 0.8 -dim 100 -wordNgrams 2

Read 0M words
Number of words:  21309
Number of labels: 5
Progress:   1.5% words/sec/thread:  330687 lr:  0.787939 loss:  1.619662 ETA:   0h 0mProgress:   3.4% words/sec/thread:  367811 lr:  0.772869 loss:  1.491253 ETA:   0h 0mProgress:   5.3% words/sec/thread:  380461 lr:  0.757443 loss:  1.403956 ETA:   0h 0mProgress:   7.2% words/sec/thread:  384323 lr:  0.742361 loss:  1.320187 ETA:   0h 0mProgress:   9.1% words/sec/thread:  388638 lr:  0.726929 loss:  1.223613 ETA:   0h 0mProgress:  11.0% words/sec/thread:  390921 lr:  0.711662 loss:  1.146601 ETA:   0h 0mProgress:  12.9% words/sec/thread:  392958 lr:  0.696698 loss:  1.095879 ETA:   0h 0mProgress:  14.8% words/sec/thread:  393924 lr:  0.681741 loss:  1.002854 ETA:   0h 0mProgress:  16.7% words/sec/thread:  394505 lr:  0.666715 loss:  0.965254 ETA:   0h 0mProgress:  18.6% words/sec/thread:  395916 lr:  0.651150 loss:  0.937764 ETA:   0h 0mProgress:  20.4% words/sec/thread:  396198 lr:  0.636484 loss:  0.912760 ETA:   

In [95]:
%%bash
# Evaluate the bigram model on the same test file.
./fastText/fasttext test ft_model_2grams.bin ft_test.txt

N	1740
P@1	0.779
R@1	0.779


## Классификация новых текстов

In [96]:
print('Categories:')
# Enumerate the actual label names instead of a hard-coded range(5), so the
# listing stays correct if the set of categories changes. The position in
# target_names is the numeric label id.
for label_id, label_name in enumerate(data_train.target_names):
    print(f'{label_id} - {label_name}')

Categories:
0 - comp.graphics
1 - rec.autos
2 - sci.space
3 - talk.politics.misc
4 - talk.religion.misc


In [99]:
# Preprocess the new text with the same function that was applied to the
# training data, so the model receives tokens in the form it was trained on.
preprocess_text("Computers are cool stuff. I'd  like to work with software.")

'comput cool stuff like work softwar'

In [100]:
%%bash
# Predict the most likely label for one preprocessed text read from stdin ("-").
# The text is piped in explicitly. Previously it was a bare line of the script,
# which reached fastText only because bash was itself reading the script from
# stdin; in any other setting bash would try to run it as a command.
printf '%s\n' "comput cool stuff like work softwar" | ./fastText/fasttext predict ft_model.bin -

__label__0


### Топ-3 предсказанных категорий

In [101]:
# Preprocess the query exactly like the training data before passing it to the model.
preprocess_text("Russian President signs new migration policy concept.")

'russian presid sign new migrat polici concept'

In [102]:
%%bash
# Predict the 3 most likely labels (trailing "3") for one preprocessed text
# read from stdin ("-"). The text is piped in explicitly rather than left as a
# bare script line, which worked only because bash was reading the script
# from stdin.
printf '%s\n' "russian presid sign new migrat polici concept" | ./fastText/fasttext predict ft_model.bin - 3

__label__3 __label__1 __label__2


# Задание на практику

Поменять параметры классификатора и/или алгоритм предобработки датасета, чтобы добиться более высоких показателей точности и полноты.