# Распознавание языка текста

<hr>

С.Ю. Папулин (papulin.study@yandex.ru)

### Содержание

- [Статический текст](#Статический-текст)
- [Динамический текст](#Динамический-текст)
    - [Построение модели](#Построение-модели)
    - [Проверка динамического распознавания](#Проверка-динамического-распознавания)

Подключение библиотек:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## Статический текст

[Набор данных](https://huggingface.co/datasets/papluca/language-identification)

In [None]:
"""
20 Languages Dataset:
https://huggingface.co/datasets/papluca/language-identification
"""

import gzip
import logging
from os import makedirs, remove
from os.path import exists, join

import numpy as np
import pandas as pd
from sklearn.datasets import get_data_home
from sklearn.utils import Bunch

try:
    # Newer scikit-learn releases keep these helpers in the private `_base` module
    from sklearn.datasets._base import RemoteFileMetadata, _fetch_remote
except ImportError:
    # Fallback for old releases that still expose `sklearn.datasets.base`
    from sklearn.datasets.base import RemoteFileMetadata, _fetch_remote


logger = logging.getLogger(__name__)


# Remote files that make up the dataset: the local file name to store each
# split under, the URL it is fetched from, and its expected checksum.
ARCHIVES = [
    RemoteFileMetadata(
        filename='languages_train.csv',
        url='https://huggingface.co/datasets/papluca/language-identification/resolve/main/train.csv',
        checksum='f180d78a1f0e758fd33bb1bae37f62eebc538d78ece2affb3d05a967850ba474',
    ),
    RemoteFileMetadata(
        filename='languages_test.csv',
        url='https://huggingface.co/datasets/papluca/language-identification/resolve/main/test.csv',
        checksum='cb7dfe272142815573b735b5d555d42d28d0d648187020f2d2eb3eebd772e759',
    ),
]


def fetch_languages(data_home=None, download_if_missing=True, subset='all', return_X_y=False):
    """Download (when needed) and load the 20-language identification dataset.

    Every file listed in ``ARCHIVES`` is expected in the data directory; a
    missing file is downloaded there unless ``download_if_missing`` is False.

    Parameters
    ----------
    data_home : str or None
        Forwarded to ``get_data_home`` to resolve the data directory.
    download_if_missing : bool
        When False, a missing file raises ``IOError`` instead of being
        downloaded.
    subset : str
        ``'train'`` or ``'test'`` loads that split only. Any other value
        (including the default ``'all'``) loads both splits.
    return_X_y : bool
        Accepted for backward compatibility only. The result is a ``Bunch``
        whatever its value (the default ``False`` used to make the function
        return ``None`` after downloading the files).

    Returns
    -------
    Bunch
        ``data`` is a DataFrame for a single split, or the dict
        ``{'train': DataFrame, 'test': DataFrame}`` for both splits;
        ``DESCR`` is the textual description of the dataset.

    Raises
    ------
    IOError
        If a file is missing and ``download_if_missing`` is False.
    """
    data_home = get_data_home(data_home=data_home)
    makedirs(data_home, exist_ok=True)

    # All archives are checked, even when only one split is requested.
    for archive in ARCHIVES:
        filepath = join(data_home, archive.filename)
        if exists(filepath):
            continue
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")
        logger.info('Downloading Languages from %s to %s', archive.url, filepath)
        _fetch_remote(archive, dirname=data_home)

    DESCR = (
        '20 Languages Dataset\n'
        '--------------------\n'
        'The Language Identification dataset is a collection of 90k samples consisting of text passages and corresponding language label. This dataset was created by collecting data from 3 sources: [Multilingual Amazon Reviews Corpus](https://huggingface.co/datasets/amazon_reviews_multi), [XNLI](https://huggingface.co/datasets/xnli), and [STSb Multi MT](https://huggingface.co/datasets/stsb_multi_mt).\n'
        '\n'
        'The Language Identification dataset contains text in 20 languages, which are:\n'
        'arabic (ar), bulgarian (bg), german (de), modern greek (el), english (en), spanish (es), french (fr), hindi (hi), italian (it), japanese (ja), dutch (nl), polish (pl), portuguese (pt), russian (ru), swahili (sw), thai (th), turkish (tr), urdu (ur), vietnamese (vi), and chinese (zh)\n'
        '\n'
        'For each instance, there is a string for the text and a string for the label (the language tag). Here is an example:\n'
        "{'labels': 'fr', 'text': 'Conforme à la description, produit pratique.'}"
    )

    if subset in ('train', 'test'):
        data = _load_X_y(data_home, subset)
    else:
        data = {
            'train': _load_X_y(data_home, 'train'),
            'test': _load_X_y(data_home, 'test'),
        }
    return Bunch(data=data, DESCR=DESCR)


def _load_X_y(path, subset='train'):
    """Read the CSV file of the given subset from ``path`` into a DataFrame."""
    csv_name = 'languages_{}.csv'.format(subset)
    csv_path = join(path, csv_name)
    return pd.read_csv(csv_path)


In [None]:
# Load the data
dataset = fetch_languages(return_X_y=True)

# Print the dataset description
print(dataset.DESCR)

In [None]:
# Training split of the fetched dataset; preview its first rows
df = dataset.data['train']
df.head()

In [None]:
# df = pd.read_csv('../data/lang_detector/train.csv')
# df.head()

In [None]:
# Summary statistics of the training frame
df.describe()

In [None]:
# Number of texts in each class
df['labels'].value_counts()

In [None]:
# Mean number of characters in the texts of each class.
# The 'text' column is selected explicitly: a callable passed to
# DataFrameGroupBy.agg may be applied per column (a Series) rather than to the
# whole group frame, in which case indexing the group with 'text' fails.
df['text'].str.len().groupby(df['labels']).mean()

In [None]:
# Baseline model: TF-IDF vectorizer (default settings) followed by
# a multinomial naive Bayes classifier
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

In [None]:
# Fit the pipeline on the training texts and their language labels
pipeline.fit(df['text'], df['labels'])

In [None]:
# Test split taken from the same fetched dataset as `df`, so the notebook
# does not depend on a separate local copy of test.csv
df_test = dataset.data['test']

In [None]:
# Evaluate the fitted pipeline on the test texts and labels
pipeline.score(df_test['text'], df_test['labels'])

## Динамический текст

### Построение модели

In [None]:
import re

In [None]:
# Source data
df.head()

In [None]:
# Pattern used to split a string into words: one or more non-word characters.
# Raw string, so that `\W` is not treated as a (invalid) string escape.
COMPILER = re.compile(r"\W+", re.UNICODE)


def split_sentence(lang, text):
    """Return the unique words of ``text``, each paired with ``lang``.

    Parameters
    ----------
    lang : label attached to every word (the language tag).
    text : str, the text to split on runs of non-word characters.

    Returns
    -------
    list of ``(lang, word)`` tuples, one per distinct non-empty word, in
    order of first occurrence in ``text``.
    """
    # dict.fromkeys removes repeats while keeping a deterministic order;
    # empty strings appear when the text starts or ends with a separator.
    unique_words = dict.fromkeys(COMPILER.split(text))
    return [(lang, word) for word in unique_words if word]


# Build the list of (language, word) pairs over all training texts.
# Iterating the two columns directly avoids creating a Series for every row.
data = list()
for lang, text in zip(df['labels'], df['text']):
    data.extend(split_sentence(lang, text))


data[:5]

In [None]:
# Build the language-word DataFrame and drop duplicate rows
df_new = pd.DataFrame(data=data, columns=['labels', 'word']).drop_duplicates()
df_new.head()

In [None]:
INPUT = 'обуч'

# Per language, count the words that contain the fragment
print(
    df_new.loc[df_new['word'].str.contains(INPUT)]
    .groupby('labels')
    .count()
    .T
)

In [None]:
# Prior probabilities of the classes.
# `class_prior` must be defined before the classifier is built. None leaves
# the choice of priors to MultinomialNB (per the scikit-learn docs they are
# then estimated from the training data).
# Explicit alternatives for the 20 classes:
# class_prior = [
#     0.04, 0.04, 0.05, 0.05, 0.1, 0.05, 0.05, 0.04, 0.05, 0.05,
#     0.05, 0.05, 0.05, 0.05, 0.05, 0.04, 0.04, 0.05, 0.05, 0.05
# ]
# class_prior = [0.05]*20
class_prior = None

# Build the classification model: TF-IDF over character 2- to 4-grams
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(2,4))),
    ('classifier', MultinomialNB(class_prior=class_prior))
])

# Train the model on the unique (language, word) pairs
pipeline.fit(df_new['word'], df_new['labels'])

print(
    # Quality on the test set (from the first task)
    f"Accuracy = { pipeline.score(df_test['text'], df_test['labels']) }"
)

In [None]:
# pipeline.named_steps['vectorizer'].vocabulary_

In [None]:
# Classes known to the fitted classifier
langs = pipeline.named_steps['classifier'].classes_
langs

In [None]:
INPUT = 'обуч'

# Class membership probabilities for a single word
probs = pipeline.predict_proba([INPUT,])[0]
probs

### Проверка динамического распознавания 

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

In [None]:
def display_prediction(langs, probs):
    """
    Print the ten most probable languages as a DataFrame.

    `langs` and `probs` are paired element by element; the rows are
    sorted by probability in descending order before printing.
    """
    table = pd.DataFrame(
        data=list(zip(langs, probs)),
        columns=['lang', 'prob'],
    )
    ranked = table.sort_values('prob', ascending=False)
    print(ranked.head(10))

In [None]:
# Text input
text_input = widgets.Text()
display(text_input)

# Output area for the prediction result
output = widgets.Output()
display(output)


def handle_process_text(sender):
    """
    Re-run the prediction for the new input value and show it in `output`.

    `sender` is the notification passed by `observe`; its `new` attribute
    is used as the text to classify.
    """
    with output:
        # Drop the previous table before printing the fresh one
        clear_output()
        class_probs = pipeline.predict_proba([sender.new])[0]
        class_names = pipeline.named_steps['classifier'].classes_
        display_prediction(class_names, class_probs)


# Watch the input: call the handler whenever the widget's `value` changes
text_input.observe(handle_process_text, names='value')