## Environment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Basic
import os
import re
import sys
import glob
import numpy as np
import pandas as pd

from pathlib import Path
from pprint import pprint
from collections import defaultdict, Counter
from typing import (List, Dict, Any, NoReturn, 
                    Tuple, Optional, Union)
from tqdm import tqdm_notebook

import multiprocessing
from multiprocessing_logging import install_mp_handler

import logging
logging.basicConfig(level=logging.DEBUG,
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

import warnings
warnings.filterwarnings('ignore')

print(f"Number CPU: {multiprocessing.cpu_count()}")

Number CPU: 12


In [3]:
# W2V
import spacy
import pymorphy2
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases
from nltk.stem.snowball import SnowballStemmer
from gensim.models.phrases import Phrases, npmi_scorer
from gensim.models import word2vec, keyedvectors

### Define paths

In [4]:
# Project folders, all expressed relative to the parent directory ("..").
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR.joinpath("data")      # CSV inputs are globbed from here
LISTS_DIR = BASE_DIR.joinpath("lists")    # passed to the preprocessor as intro_words_path
MODEL_DIR = BASE_DIR.joinpath("models")

## Load data

In [5]:
# Read every CSV export found in DATA_DIR and stack the parts into one frame.
# The paths are sorted because glob returns them in arbitrary, OS-dependent
# order, which would make the row order of `df` differ between runs.
csv_paths = sorted(glob.glob(str(DATA_DIR / "*.csv")))
if not csv_paths:
    # Without this guard pd.concat([]) fails with "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {DATA_DIR}")

df_parts = []
for fn in tqdm_notebook(csv_paths):
    df_part = pd.read_csv(fn, sep=';', encoding='utf-8')
    print(f"Data part of shape: {df_part.shape}")
    df_parts.append(df_part)
# Per-file indexes are discarded in favour of one continuous RangeIndex.
df = pd.concat(df_parts).reset_index(drop=True)
print(f"All data of shape: {df.shape}")
df.head()

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Data part of shape: (3944, 10)
Data part of shape: (9200, 10)

All data of shape: (13144, 10)


Unnamed: 0,incident,req_reason,req_reg_datetime,req_num,msg,product,subproduct,subject,s_subject,day_of_the_year
0,OTHER,Инцидент,2020-11-05 11:27:21,2011050726933001,здравствуйте здравствуйте я застрахованное лиц...,Страхование,.Страховой случай,Консультация по продуктам и обслуживанию,"Разъяснения (условия, сроки, статусы рассмотре...",[310]
1,OTHER,Инцидент,2020-11-19 15:27:04,2011190004665301,Куда #ТОПОНИМ мой аватар Ну если не понял то...,Онлайн-сервисы,Функционирование МБ/СБОЛ/МП,Работа в системе,Информация по возможностям/ограничениям,[324]
2,OTHER,Инцидент,2020-11-20 13:16:35,2011200083620001,меня зовут #ФИО здравствуйте здравствуйте у ...,Физ. лица - иные услуги/продукты,Кредитная история (БКИ),Вопросы по отчету,Отчет не поступил/поступил с ошибкой,[325]
3,OTHER,Инцидент,2020-11-21 19:29:17,2011210190303001,#ФИО здравствуйте добрый день я вас слушаю я...,Онлайн-сервисы,Функционирование МБ/СБОЛ/МП,Работа в системе,Информация по возможностям/ограничениям,[326]
4,IM0104549077,Инцидент,2020-11-20 13:40:55,2011200086706501,меня зовут #ФИО здравствуйте #ФИО добрый д...,Онлайн-сервисы,Функционирование МБ/СБОЛ/МП,Работа в системе,Некорректная работа Мобильного Приложения СБОЛ,[325]


## Configure preprocessing

In [11]:
def file_opener(filename: str) -> str:
    """
    Read a text file and return its non-empty lines joined with '|'.

    The file is decoded as 'utf-8-sig', so a leading BOM is not part of
    the result. Lines are split on '\\n'; empty ones are skipped.
    """
    with open(filename, mode='rt', encoding='utf-8-sig') as handle:
        content = handle.read()
    kept_lines = filter(None, content.split('\n'))
    return "|".join(kept_lines)


class DataPreprocessorLemmatizer:
    """
    Tokenize, filter and lemmatize texts.

    Tokens come from a blank spaCy 'ru' pipeline; lemmas and grammatical
    tags come from pymorphy2. A token is kept only if it is alphabetic,
    longer than two characters, not a spaCy stop word, its lemma is in
    neither of the two word lists loaded from `intro_words_path`, and its
    pymorphy2 tag carries none of the grammemes in `stopgrams`.
    """

    # Columns handled by `process()` when no explicit list is passed.
    text_features = ['msg']
    # pymorphy2 grammemes; a token whose tag contains any of them is dropped.
    stopgrams = [
             'CONJ',   # conjunction
             'PRCL',   # particle
             'PRED',   # predicative
             'NPRO',   # pronoun-noun
             'INTJ',   # interjection
             'Erro',   # error
             'Dist',   # distortion
             'Ques',   # interrogative word
             'Dmns',   # demonstrative word
             'Prnt'   # parenthetical (introductory) word
            ]

    def __init__(self, multipocess: bool, num_processors: int=16, chunksize: int=100, 
                intro_words_path: str = './lists'):
        """
        :param multipocess: run `process_texts` through a multiprocessing pool
                    (the misspelt name is kept so existing callers still work).
        :param num_processors: pool size used when `multipocess` is True.
        :param chunksize: chunk size handed to `Pool.imap`.
        :param intro_words_path: directory holding 'intro.txt' and
                    'NLTK_stopwords.txt', one entry per line.
        """
        # Language parsers
        self.__morph = pymorphy2.MorphAnalyzer()
        self.__nlp = spacy.blank('ru')
        # Multiprocessing params
        self.__multipocess = multipocess
        self.__num_processors = num_processors
        self.__chunksize = chunksize
        # Cleaning utils. The getters below expose these two in their original
        # form: a '|'-joined string and a list respectively.
        self.__intro_words = file_opener(os.path.join(intro_words_path, 'intro.txt'))
        self.__nltk_stopwords = file_opener(os.path.join(intro_words_path, 'NLTK_stopwords.txt')).split("|")
        # One set for exact, O(1) lemma lookups. Testing `lemma in
        # self.__intro_words` directly would be a *substring* search over the
        # joined string and would drop unrelated lemmas.
        self.__blocked_lemmas = (frozenset(self.__nltk_stopwords)
                                 | frozenset(self.__intro_words.split("|")))

    def get_stopwords(self):
        """ 
        Return the stop-word list loaded from 'NLTK_stopwords.txt'.
        """
        return self.__nltk_stopwords

    def get_intro_words(self):
        """ 
        Return the intro words as a single '|'-joined string.
        """
        return self.__intro_words
    
    def get_analyzer(self):
        """
        Give access to the pymorphy2 analyzer instance.
        """
        return self.__morph

    
    def _process_text(self, text: str) -> List[str]:
        """ 
        Process a single text and return its list of lemmas.

        Missing values (anything `pd.isna` reports as NA) give an empty list.
        """
        if pd.isna(text):
            return []
        # Pre-processing part: surface-level token filter.
        # `not` is required here: `~` on a bool yields -1/-2, both truthy,
        # which silently disables the stop-word check.
        candidates = [str(token).lower()
                      for token in self.__nlp.make_doc(text)
                      if (token and token.is_alpha and len(str(token.text)) > 2
                          and not token.is_stop)]
        # Processing part: lemmatize and filter by word lists / grammemes.
        clean_text = []
        for word in candidates:
            parsed = self.__morph.parse(word)[0]  # first parse variant only
            lemma = parsed.normal_form
            if lemma in self.__blocked_lemmas:
                continue
            if any(gram in parsed.tag for gram in self.stopgrams):
                continue
            clean_text.append(lemma)
        return clean_text
    

    def process_texts(self, texts: List[str]) -> List[List[str]]:
        """ 
        Process a list of texts and return a list of lemma lists, in input order.

        Uses a process pool when the instance was created with
        `multipocess=True`, a plain loop otherwise; both show a progress bar.
        """
        if self.__multipocess:
            with multiprocessing.Pool(self.__num_processors) as pool:
                processed_texts = list(tqdm_notebook(pool.imap(self._process_text, texts, 
                                                               chunksize=self.__chunksize), 
                                                     total=len(texts)))
            return processed_texts
        else:
            return [self._process_text(text) for text in tqdm_notebook(texts)]
        


    def process(self, data: pd.DataFrame,
                features_cols: Optional[List[str]] = None, copy: bool=True) -> pd.DataFrame:
        """
        Preprocess text columns for language modelling.
         - clean introduction words, numbers and small prefixes;
         - tokenize and lemmatize texts;

        :param data: frame holding the text columns; it is modified in place.
        :param features_cols: columns to process, `text_features` if empty/None.
        :param copy: if True the result goes to a new '<column>_proc' column,
                    if False the source column is overwritten.
        :return: the same `data` object that was passed in.
        """
        logging.info("Text processing started.")
        if not features_cols:
            features_cols = self.text_features

        for col_name in features_cols:
            logging.info(f"Processing '{col_name}' column...")
            # NaN cells become "" so that every row reaches the tokenizer as str.
            data_processed = self.process_texts(data[col_name].fillna("").to_list())
            if not copy:
                data[col_name] = data_processed
            else:
                data[col_name + "_proc"] = data_processed

        logging.info("Text preprocessing finished.")
        return data

In [12]:
# Single-process run: the pool settings are stored but only used
# when multipocess=True.
processor = DataPreprocessorLemmatizer(
    multipocess=False,
    num_processors=8,
    chunksize=200,
    intro_words_path=str(LISTS_DIR),
)

2021-07-16 20:25:39,195 - pymorphy2.opencorpora_dict.wrapper - INFO - Loading dictionaries from C:\Users\airen\Anaconda3\envs\pycharmenv\lib\site-packages\pymorphy2_dicts\data
2021-07-16 20:25:39,239 - pymorphy2.opencorpora_dict.wrapper - INFO - format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


In [13]:
%%time
# `copy` defaults to True, so the lemma lists land in a new 'msg_proc' column;
# `process` writes into the frame it receives and returns that same object.
df = processor.process(data=df, features_cols=['msg'])

2021-07-16 20:25:42,439 - root - INFO - Text processing started.
2021-07-16 20:25:42,440 - root - INFO - Processing 'msg' column...


HBox(children=(FloatProgress(value=0.0, max=13144.0), HTML(value='')))

2021-07-16 20:31:09,234 - root - INFO - Text preprocessing finished.



Wall time: 5min 26s


In [14]:
# Print the raw message followed by its processed token list for the first rows.
preview = df[['msg', 'msg_proc']].head()
for _idx, row in preview.iterrows():
    print("--------------")
    print(f"{row['msg']}\n{row['msg_proc']}")

--------------
здравствуйте здравствуйте я застрахованное лицо я хотела бы узнать договор полиса и дата оплаты страховой премии мне по поводу кража у меня украли  #ЧИСЛО  тысяч мошенники я как бы чтоб страховой написать заявление то есть подать на страховой случай я правильно понимаю да да да мне должны указать договор полис я по телефону это все как то оплачивала договор вам оплатить нужно правильно понимаю нет у меня мошенники сняли  #ЧИСЛО  тысяч и как бы я хочу в страховую службу написать заявление мне нужно указать договор полис и дата оплаты страховой премии когда оплачивалась то есть вам нужен номер договора да номер договора мы не сообщаем как же мне заявление для по поводу кражи вы сказали вам звонить в сбербанк щас сижу так одна минута деньги  #ЧИСЛО  когда позвонит еще послан от обратиться этот номер договора номер договора говорит не сообщаем а потом и одну минуточку подождите не сообщаем а потом что мне делать и для документации не ладно сейчас минуточку подождите сейчас в

In [None]:
# Module-level pymorphy2 analyzer, separate from the instance held inside `processor`.
morph = pymorphy2.MorphAnalyzer()

In [16]:
class Bigrammer:
    """
    Wrapper around a gensim `Phrases` model: train or load it, then apply
    it to tokenized texts.
    """

    def __init__(self, phrase_model=None):
        # Stays None until `train()` is called, unless a model is passed in.
        self.__phrase_model = phrase_model

    def train(self, texts: List[List[str]],
              min_count: int, threshold: float,
              to_save: bool, save_path: str, phrases_fn: str):
        """
        Fit a new `Phrases` model with NPMI scoring and keep it on the instance.

        :param texts: tokenized texts used for fitting.
        :param min_count: forwarded to `Phrases`.
        :param threshold: forwarded to `Phrases`.
        :param to_save: when True the fitted model is written to disk.
        :param save_path: target directory (used only when saving).
        :param phrases_fn: target file name (used only when saving).
        """
        model = Phrases(texts, min_count=min_count,
                        threshold=threshold, scoring='npmi')
        self.__phrase_model = model
        if not to_save:
            return
        target = os.path.join(save_path, phrases_fn)
        model.save(target)

    @classmethod
    def load(cls, save_path: str, phrases_fn: str) -> object:
        """
        Build an instance around a model previously saved to
        `save_path`/`phrases_fn`.
        """
        source = os.path.join(save_path, phrases_fn)
        return cls(phrase_model=Phrases.load(source))

    def create_bigramms(self, texts: List[List[str]], max_len: int=150) -> List[List[str]]:
        """
        Apply the phrase model to every already-tokenized text.

        NOTE(review): an empty text is not passed to the model; its slot is
        filled with a zero vector of length `max_len` (a numpy array, not a
        list of str) — confirm that downstream code expects this.
        """
        result = []
        for text in texts:
            if len(text) > 0:
                result.append(self.__phrase_model[text])
            else:
                result.append(np.zeros((max_len,)))
        return result

    def get_vocab(self):
        """Return the `vocab` attribute of the wrapped model."""
        model = self.__phrase_model
        return model.vocab

    def get_phraser(self):
        """Return the wrapped model itself (None if nothing was trained/loaded)."""
        return self.__phrase_model

## TextRank

In [10]:
import networkx as nx

In [None]:
class TextRanker:
    """
    TextRank for keyword extraction.
    This model builds a graph that represents the text. A graph based ranking
    algorithm is then applied to extract the lexical units (here the words) that
    are most important in the text.
    In this implementation, 
     - nodes - are words of certain part-of-speech (nouns/adjectives/..) 
     - edges - represent co-occurrence relation, controlled by the distance 
               between word occurrences - a window of N words). 

    Only the graph construction is implemented here; no ranking is run yet.
    """

    # Grammemes accepted as graph nodes when POS filtering is on and the
    # caller gives no list of their own.
    # From pymorphy2 available tags, ref: http://opencorpora.org/dict.php?act=gram
    _DEFAULT_POS = ('NOUN', 'ADJS', 'ADJF', 'COMP', 'VERB', 'INFN',
                    'PRTF', 'PRTS', 'GRND', 'NUMR', 'ADVB', 'Abbr')

    def __init__(self, ):
        # Russian language parser
        self.__morph = pymorphy2.MorphAnalyzer()
        # Words graph
        self.__graph = nx.Graph()
        # Tokenized texts: the outer list is the collection, each inner list one text.
        self.__texts = []
        # Settings of the last `candidate_weighting()` call; initialised here so
        # the attributes always exist.
        self.__window = 2
        self.__include_bigramms = True
        self.__pos_list = None
        
    def candidate_weighting(self, texts: List[List[str]],
                            window: int=2, pos_list: List[str]=None,
                            include_bigramms: bool=True,
                            top_percent: float=None):
        """
        Store the texts and settings, then (re)build the word graph.

        :param texts - tokenized texts, one list of tokens per text.
        :param window - the window for connecting words in the graph; a word is
                    linked to the `window - 1` words that follow it.
        :param pos_list - pymorphy2 grammemes a word must carry to become a
                    node. Used only when `include_bigramms` is False; defaults
                    to `_DEFAULT_POS` in that case.
        :param include_bigramms - when True no POS filtering is done and every
                    token becomes a node (`pos_list` is ignored).
        :param top_percent - accepted for interface compatibility; not used by
                    the current implementation.
        """
        self.__texts = texts
        self.__window = window
        self.__include_bigramms = include_bigramms
        # Plain `not` / `is None` tests: `~` on a bool is always truthy.
        if include_bigramms:
            self.__pos_list = None
        elif pos_list is None:
            self.__pos_list = list(self._DEFAULT_POS)
        else:
            # A caller-supplied list must be kept, not overwritten with None.
            self.__pos_list = list(pos_list)
            
        self.__build_word_graph()
        
    def __tag_words(self, texts):
        """
        Attach a pymorphy2 tag (first parse variant) to every token.
        Returns one {'tokens': [...], 'pos': [...]} dict per text.
        """
        return [{'tokens': tokens, 
                 'pos': [self.__morph.parse(str(token).lower())[0].tag 
                         for token in tokens]} 
                for tokens in tqdm_notebook(texts)]
        
            
            
    def __build_word_graph(self):
        """
        Build a graph representation of the texts in which nodes/vertices
        are words and edges represent co-occurrence relation. Syntactic filters
        can be applied to select only words of certain Part-of-Speech.
        Co-occurrence relations can be controlled using the distance between
        word occurrences in a text.

        The graph is rebuilt from scratch on every call, and the window never
        spans two different texts.
        """
        self.__graph = nx.Graph()
        window = self.__window

        # Turn every text into a sequence of (word, pass_syntactic_filter) tuples.
        if self.__pos_list is not None and not self.__include_bigramms:
            allowed = frozenset(self.__pos_list)
            # `tag.grammemes` is a plain frozenset, so an unknown grammeme in
            # `allowed` simply never matches instead of raising.
            sequences = [
                [(word, not allowed.isdisjoint(tag.grammemes))
                 for word, tag in zip(tagged['tokens'], tagged['pos'])]
                for tagged in self.__tag_words(self.__texts)
            ]
        else:
            sequences = [[(word, True) for word in tokens]
                         for tokens in self.__texts]

        for text in sequences:
            # add nodes to the graph
            self.__graph.add_nodes_from(word for word, valid in text if valid)

            # add edges to the graph
            for i, (node1, is_in_graph1) in enumerate(text):

                # speed up things
                if not is_in_graph1:
                    continue

                for j in range(i + 1, min(i + window, len(text))):
                    node2, is_in_graph2 = text[j]
                    if is_in_graph2 and node1 != node2:
                        self.__graph.add_edge(node1, node2)

    def get_graph(self):
        """
        Give access to the word graph built by the last `candidate_weighting()` call.
        """
        return self.__graph

In [None]:
# NOTE(review): looks like a scratch check of what the analyzer returns for an
# empty string — confirm whether this cell is still needed.
morph = pymorphy2.MorphAnalyzer()
morph.parse("")