## Environment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Basic
import os
import re
import sys
import glob
import numpy as np
import pandas as pd

import itertools
import more_itertools
from pathlib import Path
from pprint import pprint
from collections import defaultdict, Counter
from typing import (List, Dict, Any, NoReturn, 
                    Tuple, Optional, Union)
from tqdm import tqdm_notebook

import multiprocessing
from multiprocessing_logging import install_mp_handler

import logging
logging.basicConfig(level=logging.DEBUG,
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

import warnings
warnings.filterwarnings('ignore')

print(f"Number CPU: {multiprocessing.cpu_count()}")

Number CPU: 12


In [3]:
# W2V
import spacy
import pymorphy2
from gensim.utils import any2utf8, to_utf8
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases
from nltk.stem.snowball import SnowballStemmer
from gensim.models.phrases import Phrases, npmi_scorer
from gensim.models import word2vec, keyedvectors

### Define paths

In [4]:
BASE_DIR = Path("..")
DATA_DIR  = BASE_DIR / "data"
LISTS_DIR  = BASE_DIR / "lists"
MODEL_DIR  = BASE_DIR / "models"

## Load data

In [5]:
df = []
for fn in tqdm_notebook(glob.glob(str(DATA_DIR / "*.csv"))):
    df_part = pd.read_csv(fn, sep=';', encoding='utf-8')
    print(f"Data part of shape: {df_part.shape}")
    df.append(df_part)
df = pd.concat(df).reset_index(drop=True)
print(f"All data of shape: {df.shape}")
df.head()

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Data part of shape: (3944, 10)
Data part of shape: (9200, 10)

All data of shape: (13144, 10)


Unnamed: 0,incident,req_reason,req_reg_datetime,req_num,msg,product,subproduct,subject,s_subject,day_of_the_year
0,OTHER,Инцидент,2020-11-05 11:27:21,2011050726933001,здравствуйте здравствуйте я застрахованное лиц...,Страхование,.Страховой случай,Консультация по продуктам и обслуживанию,"Разъяснения (условия, сроки, статусы рассмотре...",[310]
1,OTHER,Инцидент,2020-11-19 15:27:04,2011190004665301,Куда #ТОПОНИМ мой аватар Ну если не понял то...,Онлайн-сервисы,Функционирование МБ/СБОЛ/МП,Работа в системе,Информация по возможностям/ограничениям,[324]
2,OTHER,Инцидент,2020-11-20 13:16:35,2011200083620001,меня зовут #ФИО здравствуйте здравствуйте у ...,Физ. лица - иные услуги/продукты,Кредитная история (БКИ),Вопросы по отчету,Отчет не поступил/поступил с ошибкой,[325]
3,OTHER,Инцидент,2020-11-21 19:29:17,2011210190303001,#ФИО здравствуйте добрый день я вас слушаю я...,Онлайн-сервисы,Функционирование МБ/СБОЛ/МП,Работа в системе,Информация по возможностям/ограничениям,[326]
4,IM0104549077,Инцидент,2020-11-20 13:40:55,2011200086706501,меня зовут #ФИО здравствуйте #ФИО добрый д...,Онлайн-сервисы,Функционирование МБ/СБОЛ/МП,Работа в системе,Некорректная работа Мобильного Приложения СБОЛ,[325]


## Configure preprocessing

In [6]:
def file_opener(filename: str) -> str:
    with open(filename, 'rt', encoding='utf-8-sig') as src:
        file = src.read()
    return "|".join([x for x in file.split('\n') if x])


class DataPreprocessorLemmatizer:

    text_features = ['msg']
    stopgrams = [
             'CONJ',   # союз
             'PRCL',   # частица
             'PRED',   # предикатив
             'NPRO',   # местоимение-сущ.
             'INTJ',   # междометие
             'Erro',   # ошибка
             'Dist',   # искажение
             'Ques',   # вопросительное слово
             'Dmns',   # указательное слово
             'Prnt'   # вводное слово
            ]

    def __init__(self, multipocess: bool, num_processors: int=16, chunksize: int=100, 
                intro_words_path: str = './lists'):
        # Language parsers
        self.__morph = pymorphy2.MorphAnalyzer()
        self.__nlp = spacy.blank('ru')
        # Multiprocessing params
        self.__multipocess = multipocess
        self.__num_processors = num_processors
        self.__chunksize = chunksize
        # Cleaning utils
        self.__intro_words = file_opener(os.path.join(intro_words_path, 'intro.txt'))
        self.__nltk_stopwords = file_opener(os.path.join(intro_words_path, 'NLTK_stopwords.txt')).split("|")

    def get_stopwords(self):
        """ 
        Check intro-words list.
        """
        return self.__nltk_stopwords

    def get_intro_words(self):
        """ 
        Check intro-words list.
        """
        return self.__intro_words
    
    def get_analyzer(self):
        """
        Allow to access to Pymorphy Analyzer instance.
        """
        return self.__morph

    
    def _process_text(self, text: str):
        """ 
        Process single text and return list of tokens.
        """
        if pd.isna(text):
            return []
       # Pre-processing part 
        text = [str(token).lower()
                for token in self.__nlp.make_doc(text)
                if (token and token.is_alpha and len(str(token.text)) > 2 and ~token.is_stop)]
        # Processing part
        clean_text = []
        for token in text:
            token = self.__morph.parse(str(token).lower())[0]
            if ((token.normal_form not in self.__nltk_stopwords) 
                and (token.normal_form not in self.__intro_words)
                and all([tag not in token.tag for tag in self.stopgrams])):
                clean_text.append(token.normal_form)
        return clean_text
    

    def process_texts(self, texts: List[str]):
        """ 
        Process list of texts and return list of lists of tokens.
        """
        if self.__multipocess:
            with multiprocessing.Pool(self.__num_processors) as pool:
                processed_texts = list(tqdm_notebook(pool.imap(self._process_text, texts, 
                                                               chunksize=self.__chunksize), 
                                                     total=len(texts)))
            return processed_texts
        else:
            return [self._process_text(text) for text in tqdm_notebook(texts)]
        


    def process(self, data: pd.DataFrame,
                features_cols: Optional[List[str]] = None, copy: bool=True) -> pd.DataFrame:
        """
        Preprocess text for language modelling.
         - clean introduction words, numbers and small prefixes;
         - tokenize and lemmatize texts;
        """
        logging.info("Text processing started.")
        if not features_cols:
            features_cols = self.text_features

        for col_name in features_cols:
            logging.info(f"Processing '{col_name}' column...")
            data_processed = self.process_texts(data[col_name].fillna("").to_list())
            if not copy:
                data[col_name] = data_processed
            else:
                data[col_name + "_proc"] = data_processed

        logging.info("Text preprocessing finished.")
        return data

In [7]:
processor = DataPreprocessorLemmatizer(multipocess=False,
                                       num_processors=8, chunksize=200,
                                       intro_words_path=str(LISTS_DIR))

2021-07-19 20:26:54,013 - pymorphy2.opencorpora_dict.wrapper - INFO - Loading dictionaries from C:\Users\airen\Anaconda3\envs\pycharmenv\lib\site-packages\pymorphy2_dicts\data
2021-07-19 20:26:54,082 - pymorphy2.opencorpora_dict.wrapper - INFO - format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
2021-07-19 20:26:54,101 - pymorphy2.opencorpora_dict.wrapper - INFO - Loading dictionaries from C:\Users\airen\Anaconda3\envs\pycharmenv\lib\site-packages\pymorphy2_dicts\data
2021-07-19 20:26:54,141 - pymorphy2.opencorpora_dict.wrapper - INFO - format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


In [8]:
%%time
df = processor.process(data=df, features_cols=['msg'])

2021-07-19 20:26:54,365 - root - INFO - Text processing started.
2021-07-19 20:26:54,366 - root - INFO - Processing 'msg' column...


HBox(children=(FloatProgress(value=0.0, max=13144.0), HTML(value='')))

2021-07-19 20:33:07,709 - root - INFO - Text preprocessing finished.



Wall time: 6min 13s


In [9]:
for i, row in df[['msg', 'msg_proc']].head().iterrows():
    print("--------------")
    print(f"{row['msg']}\n{row['msg_proc']}")

--------------
здравствуйте здравствуйте я застрахованное лицо я хотела бы узнать договор полиса и дата оплаты страховой премии мне по поводу кража у меня украли  #ЧИСЛО  тысяч мошенники я как бы чтоб страховой написать заявление то есть подать на страховой случай я правильно понимаю да да да мне должны указать договор полис я по телефону это все как то оплачивала договор вам оплатить нужно правильно понимаю нет у меня мошенники сняли  #ЧИСЛО  тысяч и как бы я хочу в страховую службу написать заявление мне нужно указать договор полис и дата оплаты страховой премии когда оплачивалась то есть вам нужен номер договора да номер договора мы не сообщаем как же мне заявление для по поводу кражи вы сказали вам звонить в сбербанк щас сижу так одна минута деньги  #ЧИСЛО  когда позвонит еще послан от обратиться этот номер договора номер договора говорит не сообщаем а потом и одну минуточку подождите не сообщаем а потом что мне делать и для документации не ладно сейчас минуточку подождите сейчас в

In [10]:
class Bigrammer:
    
    def __init__(self, phrase_model=None):
        self.__phrase_model = phrase_model
    
    def train(self, texts: List[List[str]], 
              min_count: int, threshold: float,
             to_save: bool, save_path: str=".", phrases_fn: str="phrases.pkl"):
        """
        Train gensim Phrases model with NPMI scorer.
        :param texts - The training corpus must be a sequence of sentences,
                        with each sentence a list of tokens.
        :param min_count – Ignore all words and bigrams with total 
                            collected count lower than this value.
        :param threshold – Represent a score threshold for forming 
                            the phrases (higher means fewer phrases). 
                            A phrase of words a followed by b is accepted if the score of 
                            the phrase is greater than threshold. 
                            For NPMI scorer is in the range -1 to 1.
        """
        logging.info("Training bigrammer started.")
        self.__phrase_model = Phrases(texts, min_count=min_count, 
                               threshold=threshold, scoring='npmi')
        logging.info("Training bigrammer finished.")
        if to_save:
            self.__phrase_model.save(os.path.join(save_path, phrases_fn))
            if os.path.isfile(os.path.join(save_path, phrases_fn)):
                logging.info(f"Bigrammer model successfully saved to: {os.path.join(save_path, phrases_fn)}")
        return self
    
    @classmethod
    def load(cls, save_path: str, phrases_fn: str) -> object:
        """
        Load pre-trained model from file and init.
        """
        if os.path.isfile(os.path.join(save_path, phrases_fn)):
            logging.info(f"Bigrammer model loading from: {os.path.join(save_path, phrases_fn)}")
        phrase_model = Phrases.load(os.path.join(save_path, phrases_fn))
        logging.info(f"Bigrammer model successfully loaded.")
        return cls(phrase_model=phrase_model)
    
    
    def create_bigramms(self, texts: List[List[str]]) -> List[List[str]]:
        """
        Create bi-gramms from given text data, already splitted.
        """
        return [self.__phrase_model[text] if len(text) > 0 
                else [] for text in tqdm_notebook(texts)]
    
    
    def process(self, data: pd.DataFrame,
                text_col: str, copy: bool=True) -> pd.DataFrame:
        """
        Create bi-gramms from given column in dataframe.
        """
        logging.info(f"Bigramms creation for texts in column {text_col} started")
        data_processed = self.create_bigramms(data[text_col].fillna("").to_list())
        if not copy:
            data[text_col] = data_processed
        else:
            data[text_col + "_bigramms"] = data_processed

        logging.info("Bigramms creation finished.")
        return data
    
    
    def get_vocab(self) -> Dict[bytes, int]:
        logging.info(f"Bigrammer vocab size: {len(self.__phrase_model.vocab)}")
        return self.__phrase_model.vocab
    
    
    def get_phraser(self) -> Phrases:
        return self.__phrase_model

In [11]:
bigrammer = Bigrammer().train(df['msg_proc'].to_list(),
                             min_count=5, threshold=0.3,
                             to_save=False)
bi_vocab = bigrammer.get_vocab()

2021-07-19 20:33:08,169 - root - INFO - Training bigrammer started.
2021-07-19 20:33:08,174 - gensim.models.phrases - INFO - collecting all words and their counts
2021-07-19 20:33:08,174 - gensim.models.phrases - INFO - PROGRESS: at sentence #0, processed 0 words and 0 word types
2021-07-19 20:33:09,567 - gensim.models.phrases - INFO - PROGRESS: at sentence #10000, processed 873920 words and 251328 word types
2021-07-19 20:33:10,005 - gensim.models.phrases - INFO - collected 303577 word types from a corpus of 1148128 words (unigram + bigrams) and 13144 sentences
2021-07-19 20:33:10,006 - gensim.models.phrases - INFO - using 303577 counts as vocab in Phrases<0 vocab, min_count=5, threshold=0.3, max_vocab_size=40000000>
2021-07-19 20:33:10,006 - root - INFO - Training bigrammer finished.
2021-07-19 20:33:10,007 - root - INFO - Bigrammer vocab size: 303577


In [12]:
bigramms = [k for k in bi_vocab.keys() if "_" in k.decode('utf-8')]
print(f"Bigramms in vocab: {len(bigramms)} from {len(bi_vocab)} which is {100*len(bigramms)/len(bi_vocab)}\n")

for i, (k, v) in enumerate(bi_vocab.items()):
    print(f"{k.decode('utf-8')} --> {v}")
    if i > 10:
        break

Bigramms in vocab: 288359 from 303577 which is 94.98710376609559

застраховать --> 135
лицо --> 733
застраховать_лицо --> 7
хотеть --> 7220
лицо_хотеть --> 4
узнать --> 2319
хотеть_узнать --> 1013
договор --> 681
узнать_договор --> 2
полис --> 516
договор_полис --> 4
дата --> 2175


In [13]:
%%time
df = bigrammer.process(df, text_col='msg_proc', copy=True)

2021-07-19 20:33:10,380 - root - INFO - Bigramms creation for texts in column msg_proc started


HBox(children=(FloatProgress(value=0.0, max=13144.0), HTML(value='')))

2021-07-19 20:33:14,485 - root - INFO - Bigramms creation finished.



Wall time: 4.11 s


In [14]:
for i, row in df[['msg', 'msg_proc_bigramms']].head().iterrows():
    print("--------------")
    print(f"{row['msg']}\n{row['msg_proc_bigramms']}")

--------------
здравствуйте здравствуйте я застрахованное лицо я хотела бы узнать договор полиса и дата оплаты страховой премии мне по поводу кража у меня украли  #ЧИСЛО  тысяч мошенники я как бы чтоб страховой написать заявление то есть подать на страховой случай я правильно понимаю да да да мне должны указать договор полис я по телефону это все как то оплачивала договор вам оплатить нужно правильно понимаю нет у меня мошенники сняли  #ЧИСЛО  тысяч и как бы я хочу в страховую службу написать заявление мне нужно указать договор полис и дата оплаты страховой премии когда оплачивалась то есть вам нужен номер договора да номер договора мы не сообщаем как же мне заявление для по поводу кражи вы сказали вам звонить в сбербанк щас сижу так одна минута деньги  #ЧИСЛО  когда позвонит еще послан от обратиться этот номер договора номер договора говорит не сообщаем а потом и одну минуточку подождите не сообщаем а потом что мне делать и для документации не ладно сейчас минуточку подождите сейчас в

## TextRank

In [16]:
import networkx as nx

In [29]:
morph = pymorphy2.MorphAnalyzer()
type(morph.parse("облако")[0].tag)

2021-07-19 21:13:36,417 - pymorphy2.opencorpora_dict.wrapper - INFO - Loading dictionaries from C:\Users\airen\Anaconda3\envs\pycharmenv\lib\site-packages\pymorphy2_dicts\data
2021-07-19 21:13:36,459 - pymorphy2.opencorpora_dict.wrapper - INFO - format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


pymorphy2.tagset.OpencorporaTag

In [30]:
class Candidate(object):
    """
    The keyphrase candidate data structure.
    """

    def __init__(self, token: str, 
                 is_bigramm: bool,
                 pos: pymorphy2.tagset.OpencorporaTag):

        self._token = token
        self._is_bigramm = is_bigramm
        self._lexical_forms = []  # .inflect('падеж')
        self._n_usages = []  # count usages or/and tfidf
        # the Part-Of-Speech
        self._pos = pos

In [None]:
class TextRanker:
    """
    TextRank for keyword extraction.
    This model builds a graph that represents the text. A graph based ranking
    algorithm is then applied to extract the lexical units (here the words) that
    are most important in the text.
    In this implementation, 
     - nodes - are words of certain part-of-speech (nouns/adjectives/..) 
     - edges - represent co-occurrence relation, controlled by the distance 
               between word occurrences - a window of N words). 
    """

    def __init__(self, ):
        # Russian language parser
        self.__morph = pymorphy2.MorphAnalyzer()
        # Words graph
        self.__graph = nx.Graph()
        self.__texts = []  # as List[List[Dict[str, Any]]] -> [[{'words': [], 'bigramm': [], 'POS': []}]]
        # Each inner Dict == single token
        # Each inner List == single text
        # Outer List is composition of texts
        
    def candidate_weighting(self, texts: List[List[str]],
                            window: int=2, pos_list: List[str]=None,
                            include_bigramms: bool=True,
                            top_percent: float=None):
        """
        Tailored candidate ranking method for TextRank. 
        Keyphrase candidates are either composed from the T-percent (top_percent) 
        highest-ranked words or extracted using the `candidate_selection()` method.
        Candidates are ranked using the sum of their words.
        :param window - the window for connecting words in the graph.
        :param pos_list - the set of valid pos for words to be considered as nodes
                    in the graph, defaults to ('NOUN', 'PROPN', 'ADJ').
        :param top_percent - percentage of top vertices to keep for phrase generation.
        """
        # flatten document as a sequence of (word, bigramm, pos) samples
        self.__texts = self.__tag_words(texts)
        self.__window = window
        self.__include_bigramms = include_bigramms
        if (pos_list is None) and ~include_bigramms:
            # From pymorphy2 avaliable POS tags
            # ref: http://opencorpora.org/dict.php?act=gram 
            self.__pos_list = ['NOUN', 'ADJS', 'ADJF', 'COMP', 'VERB', 'INFN', 
                               'PRTF', 'PRTS', 'GRND', 'NUMR', 'ADVB', 'Abbr']
        else:
            self.__pos_list = None
            
        self.__build_word_graph()
        
        # Computes the word scores using the unweighted PageRank formula
        # pagerank_scipy() is a SciPy sparse-matrix implementation of the power-method
        # Returns: pagerank – Dictionary of nodes with PageRank as value
        textranked = nx.pagerank_scipy(self.__graph, alpha=0.85, tol=0.0001, weight=None)
        
        # Generate the phrases from the T-percent top ranked words
        if top_percent is not None:

            # warn user as this is not the pke way of doing it
            logging.info(f"Candidates are generated using {top_percent}%-top")

            # computing the number of top keywords
            n_nodes = self.__graph.number_of_nodes()
            to_keep = min(int(n_nodes * top_percent), n_nodes)

            # Sorting the nodes by decreasing scores
            top_words = sorted(textranked, key=textranked.get, reverse=True)
    
    
    def __check_validness(self, token_dict: Dict[str, Any]) -> bool:
        return token_dict['bigramm'] or any([tag in token_dict['bigramm'] 
                                         for tag in self.__pos_list])
        
        
    def __tag_words(self, texts: List[List[str]]):
        """
        Process given texts to selected form: 
        [[{'words': [], 'bigramm': [], 'POS': [], 'valid'}]]
        """
        texts = [[{'token': token,
                  'bigramm': True if "_" in token else False,
                  'pos': self.__morph.parse(str(token).lower())[0].tag if "_" in token else None}
                 for token in tokens] 
                for tokens in tqdm_notebook(texts)]
        _ = [[token.update({'valid': self.__check_validness(token)})  
                 for token in tokens] for tokens in tqdm_notebook(texts)]
        return texts
            
            
    def __build_word_graph(self):
        """
        Build a graph representation of the document in which nodes/vertices
        are words and edges represent co-occurrence relation. Syntactic filters
        can be applied to select only words of certain Part-of-Speech.
        Co-occurrence relations can be controlled using the distance between
        word occurrences in the document.
        """
        tokens = itertools.chain.from_iterable(self.__texts)
        # add nodes to the graph
        logging.info(f"Adding nodes to graph...")
        self.__graph.add_nodes_from([token['token'] for token in tokens if token['valid']])

        # add edges to the graph
        logging.info(f"Adding edges...")
        for text_i, tokens in enumerate(self.__texts):
            for token_i, token in enumerate(tokens):
                # speed up things
                if not token['valid']:
                    continue
                start_ind = min(token_i, (self.__window - 1) // 2)
                end_ind = min(i + self.__window, len(tokens))
                for j in range(start_ind, end_ind):
                    linked_token = tokens[j]
                    if linked_token['valid'] and linked_token['token'] != token['token']:
                        self.__graph.add_edge(token['token'], linked_token['token'])
        logging.info(f"Adding edges ended.")

In [17]:
G = nx.gnp_random_graph(1000, 0.01, directed=True)
G

<networkx.classes.digraph.DiGraph at 0x1e243c06710>

In [23]:
%%time
w = nx.pagerank_scipy(G, tol=1e-10)
w

Wall time: 9.97 ms


{0: 0.0008596369609779237,
 1: 0.0013235735672713308,
 2: 0.0009984612162777218,
 3: 0.0006760379294151492,
 4: 0.0007348969309069707,
 5: 0.0010818718834806243,
 6: 0.0009037926430497324,
 7: 0.001033352754038597,
 8: 0.000681372255453699,
 9: 0.0010190225533670788,
 10: 0.0012055966220577948,
 11: 0.0013952564755086326,
 12: 0.0008730918079810032,
 13: 0.001186692327227081,
 14: 0.0014526663474476017,
 15: 0.0010080140692434896,
 16: 0.0006043108481673653,
 17: 0.0012799230815051,
 18: 0.0007510013276397307,
 19: 0.000415246633395767,
 20: 0.0006673154064051989,
 21: 0.0009693746369961698,
 22: 0.0009237599592792048,
 23: 0.0011877573955067129,
 24: 0.0007803475658349393,
 25: 0.0012286975898455122,
 26: 0.0011334374745267755,
 27: 0.0009150107319958528,
 28: 0.0012547797432441183,
 29: 0.001076909237058662,
 30: 0.0007140034900908927,
 31: 0.0013318407252283046,
 32: 0.0011584323087902252,
 33: 0.0008328396810830985,
 34: 0.0008019053986958709,
 35: 0.0012036098083129001,
 36: 0.001

In [24]:
top_words = sorted(w, key=w.get, reverse=True)
top_words

[858,
 642,
 543,
 519,
 510,
 60,
 620,
 353,
 587,
 178,
 306,
 921,
 163,
 500,
 989,
 475,
 940,
 312,
 130,
 797,
 349,
 405,
 726,
 871,
 777,
 702,
 149,
 821,
 151,
 448,
 657,
 258,
 369,
 640,
 69,
 227,
 668,
 976,
 351,
 653,
 73,
 313,
 191,
 789,
 844,
 122,
 795,
 634,
 646,
 669,
 616,
 591,
 502,
 845,
 333,
 672,
 118,
 157,
 425,
 574,
 99,
 712,
 972,
 272,
 75,
 14,
 389,
 265,
 832,
 929,
 814,
 674,
 83,
 88,
 63,
 486,
 52,
 780,
 983,
 662,
 92,
 54,
 499,
 916,
 198,
 11,
 761,
 827,
 597,
 399,
 160,
 259,
 162,
 179,
 437,
 880,
 964,
 115,
 529,
 740,
 370,
 865,
 754,
 831,
 476,
 64,
 590,
 738,
 183,
 708,
 450,
 757,
 593,
 941,
 433,
 857,
 436,
 66,
 483,
 808,
 783,
 690,
 262,
 146,
 104,
 469,
 285,
 956,
 44,
 579,
 428,
 320,
 31,
 106,
 798,
 45,
 819,
 59,
 97,
 575,
 735,
 876,
 1,
 240,
 567,
 525,
 303,
 121,
 464,
 505,
 892,
 145,
 407,
 660,
 986,
 881,
 268,
 58,
 658,
 292,
 722,
 755,
 614,
 231,
 728,
 453,
 860,
 807,
 820,
 17,
 905

In [27]:
{k: v for k, v in sorted(w.items(), key=lambda item: item[1], reverse=True)}

{858: 0.0020228268029577395,
 642: 0.0019707088479206224,
 543: 0.0019635792925777676,
 519: 0.0018531445978644515,
 510: 0.0018280287182492639,
 60: 0.0017967235736833112,
 620: 0.0017967016088628767,
 353: 0.0017788094513599814,
 587: 0.0017743065932120628,
 178: 0.0017712824104539116,
 306: 0.0017533802065621412,
 921: 0.0017459212953042493,
 163: 0.0017456721043217407,
 500: 0.001734803889615909,
 989: 0.0017285331477136633,
 475: 0.0017264744251185564,
 940: 0.0017090105266609433,
 312: 0.0016845134351731795,
 130: 0.0016829897508939496,
 797: 0.001681175702018977,
 349: 0.0016781683298716385,
 405: 0.0016684365445887644,
 726: 0.001668168375013124,
 871: 0.0016536798318014894,
 777: 0.0016529459462941707,
 702: 0.0016412515018291202,
 149: 0.0016373907825124622,
 821: 0.00163598329897522,
 151: 0.001630156518479332,
 448: 0.0016237138703653839,
 657: 0.0016181009597831296,
 258: 0.0016116292106432568,
 369: 0.0016061692728100683,
 640: 0.001569293391819715,
 69: 0.001565402103673