In [1]:
import json
import pymorphy2
analyzer = pymorphy2.MorphAnalyzer()

In [2]:
with open("galaxy_words.json", encoding='utf-8') as f:
    dict_w = json.load(f)

Создадим словарь с токенами и их частями речи и запишем его в файл json

In [3]:
words_speech = {}
for num in range(len(dict_w)):
    lst = []
    for word in dict_w[str(num)]['words']:
        if analyzer.parse(word)[0].tag.POS is not None:
            lst.append((word, analyzer.parse(word)[0].tag.POS))
            words_speech[str(num)] = {'words_and_pos': lst}

In [4]:
with open('words_with_pos.json', 'w', encoding='utf-8') as file:
    json.dump(words_speech, file, indent=4, ensure_ascii=False)

Посчитаем средние доли для разных частей речи в разных документах

In [5]:
from collections import Counter

Создадим словарь с общим количеством каждой части речи для каждого документа

In [6]:
speech = {}
for i in list(words_speech.keys()):
    list_of_w = words_speech[i]['words_and_pos']
    list_of_speech = []
    for j in range(len(list_of_w)):
        if list_of_w[j][1] is not None:
            list_of_speech.append(list_of_w[j][1])
    speech[i] = dict(Counter(list_of_speech))

In [7]:
speech['0']

{'NOUN': 79,
 'ADJF': 36,
 'PRTS': 3,
 'CONJ': 1,
 'ADVB': 3,
 'VERB': 13,
 'PREP': 2,
 'PRCL': 2,
 'GRND': 1,
 'PRTF': 1}

Создадим датасет со средними долями каждой части речи в каждом документе

In [8]:
sp = []
for i in list(speech.keys()):
    for j in speech[str(i)].keys():
        sp.append(j)

In [9]:
unique_speech = list(set(sp))
print(unique_speech)

['PREP', 'COMP', 'ADVB', 'GRND', 'NOUN', 'NUMR', 'ADJS', 'PRED', 'VERB', 'ADJF', 'INTJ', 'PRTF', 'PRCL', 'NPRO', 'PRTS', 'INFN', 'CONJ']


In [10]:
data = {'NOUN': [], 'PRTF': [], 'PRCL': [], 'NRPO': [],
             'PRED': [], 'PRTS': [],'ADJS': [], 'INTJ': [],
             'PREP': [], 'GRND': [], 'NUMR': [], 'CONJ': [], 
            'ADVB': [], 'VERB': [], 'INFN': [], 'ADJF': [], 'COMP': []}

In [11]:
import pandas as pd
data_df = pd.DataFrame(data)

In [12]:
all = {}
for i in list(speech.keys()):
    count = 0
    for j in list(speech[str(i)].keys()):
        count += speech[str(i)][str(j)]
    new_row = {}
    for j in list(speech[str(i)].keys()):
        mean = speech[str(i)][str(j)] / count
        new_row.update({str(j): round(mean,2)})
    data_df = data_df.append(new_row, ignore_index=True)

In [13]:
data_df.fillna(0, inplace=True)

In [14]:
data_df.head()

Unnamed: 0,NOUN,PRTF,PRCL,NRPO,PRED,PRTS,ADJS,INTJ,PREP,GRND,NUMR,CONJ,ADVB,VERB,INFN,ADJF,COMP,NPRO
0,0.56,0.01,0.01,0.0,0.0,0.02,0.0,0.0,0.01,0.01,0.0,0.01,0.02,0.09,0.0,0.26,0.0,0.0
1,0.62,0.01,0.0,0.0,0.0,0.03,0.01,0.0,0.01,0.0,0.0,0.03,0.05,0.08,0.0,0.16,0.0,0.0
2,0.65,0.02,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.21,0.0,0.0
3,0.75,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.15,0.0,0.01
4,0.57,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.0,0.14,0.0,0.0


Найдем нетипичные документы

In [18]:
import numpy as np
unusual_doc = {}
for name_of_sp in data_df.columns:
    mean = data_df[str(name_of_sp)].mean()
    std = np.std(data_df[str(name_of_sp)])
    dif_more = mean + 2 * std
    dif_less = mean - 2 * std
    data = data_df[(data_df[str(name_of_sp)] > dif_more ) | (data_df[str(name_of_sp)] < dif_less)]
    lst = list(data.index)
    unusual_doc[name_of_sp] = lst
    print('Номера нетипичных документов для части речи {}: {}'.format(str(name_of_sp), list(data.index)))

Номера нетипичных документов для части речи NOUN: [106, 107, 116, 180, 188, 215, 239, 246, 253, 254, 255]
Номера нетипичных документов для части речи PRTF: [4, 10, 32, 94, 173, 194, 197, 201, 228]
Номера нетипичных документов для части речи PRCL: [85, 181, 182, 227, 228, 241]
Номера нетипичных документов для части речи NRPO: []
Номера нетипичных документов для части речи PRED: [21, 38, 87, 88, 100, 163, 214, 256]
Номера нетипичных документов для части речи PRTS: [34, 35, 142, 181, 182, 215, 260]
Номера нетипичных документов для части речи ADJS: [37, 66, 89, 125, 137, 149, 162, 201, 222]
Номера нетипичных документов для части речи INTJ: [96, 169, 191]
Номера нетипичных документов для части речи PREP: [13, 90, 182, 215, 226]
Номера нетипичных документов для части речи GRND: [56, 68, 71, 103, 166, 193, 197, 207, 208, 220, 251, 252, 259]
Номера нетипичных документов для части речи NUMR: [24, 75, 87, 89, 98, 126, 145, 179, 202, 253, 254]
Номера нетипичных документов для части речи CONJ: [1,

Найдем самые нетипичные документы (которые встречаются сразу в нескольких частях речи)

In [41]:
from collections import Counter
list_of_all_docs = []
most_common = []
for docs in list(unusual_doc.keys()):
    for i in unusual_doc[docs]:
        list_of_all_docs.append(i)
c = Counter(list_of_all_docs)
for d in list(c.keys()):
    if c[d] > 1:
        most_common.append(d)

In [43]:
most_common

[116,
 180,
 215,
 239,
 253,
 254,
 4,
 10,
 32,
 94,
 173,
 197,
 201,
 228,
 181,
 182,
 87,
 163,
 34,
 35,
 142,
 89,
 125,
 222,
 13,
 90,
 166,
 193,
 207,
 98,
 152,
 160,
 216,
 49]