In [1]:
import json, os
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from pymorphy2 import MorphAnalyzer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
morph = MorphAnalyzer()
stops = set(stopwords.words('russian'))

In [2]:
pd.set_option('display.max_colwidth', 1000)

In [3]:
PATH_TO_DATA = '/Users/dariabakshandaeva/Documents/data'

In [4]:
files = [os.path.join(PATH_TO_DATA, file) for file in os.listdir(PATH_TO_DATA) if file.endswith('jsonlines')]
print(files)

['/Users/dariabakshandaeva/Documents/data/ng_1.jsonlines', '/Users/dariabakshandaeva/Documents/data/ng_0.jsonlines']


In [5]:
data = pd.concat([pd.read_json(file, lines=True) for file in files][:1], axis=0, ignore_index=True)

In [6]:
def evaluate(true_kws, predicted_kws):
    assert len(true_kws) == len(predicted_kws)
    
    precisions = []
    recalls = []
    f1s = []
    jaccards = []
    
    for i in range(len(true_kws)):
        true_kw = set(true_kws[i])
        predicted_kw = set(predicted_kws[i])
        
        tp = len(true_kw & predicted_kw)
        union = len(true_kw | predicted_kw)
        fp = len(predicted_kw - true_kw)
        fn = len(true_kw - predicted_kw)
        
        if (tp+fp) == 0:
            prec = 0
        else:
            prec = tp / (tp + fp)
        
        if (tp+fn) == 0:
            rec = 0
        else:
            rec = tp / (tp + fn)
        if (prec+rec) == 0:
            f1 = 0
        else:
            f1 = (2*(prec*rec))/(prec+rec)
            
        jac = tp / union
        
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        jaccards.append(jac)
    print('Precision - ', round(np.mean(precisions), 2))
    print('Recall - ', round(np.mean(recalls), 2))
    print('F1 - ', round(np.mean(f1s), 2))
    print('Jaccard - ', round(np.mean(jaccards), 2))

**1 решение: используем не word.normal_form, а задаем именительный падеж (word.inflect({'nomn'}).word)**

In [12]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0] for word in words if word and word not in stops]
    words = [word.inflect({'nomn'}).word for word in words if word.tag.POS == 'NOUN' or word.tag.POS == 'ADJF']

    return words

In [13]:
data['content_norm'] = data['content'].apply(normalize)

In [14]:
evaluate(data['keywords'], data['content_norm'].apply(lambda x: [x[0] for x in Counter(x).most_common(10)]))

Precision -  0.14
Recall -  0.27
F1 -  0.18
Jaccard -  0.1


**F1 -  0.18 - улучшение**

**2 решение: меняем только количество ключевых слов: Counter(x).most_common(6) вместо Counter(x).most_common(10)**

In [45]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0] for word in words if word and word not in stops]
    words = [word.normal_form for word in words if word.tag.POS == 'NOUN']

    return words

In [46]:
data['content_norm'] = data['content'].apply(normalize)

In [47]:
evaluate(data['keywords'], data['content_norm'].apply(lambda x: [x[0] for x in Counter(x).most_common(6)]))

Precision -  0.17
Recall -  0.2
F1 -  0.17
Jaccard -  0.1


**F1 -  0.17 - улучшение**

**3 решение: меняем только параметр min_df=5 на min_df=2 (то есть игнорируем слова, которые появились менее чем в 2 документах)**

In [19]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0] for word in words if word and word not in stops]
    words = [word.normal_form for word in words if word.tag.POS == 'NOUN']

    return words

In [20]:
data['content_norm'] = data['content'].apply(normalize)

In [21]:
data['content_norm_str'] = data['content_norm'].apply(' '.join)

In [39]:
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2)

In [40]:
tfidf.fit(data['content_norm_str'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [41]:
id2word = {i:word for i,word in enumerate(tfidf.get_feature_names())}

In [42]:
texts_vectors = tfidf.transform(data['content_norm_str'])

In [43]:
keywords = [[id2word[w] for w in top] for top in texts_vectors.toarray().argsort()[:,:-11:-1]]

In [44]:
evaluate(data['keywords'], keywords)

Precision -  0.13
Recall -  0.25
F1 -  0.17
Jaccard -  0.1


**F1 -  0.17 - улучшение**

**Комбинации этих изменений приводят к еще более заметным изменениями (F1 -  0.19)**

**4 решение: используем TermExtractor (неудачно, есть в тетрадке 3)**

**5 решение: используем Rake (неудачно, есть в тетрадке 3)**

**6 решение: только дополняем список стоп-слов**

In [48]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))
new_stops = ["год", "страна", "главный", "развитый", "март", "сегодня", "бумажный", "нынешний", "назад", "популярный"]
stops = stops.union(new_stops)


def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0] for word in words if word and word not in stops]
    words = [word.normal_form for word in words if word.tag.POS == 'NOUN']

    return words

In [49]:
data['n_content_norm'] = data['content'].apply(normalize)

In [50]:
evaluate(data['keywords'], data['n_content_norm'].apply(lambda x: [x[0] for x in Counter(x).most_common(10)]))

Precision -  0.13
Recall -  0.26
F1 -  0.17
Jaccard -  0.1


**F1 -  0.17 - улучшение!**

**Комбинация методов, опять же, приводит к F1 - 0.19**

**Итог: 4 решения ведут к улучшению, 2 неудачных решения (все эти решения есть в первых 3-х тетрадках)**