# Keywords search

In [1]:
import lxml
import xml.etree.ElementTree as ET
import string
import nltk
from nltk.tokenize import word_tokenize
import string
import pymorphy2
import stop_words
from stop_words import get_stop_words
import pandas as pd
import loglikelihood
import numpy

In [2]:
def get_corpus(file_name):
    
    tree = ET.parse(file_name)
    root = tree.getroot()
    
    texts = ''
    for corpus in root:
        for unit in corpus:
            if unit.tag == 'body':
                texts += unit.text
                
    return texts

In [3]:
def preprocessing(text):
    morph = pymorphy2.MorphAnalyzer()
    text = ''.join([i for i in text if i not in set(string.punctuation)])
    text = text.lower()
    words = word_tokenize(text)
    lemmas = []
    for word in words:
        lemmas.append(morph.parse(word)[0].normal_form)
    lemmas = [i for i in lemmas if i not in get_stop_words('ru')]
    return lemmas

Возьмем два корпуса:
    - RefC - тематический корпус
    - SpecC - контрастный корпус

И проведем некоторую предобработку:
    - удаление пунктуации
    - приведение к нижнему регистру
    - токенизация
    - лемматизация

In [4]:
RefC = get_corpus('reference_corpus.xml')
SpecC = get_corpus('corpus_pravoslavnyh_tekstov.xml')

In [5]:
RefC_lemm = preprocessing(RefC)
SpecC_lemm = preprocessing(SpecC)

In [6]:
lenRefC = len(RefC_lemm)
lenSpecC = len(SpecC_lemm)
lenC = lenRefC + lenSpecC
C_lemm = RefC_lemm + SpecC_lemm

Частотный словник (наблюдаемые статистики и ожидаемые частоты):

In [7]:
bow = set(C_lemm)
bow_Series = pd.Series(list(bow))

In [8]:
df = pd.DataFrame({'word' : bow_Series,
                             'word freq in SpecC' : bow_Series.apply(SpecC_lemm.count),
                             'word freq in RefC' : bow_Series.apply(RefC_lemm.count)})
df['word freq in C'] = df['word freq in RefC'] + df['word freq in SpecC']
df['not word freq in RefC'] = lenRefC - df['word freq in RefC']
df['not word freq in SpecC'] = lenSpecC - df['word freq in SpecC']
df['not word freq in C'] = lenC - df['word freq in C']
df['expected freq in SpecC'] = lenSpecC * (df['word freq in C'] / (lenC))
df['expected freq in RefC'] = lenRefC * (df['word freq in C'] / (lenC))

In [9]:
df.head()

Unnamed: 0,word,word freq in RefC,word freq in SpecC,word freq in C,not word freq in RefC,not word freq in SpecC,not word freq in C,expected freq in SpecC,expected freq in RefC
0,делаться,3,1,4,6058,5928,11986,1.977982,2.022018
1,двое,0,6,6,6061,5923,11984,2.966972,3.033028
2,настроенность,0,1,1,6061,5928,11989,0.494495,0.505505
3,советовать,2,0,2,6059,5929,11988,0.988991,1.011009
4,материя,0,2,2,6061,5927,11988,0.988991,1.011009


Применим статистику **Loglikelihood**.

#### $Loglikelihood = 2*((SpecC(w)*ln(\frac{SpecC(w)}{E_1})) + (RefC(w)*ln(\frac{RefC(w)}{E_2})))$

#### $E_i = \frac{N_i*\sum(O_i)}{\sum(N_i)}$

$w$ - слово  

$SpecC(w)$ –  частота слова в тематическом корпусе;    

$RefC(w)$ –  частота слова в контрастном корпусе;  

In [9]:
df['LogLikelihood'] = pd.Series([loglikelihood.llr(numpy.matrix([[df.loc[df['word'] == i, 'word freq in SpecC'].iloc[0], 
                                                                df.loc[df['word'] == i, 'word freq in RefC'].iloc[0]], 
                                                                [df.loc[df['word'] == i, 'not word freq in SpecC'].iloc[0], 
                                                                df.loc[df['word'] == i, 'not word freq in RefC'].iloc[0]]])) 
                                 for i in df['word']])

In [10]:
df_llh = df.sort_values(by=['LogLikelihood'], ascending=False)
df_llh = df_llh.reset_index(drop=True)

Топ-10 ключевых слов:

In [80]:
df_llh[:10]

Unnamed: 0,word,word freq in RefC,word freq in SpecC,word freq in C,not word freq in RefC,not word freq in SpecC,not word freq in C,expected freq in SpecC,expected freq in RefC,LogLikelihood,Weirdness
0,брак,1,137,138,6060,5792,11852,68.240367,69.759633,13.567193,140.050093
1,любовь,5,57,62,6056,5872,11928,30.658716,31.341284,7.250727,11.653803
2,муж,11,64,75,6050,5865,11915,37.087156,37.912844,6.546671,5.947715
3,бог,11,61,72,6050,5868,11918,35.60367,36.39633,6.291246,5.668916
4,ребёнок,22,82,104,6039,5847,11886,51.427523,52.572477,6.204635,3.810255
5,должный,9,53,62,6052,5876,11928,30.658716,31.341284,5.977486,6.019996
6,жена,27,73,100,6034,5856,11890,49.449541,50.550459,4.815526,2.763897
7,дух,2,22,24,6059,5907,11966,11.86789,12.13211,4.469961,11.244898
8,семейство,2,20,22,6059,5909,11968,10.878899,11.121101,4.185846,10.222635
9,супруг,2,20,22,6059,5909,11968,10.878899,11.121101,4.185846,10.222635


Топ-10 неключевых слов:

In [81]:
df_llh[-10:]

Unnamed: 0,word,word freq in RefC,word freq in SpecC,word freq in C,not word freq in RefC,not word freq in SpecC,not word freq in C,expected freq in SpecC,expected freq in RefC,LogLikelihood,Weirdness
3818,присутствие,14,1,15,6047,5928,11975,7.417431,7.582569,-3.629854,0.073019
3819,сделать,23,4,27,6038,5925,11963,13.351376,13.648624,-3.793678,0.177785
3820,дверь,15,1,16,6046,5928,11974,7.911927,8.088073,-3.79592,0.068151
3821,понимать,15,1,16,6046,5928,11974,7.911927,8.088073,-3.79592,0.068151
3822,ах,16,1,17,6045,5928,11973,8.406422,8.593578,-3.956095,0.063891
3823,место,17,1,18,6044,5928,11972,8.900917,9.099083,-4.110949,0.060133
3824,лицо,38,9,47,6023,5920,11943,23.241284,23.758716,-4.322456,0.242115
3825,глаз,22,2,24,6039,5927,11966,11.86789,12.13211,-4.370128,0.092933
3826,брат,34,4,38,6027,5925,11952,18.790826,19.209174,-5.150025,0.120266
3827,рука,35,3,38,6026,5926,11952,18.790826,19.209174,-5.574238,0.087623


Применим статистику **Weirdness**.

#### $Weirdness (w) =  \frac{SpecC(w)}{|SpeC|} * {\frac{|RefC|}{RefC(w)}}$

$w$ - слово  

$SpecC(w)$ –  частота слова в тематическом корпусе

$|SpecC|$ – объём тематического корпуса

$RefC(w)$ –  частота слова в контрастном корпусе

$|RefC|$ – объём контрастного корпуса

In [37]:
df['Weirdness'] = (df['word freq in SpecC'] / lenSpecC) / (df['word freq in RefC'] / lenRefC)

In [12]:
df_weird = df.sort_values(by=['Weirdness'], ascending=False)[df.Weirdness != numpy.inf]
df_weird = df_weird[df_weird.Weirdness != 0.0]
df_weird = df_weird.reset_index(drop=True)

  """Entry point for launching an IPython kernel.


In [83]:
df_weird[:10]

Unnamed: 0,word,word freq in RefC,word freq in SpecC,word freq in C,not word freq in RefC,not word freq in SpecC,not word freq in C,expected freq in SpecC,expected freq in RefC,LogLikelihood,Weirdness
0,брак,1,137,138,6060,5792,11852,68.240367,69.759633,13.567193,140.050093
1,семейный,1,14,15,6060,5915,11975,7.417431,7.582569,3.707972,14.311688
2,любовь,5,57,62,6056,5872,11928,30.658716,31.341284,7.250727,11.653803
3,сила,1,11,12,6060,5918,11978,5.933945,6.066055,3.159381,11.244898
4,дух,2,22,24,6059,5907,11966,11.86789,12.13211,4.469961,11.244898
5,супруг,2,20,22,6059,5909,11968,10.878899,11.121101,4.185846,10.222635
6,становиться,1,10,11,6060,5919,11979,5.43945,5.56055,2.958665,10.222635
7,господь,2,20,22,6059,5909,11968,10.878899,11.121101,4.185846,10.222635
8,цель,1,10,11,6060,5919,11979,5.43945,5.56055,2.958665,10.222635
9,семейство,2,20,22,6059,5909,11968,10.878899,11.121101,4.185846,10.222635


In [84]:
df_weird[-10:]

Unnamed: 0,word,word freq in RefC,word freq in SpecC,word freq in C,not word freq in RefC,not word freq in SpecC,not word freq in C,expected freq in SpecC,expected freq in RefC,LogLikelihood,Weirdness
531,брат,34,4,38,6027,5925,11952,18.790826,19.209174,-5.150025,0.120266
532,совершенно,9,1,10,6052,5928,11980,4.944954,5.055046,-2.681705,0.113585
533,глаз,22,2,24,6039,5927,11966,11.86789,12.13211,-4.370128,0.092933
534,рука,35,3,38,6026,5926,11952,18.790826,19.209174,-5.574238,0.087623
535,подумать,13,1,14,6048,5928,11976,6.922936,7.077064,-3.457222,0.078636
536,присутствие,14,1,15,6047,5928,11975,7.417431,7.582569,-3.629854,0.073019
537,понимать,15,1,16,6046,5928,11974,7.911927,8.088073,-3.79592,0.068151
538,дверь,15,1,16,6046,5928,11974,7.911927,8.088073,-3.79592,0.068151
539,ах,16,1,17,6045,5928,11973,8.406422,8.593578,-3.956095,0.063891
540,место,17,1,18,6044,5928,11972,8.900917,9.099083,-4.110949,0.060133


In [36]:
print(list(df_weird.word).index('сделать'))
print(list(df_weird.word).index('лицо'))

523
510
