<a href="https://colab.research.google.com/github/Gratisfo/Parentents-and-children/blob/main/topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd

In [5]:
data = pd.read_csv('data.csv')
del data['Unnamed: 0']
data = data.dropna().reset_index(drop=True)
data

Unnamed: 0,speaker,text,role,date,title
0,#taratora,"И ты еще, мошенник, смеешь мне указывать! Да з...",parent,1788,krylov-prokazniki
1,#taratora,То есть чего-нибудь хорошенького.,parent,1788,krylov-prokazniki
2,#taratora,"Как, бездельник! да разве не для того он сюда ...",parent,1788,krylov-prokazniki
3,#taratora,"Да его ль вина, бестия, что вы все здесь ходит...",parent,1788,krylov-prokazniki
4,#taratora,Я окончала!.. Как я рада! Элегия эта беспример...,parent,1788,krylov-prokazniki
...,...,...,...,...,...
20297,#sineus,"Ты, сказывают, хотел Славян и прочие народы от...",children,1786,ekaterina-vtoraja-iz-zhizni-rjurika
20298,#truvor,Какие же ты имел при том намерения?,children,1786,ekaterina-vtoraja-iz-zhizni-rjurika
20299,#oskold,"К чему прение тут, где дело само по себе ясно?",children,1786,ekaterina-vtoraja-iz-zhizni-rjurika
20300,#rjurik,"Бодрость духа твоего, князь Вадим, не унывает;...",parent,1786,ekaterina-vtoraja-iz-zhizni-rjurika


In [7]:
def prop_date(date):
  try:
    return int(date)
  except:
    return int(date[:4]) 

data['date'] = data['date'].apply(lambda x: prop_date(x))

# Preprocessing

In [None]:
!pip install razdel

In [None]:
!pip install pymorphy2[fast]

In [None]:
import re
import pymorphy2
import nltk
from razdel import sentenize, tokenize
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
morph = pymorphy2.MorphAnalyzer()

In [54]:
def preproc_text(text):
  # tokenize
  tokens = [_.text for _ in list(tokenize(text))]
  
  # lower register
  tokens = [w.lower() for w in tokens]
  
  # remove puntuation
  table = str.maketrans('', '', string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  
  # remove numbers, save only letters
  words = [word for word in stripped if word.isalpha()]
  
  # delete stop-words
  stop_words = stopwords.words('russian')
  stop_words.extend(['мой', 'твой', 'ваш', 'свой', 'такой', 'какой', 'наш', 'который'])
  stop_words = set(stop_words)
  words = [w for w in words if not w in stop_words]

  # to normal form
  allowed = ['NOUN', 'ADJF', 'ADJS', 'VERB', 'ADVB']
  lemmas = [morph.parse(w)[0].normal_form for w in words if morph.parse(w)[0].tag.POS in allowed] 
  
  return lemmas

In [55]:
data['clean_text'] = data['text'].apply(lambda x: preproc_text(x)) 

In [56]:
# Join replicas to text for each speaker
join_text = data.groupby(['speaker']).agg(list)
join_text['role'] = join_text['role'].apply(lambda x: x[0])
join_text['date'] = join_text['date'].apply(lambda x: x[0])
join_text['title'] = join_text['title'].apply(lambda x: x[0])
join_text

Unnamed: 0_level_0,text,role,date,title,clean_text
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
#Anja,"[Пройдемте здесь. Ты, мама, помнишь, какая это...",children,1903,chekhov-vishnevyi-sad,"[[пройти, мама, помнить, комната], [спасть, до..."
#Dashenka,[Они хочут свою образованность показать и всег...,children,1889,chekhov-svadba,"[[хотеть, образованность, говорить, непонятный..."
#FedorIvanovich,"[Здорово, ребята!, Жарко! Водки выпить надо., ...",children,1889,chekhov-leshii,"[[ребята], [жарко, водка], [юля], [бросить, юл..."
#GordejKarpych,"[Что распелись! Горланят, точно мужичье!, А эт...",parent,1853,ostrovsky-bednost-ne-porok,"[[распеться, горланить, точно, мужичий], [глуп..."
#Larisa,"[До свиданья, Вася!, Я сейчас все за Волгу смо...",children,1874,ostrovsky-bespridannitsa,"[[свидание, вася], [волга, смотреть, тот, стор..."
...,...,...,...,...,...
#vtoraja_knjazhna,"[Какие складочки!, Про это знает целый свет.]",children,1822,griboyedov-gore-ot-uma,"[[какой, складочка], [знать, целый, свет]]"
#zamir,"[Приехал мой отец! Какой мне день счастливый!,...",children,1784,knyazhnin-hvastun,"[[приехать, отец, день, счастливый], [клятвопр..."
#zasypkin,"[А уж как я рад, что вы приехали к нам, Анисья...",parent,1886,mamin-sibirjak-zolotopromyshlenniki,"[[рад, приехать, анисья, тихонович, рад, ангел..."
#zavloh,"[В каком ты станешь мя, Оснельда, виде зрети?!...",parent,1747,sumarokov-horev,"[[какой, стать, оснельда, вид, зрести], [будит..."


In [57]:
# split texts by period
data18 = join_text[(join_text['date'] > 1700) & (join_text['date'] < 1799)]
data19 = join_text[(join_text['date'] > 1800) & (join_text['date'] < 1899)]
data20 = join_text[join_text['date'] > 1900]

In [58]:
# split texts by role
parents_18 = data18[data18['role']=='parent'].clean_text.values
parents_18 =  [item for sublist in parents_18 for item in sublist]

children_18 = data18[data18['role']=='children'].clean_text.values
children_18 =  [item for sublist in children_18 for item in sublist]

parents_19 = data19[data19['role']=='parent'].clean_text.values
parents_19 =  [item for sublist in parents_19 for item in sublist]

children_19 = data19[data19['role']=='children'].clean_text.values
children_19 =  [item for sublist in children_19 for item in sublist]

parents_20 = data20[data20['role']=='parent'].clean_text.values
parents_20 =  [item for sublist in parents_20 for item in sublist]

children_20 = data20[data20['role']=='children'].clean_text.values
children_20 =  [item for sublist in children_20 for item in sublist]

# Topic Modeling

In [None]:
!pip install pyLDAvis

In [None]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models 
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [24]:
def builg_ngrams(data_words):
  bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
  trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
  bigram_mod = gensim.models.phrases.Phraser(bigram)
  trigram_mod = gensim.models.phrases.Phraser(trigram)
  return bigram_mod, trigram_mod

def make_bigrams(texts):
    bigram_mod, trigram_mod = builg_ngrams(texts)
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    bigram_mod, trigram_mod = builg_ngrams(texts)
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def topic_modeling(words):
    bigram, trigram = builg_ngrams(words)
    id2word = corpora.Dictionary(words)
    corpus = [id2word.doc2bow(text) for text in words]
    return corpus, id2word

#LDA model for parents' texts

## 18 centure

In [None]:
corpus, id2word = topic_modeling(parents_18)

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [82]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.083*"свой" + 0.049*"быть" + 0.047*"человек" + 0.035*"пойти" + 0.023*"дом" '
  '+ 0.017*"виноватый" + 0.015*"всякий" + 0.014*"надежда" + 0.014*"год" + '
  '0.014*"вон"'),
 (1,
  '0.148*"мой" + 0.072*"твой" + 0.028*"думать" + 0.027*"сударыня" + '
  '0.027*"сделать" + 0.025*"должный" + 0.020*"помиловать" + 0.014*"воля" + '
  '0.013*"жена" + 0.012*"злодей"'),
 (2,
  '0.109*"сударь" + 0.059*"мочь" + 0.055*"дело" + 0.053*"весь" + 0.030*"очень" '
  '+ 0.022*"радость" + 0.017*"изволить" + 0.015*"деньга" + 0.015*"ум" + '
  '0.013*"вид"'),
 (3,
  '0.039*"сказать" + 0.033*"взять" + 0.032*"сам" + 0.031*"стать" + '
  '0.030*"время" + 0.026*"иметь" + 0.023*"бояться" + 0.022*"казаться" + '
  '0.020*"сердце" + 0.020*"жизнь"'),
 (4,
  '0.049*"ваш" + 0.040*"право" + 0.030*"дать" + 0.030*"любить" + 0.027*"друг" '
  '+ 0.020*"оставить" + 0.016*"жена" + 0.015*"ведать" + 0.012*"будить" + '
  '0.012*"слушаться"'),
 (5,
  '0.063*"наш" + 0.045*"муж" + 0.043*"сей" + 0.036*"государь" + 0.029*"тот" + '


In [None]:
# Compute Perplexity
Perplexity = lda_model.log_perplexity(corpus)   # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=parents_18, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [84]:
print('\nPerplexity: ', Perplexity )
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.796771522685829

Coherence Score:  0.5528117558727152


In [85]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

## 19 centure

In [None]:
corpus, id2word = topic_modeling(parents_19)

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [88]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.051*"один" + 0.022*"брать" + 0.021*"чепрак" + 0.020*"кондратьич" + '
  '0.020*"тихон" + 0.017*"домой" + 0.013*"беда" + 0.013*"выйти" + '
  '0.011*"простить" + 0.011*"дама"'),
 (1,
  '0.070*"сказать" + 0.055*"мой" + 0.046*"видеть" + 0.035*"любить" + '
  '0.028*"рука" + 0.020*"точно" + 0.016*"делать" + 0.013*"грех" + 0.011*"саша" '
  '+ 0.011*"сын"'),
 (2,
  '0.103*"дело" + 0.046*"быть" + 0.029*"другой" + 0.024*"голова" + '
  '0.018*"этот" + 0.017*"муж" + 0.017*"пора" + 0.015*"сюда" + 0.014*"матушка" '
  '+ 0.013*"самый"'),
 (3,
  '0.042*"ваш" + 0.023*"сделать" + 0.023*"понимать" + 0.018*"стоить" + '
  '0.018*"господин" + 0.016*"белоносов" + 0.016*"молчать" + 0.016*"народ" + '
  '0.014*"разговор" + 0.013*"послать"'),
 (4,
  '0.063*"говорить" + 0.060*"человек" + 0.041*"знать" + 0.033*"такой" + '
  '0.032*"хотеть" + 0.028*"деньга" + 0.023*"бог" + 0.022*"твой" + '
  '0.020*"очень" + 0.017*"хороший"'),
 (5,
  '0.063*"свой" + 0.040*"мочь" + 0.036*"дом" + 0.031*"казаться" + '
  '0.02

In [None]:
# Compute Perplexity
Perplexity = lda_model.log_perplexity(corpus)   # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=parents_19, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [90]:
print('\nPerplexity: ', Perplexity )
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.566495988882286

Coherence Score:  0.4561042058041763


In [91]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

## 20 centure

In [92]:
corpus, id2word = topic_modeling(parents_20)



In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [94]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.080*"идти" + 0.017*"сюда" + 0.017*"дядя" + 0.017*"обидно" + '
  '0.015*"николай" + 0.013*"михайло" + 0.012*"день" + 0.012*"работать" + '
  '0.011*"скоро" + 0.010*"трудиться"'),
 (1,
  '0.040*"человек" + 0.038*"сказать" + 0.029*"мочь" + 0.027*"мой" + '
  '0.026*"семён" + 0.022*"свой" + 0.020*"жена" + 0.020*"отец" + 0.019*"жизнь" '
  '+ 0.018*"ступать"'),
 (2,
  '0.088*"говорить" + 0.028*"быть" + 0.022*"великий" + 0.020*"слышать" + '
  '0.019*"весь" + 0.015*"ходить" + 0.014*"пора" + 0.012*"положить" + '
  '0.012*"друг" + 0.010*"поехать"'),
 (3,
  '0.062*"дело" + 0.038*"павел" + 0.036*"бог" + 0.026*"сын" + 0.024*"послать" '
  '+ 0.017*"должный" + 0.017*"сегодня" + 0.016*"твой" + 0.016*"просить" + '
  '0.013*"врать"'),
 (4,
  '0.028*"сам" + 0.027*"пойти" + 0.021*"говорить" + 0.020*"душа" + '
  '0.017*"слушать" + 0.015*"жить" + 0.015*"колюшка" + 0.015*"чувствовать" + '
  '0.014*"сердце" + 0.012*"остаться"'),
 (5,
  '0.071*"хотеть" + 0.035*"помнить" + 0.022*"понять" + 0.019*"сидеть

In [None]:
# Compute Perplexity
Perplexity = lda_model.log_perplexity(corpus)   # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=parents_20, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [96]:
print('\nPerplexity: ', Perplexity )
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.13281553363572

Coherence Score:  0.5970926161392855


In [97]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

#LDA model for children' texts

## 18 centure

In [None]:
corpus, id2word = topic_modeling(children_18)

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [102]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.032*"казаться" + 0.027*"мой" + 0.019*"мера" + 0.019*"глаз" + '
  '0.017*"судьба" + 0.014*"воля" + 0.014*"виноватый" + 0.013*"надежда" + '
  '0.011*"россия" + 0.011*"хахахах"'),
 (1,
  '0.114*"мой" + 0.042*"ваш" + 0.027*"идти" + 0.025*"сердце" + 0.018*"француз" '
  '+ 0.017*"право" + 0.016*"жизнь" + 0.015*"скоро" + 0.014*"дама" + '
  '0.014*"слышать"'),
 (2,
  '0.050*"отец" + 0.026*"изволить" + 0.023*"смерть" + 0.018*"взять" + '
  '0.017*"любезный" + 0.015*"таков" + 0.014*"напрасно" + 0.014*"ныне" + '
  '0.013*"имя" + 0.013*"сударь"'),
 (3,
  '0.064*"мочь" + 0.045*"любовь" + 0.042*"дело" + 0.032*"думать" + '
  '0.026*"постоять" + 0.024*"другой" + 0.018*"родитель" + 0.016*"ум" + '
  '0.015*"всякий" + 0.013*"мать"'),
 (4,
  '0.056*"человек" + 0.049*"который" + 0.049*"такой" + 0.029*"сказать" + '
  '0.028*"любить" + 0.023*"свет" + 0.021*"говорить" + 0.016*"надобный" + '
  '0.016*"крест" + 0.014*"жена"'),
 (5,
  '0.049*"сей" + 0.030*"русский" + 0.028*"очень" + 0.023*"добродетель" 

In [None]:
# Compute Perplexity
Perplexity = lda_model.log_perplexity(corpus)   # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=children_18, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [118]:
print('\nPerplexity: ', Perplexity )
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.325303256915332

Coherence Score:  0.5599544837551791


In [105]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

## 19 centure

In [None]:
corpus, id2word = topic_modeling(children_19)

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [108]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.037*"делать" + 0.032*"время" + 0.027*"дать" + 0.022*"тяжело" + '
  '0.021*"прощать" + 0.021*"чувство" + 0.019*"сделать" + 0.018*"сюда" + '
  '0.018*"ждать" + 0.016*"нина"'),
 (1,
  '0.062*"хотеть" + 0.062*"сказать" + 0.044*"видеть" + 0.039*"который" + '
  '0.035*"маменька" + 0.033*"свой" + 0.028*"деньга" + 0.024*"день" + '
  '0.023*"понимать" + 0.022*"стать"'),
 (2,
  '0.040*"твой" + 0.024*"матушка" + 0.020*"пора" + 0.018*"рука" + 0.017*"друг" '
  '+ 0.016*"сметь" + 0.016*"искусство" + 0.015*"работа" + 0.014*"молодость" + '
  '0.014*"мысль"'),
 (3,
  '0.075*"быть" + 0.037*"господин" + 0.035*"бог" + 0.033*"саша" + '
  '0.032*"горский" + 0.029*"сумасшедший" + 0.028*"этот" + 0.023*"красть" + '
  '0.014*"нищий" + 0.012*"справедливо"'),
 (4,
  '0.048*"ваш" + 0.040*"идти" + 0.035*"брать" + 0.031*"наш" + 0.022*"глаз" + '
  '0.022*"свет" + 0.021*"казаться" + 0.019*"хороший" + 0.019*"угодный" + '
  '0.017*"сидеть"'),
 (5,
  '0.072*"любить" + 0.052*"мой" + 0.039*"жизнь" + 0.037*"год" +

In [None]:
# Compute Perplexity
Perplexity = lda_model.log_perplexity(corpus)   # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=children_19, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [110]:
print('\nPerplexity: ', Perplexity )
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.260231994944363

Coherence Score:  0.4789933319835115


In [111]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

## 20 centure

In [None]:
corpus, id2word = topic_modeling(children_20)

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [114]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.059*"жизнь" + 0.036*"мама" + 0.035*"старик" + 0.032*"человек" + '
  '0.030*"маша" + 0.023*"очень" + 0.022*"таков" + 0.020*"идти" + 0.017*"время" '
  '+ 0.016*"смотреть"'),
 (1,
  '0.099*"знать" + 0.018*"дом" + 0.016*"заявить" + 0.015*"прислать" + '
  '0.013*"обидеться" + 0.013*"скандал" + 0.013*"поцелуй" + 0.013*"любовь" + '
  '0.012*"влюбиться" + 0.012*"имя"'),
 (2,
  '0.071*"сказать" + 0.057*"какой" + 0.036*"почему" + 0.031*"один" + '
  '0.028*"год" + 0.024*"виноватый" + 0.018*"молчать" + 0.015*"неправда" + '
  '0.015*"уйти" + 0.015*"стараться"'),
 (3,
  '0.066*"видеть" + 0.030*"стать" + 0.029*"иванович" + 0.026*"думать" + '
  '0.021*"нужный" + 0.019*"слышать" + 0.016*"сергеевич" + 0.014*"штука" + '
  '0.014*"сложный" + 0.013*"павел"'),
 (4,
  '0.026*"ждать" + 0.025*"княжна" + 0.019*"рука" + 0.018*"эллен" + '
  '0.017*"прелестный" + 0.017*"чтонибыть" + 0.015*"подумать" + 0.015*"её" + '
  '0.015*"первый" + 0.015*"сократ"'),
 (5,
  '0.052*"любить" + 0.035*"казаться" + 0.034*"

In [None]:
# Compute Perplexity
Perplexity = lda_model.log_perplexity(corpus)   # a measure of how good the model is. lower the better.
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=children_20, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [116]:
print('\nPerplexity: ', Perplexity )
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.325303256915332

Coherence Score:  0.5599544837551791


In [117]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis