### Все необходимые пипы и импорты

In [1]:
from pymystem3 import Mystem
m = Mystem()
import string
import gensim
import gensim.corpora as corpora
from gensim.utils import tokenize
from gensim.summarization.textcleaner import split_sentences

In [2]:
from typing import List
import json
import os
from tqdm.auto import tqdm
import pandas as pd
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")
from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures, BigramCollocationFinder, TrigramCollocationFinder
import re
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)


segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

### Предобработка для работы со словами

Функция для разметки именованных сущностей, токенизации и удаления стоп-слов из файлов с текстами

In [76]:
def write_tokens_from_dir(papka: str, sec_name: str):
    for f in os.listdir(papka):
        f = os.path.join(papka, f)
        with open(f, encoding='utf-8') as file:
            text = file.read()
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_ner(ner_tagger)
        ents = [span.text.lower() for span in doc.spans]
        tokens_gen = tokenize(text, lowercase=True)
        tokens = []
        for token in tokens_gen:
            if token not in russian_stopwords and token not in ents:
                tokens.append(token)
        tokens_str = ' '.join(tokens)
        with open(f'{sec_name}.txt', 'a', encoding='utf-8') as fi:
            fi.write(tokens_str)

Запись токенов в файл

In [77]:
def get_all_tokens(author: str, sec_name: str, e: bool):
    if e is False:
        write_tokens_from_dir(author, sec_name)
    else:
        author_before = os.path.join(author, 'до эмиграции')
        author_after = os.path.join(author, 'после эмиграции')
        sec_name_before = sec_name+'_до'
        sec_name_after = sec_name+'_после'
        write_tokens_from_dir(author_before, sec_name_before)
        write_tokens_from_dir(author_after, sec_name_after)

In [20]:
soviet = ['Булгаков', 'Шолохов', 'Фадеев', 'Платонов', 'Стругацкие']
emig = ['Солженицын', 'Бунин', 'Зайцев', 'Ремизов', 'Шмелёв']

In [79]:
for auth in soviet:
    author = os.path.join('D:\курсовая_3', auth)
    get_all_tokens(author, auth, False)

In [80]:
for auth in tqdm(emig):
    author = os.path.join('D:\курсовая_3', auth)
    get_all_tokens(author, auth, True)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Подсчёт токенов для каждого автора

In [81]:
auth_NumOfTok = {}
for a in soviet:
    with open(f'{a}.txt', encoding='utf-8') as fil:
        text = fil.read()
        tokens = text.split()
        auth_NumOfTok[a] = len(tokens)

In [82]:
for b in emig:
    auth_NumOfTok[b] = {}
    with open(f'{b}_до.txt', encoding='utf-8') as fil:
        text = fil.read()
        tokens = text.split()
        auth_NumOfTok[b]['до'] = len(tokens)
    with open(f'{b}_после.txt', encoding='utf-8') as fil:
        text = fil.read()
        auth_NumOfTok[b]['после'] = len(text.split())
        auth_NumOfTok[b]['всего'] = auth_NumOfTok[b]['до'] + auth_NumOfTok[b]['после']

In [83]:
auth_NumOfTok

{'Булгаков': 254748,
 'Шолохов': 439344,
 'Фадеев': 225490,
 'Платонов': 247558,
 'Стругацкие': 1237595,
 'Солженицын': {'до': 502870, 'после': 308324, 'всего': 811194},
 'Бунин': {'до': 58995, 'после': 110018, 'всего': 169013},
 'Зайцев': {'до': 73017, 'после': 294319, 'всего': 367336},
 'Ремизов': {'до': 156316, 'после': 82127, 'всего': 238443},
 'Шмелёв': {'до': 154873, 'после': 250581, 'всего': 405454}}

Лемматизация

In [None]:
for a in tqdm(soviet):
    with open(f'{a}.txt', encoding='utf-8') as fil:
        text = fil.read()
        lem_text_list = m.lemmatize(text)
        lem_text = ''.join(lem_text_list)
    with open(f'{a}_леммы.txt', 'a', encoding='utf-8') as fil:
        fil.write(lem_text)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

In [6]:
for b in tqdm(emig):
    with open(f'{b}_до.txt', encoding='utf-8') as fil:
        text = fil.read()
        lem_text_list = m.lemmatize(text)
        lem_text = ''.join(lem_text_list)
    with open(f'{b}_до_леммы.txt', 'a', encoding='utf-8') as fil:
        fil.write(lem_text)
    with open(f'{b}_после.txt', encoding='utf-8') as fil:
        text = fil.read()
        lem_text_list_2 = m.lemmatize(text)
        lem_text_2 = ''.join(lem_text_list_2)
    with open(f'{b}_после_леммы.txt', 'a', encoding='utf-8') as fil:
        fil.write(lem_text_2)
    with open(f'{b}_леммы.txt', 'a', encoding='utf-8') as fil:
        fil.write(lem_text+lem_text_2)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Стругацких пришлось лемматизировать по частям - не хватало мощности.

In [7]:
with open('Стругацкие.txt', encoding='utf-8') as fil:
    text = fil.read()
    l = len(text) + 1 
part1 = text[0:l//2]
part2 = text[l//2:]
lem_text_1_list = m.lemmatize(part1)
lem_text_2_list = m.lemmatize(part1)
lem_text_1 = ''.join(lem_text_1_list)
lem_text_2 = ''.join(lem_text_2_list)
lem_text = lem_text_1 + lem_text_2
with open('Стругацкие_леммы.txt', 'a', encoding='utf-8') as fil:
    fil.write(lem_text)

In [6]:
corpus = []
all_texts = soviet + emig
for a in tqdm(all_texts):
    with open(f'{a}_леммы.txt', encoding='utf-8') as f:
        text = f.read()
        corpus.append(text)
len(corpus)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




10

### Эксперименты со словами

##### Эксперимент 1.

Корпус: 10 документов – по одному на каждого автора. 

In [7]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
dense = tfidf_matrix.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names, index=all_texts)

In [9]:
for a in tqdm(all_texts):
    with open(f'{a}_100.txt', 'a', encoding='utf-8') as f:
        n = pd.Series(df.loc[f'{a}'])
        f.write(n[n > 0].sort_values(ascending=False)[:100].to_string())

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




##### Эксперимент 2.

Корпус: 5 документов советских писателей, 5 документов - тексты эмигрантов до эмиграции и 1 документ – тексты всех эмигрантов после эмиграции.

In [50]:
corpus2 = []
emig_after = ''
index2 = []

for b in tqdm(emig):
    with open (f'{b}_после_леммы.txt', encoding='utf-8') as f:
        text = f.read()
    emig_after = ''.join(text)
    with open(f'{b}_до_леммы.txt', encoding='utf-8') as f:
        text = f.read()
        corpus2.append(text)
    index2.append(b)
with open('Эмигранты_после.txt', 'a', encoding='utf-8') as f:
    f.write(emig_after)
    index2.append('Эмигранты')

corpus2.append(emig_after)

for a in tqdm(soviet):
    with open(f'{a}_леммы.txt', encoding='utf-8') as fil:
        text = fil.read()
    corpus2.append(text)
    index2.append(a)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [27]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = vectorizer.fit_transform(corpus2)
feature_names = vectorizer.get_feature_names()
dense = tfidf_matrix.todense()
denselist = dense.tolist()
df2 = pd.DataFrame(denselist, columns=feature_names, index=index2)

In [32]:
with open('Эмигранты_после_100.txt', 'a', encoding='utf-8') as f:
    n = pd.Series(df2.loc['Эмигранты'])
    f.write(n[n > 0].sort_values(ascending=False)[:100].to_string())

##### Эксперимент 3

5 небольших экспериментов: для каждого эмигранта корпус состоит из 5 документов советских писателей и 1 документа – текстов каждого эмигранта до эмиграции.

In [18]:
def auth_vs_sov(em):
    indexi = soviet
    indexi.append(em)
    corpusi = []
    for a in indexi:
        if a == indexi[-1]:
             with open(f'{a}_до_леммы.txt', encoding='utf-8') as fil:
                    text = fil.read()
        else:
            with open(f'{a}_леммы.txt', encoding='utf-8') as fil:
                text = fil.read()
        corpusi.append(text)
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    tfidf_matrix = vectorizer.fit_transform(corpusi)
    feature_names = vectorizer.get_feature_names()
    dense = tfidf_matrix.todense()
    denselist = dense.tolist()
    dfi = pd.DataFrame(denselist, columns=feature_names, index=indexi)
    with open(f'{em}_до_vs_soviet_100.txt', 'a', encoding='utf-8') as f:
        n = pd.Series(dfi.loc[f'{em}'])
        f.write(n[n > 0].sort_values(ascending=False)[:100].to_string())

In [21]:
for em in tqdm(emig):
    auth_vs_sov(em)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




##### Эксперимент 4

5 небольших экспериментов для каждого автора-эмигранта: корпус состоял из 5 документов советских писателей, 5 документов писателей-эмигрантов до эмиграции и 1 документа с текстами автора-эмигранта, написанными после эмиграции.

In [8]:
def after_vs_before_and_sov(em):
    indexi = []
    corpusi = []
    for a in emig:
        indexi.append(a)
        with open(f'{a}_до_леммы.txt', encoding='utf-8') as fil:
            text = fil.read()
        corpusi.append(text)
    for a in soviet:
        indexi.append(a)
        with open(f'{a}_леммы.txt', encoding='utf-8') as fil:
            text = fil.read()
        corpusi.append(text)
    with open(f'{em}_после_леммы.txt', encoding='utf-8') as fil:
        text = fil.read()
        corpusi.append(text)
        indexi.append(f'{em}_после')
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    tfidf_matrix = vectorizer.fit_transform(corpusi)
    feature_names = vectorizer.get_feature_names()
    dense = tfidf_matrix.todense()
    denselist = dense.tolist()
    dfi = pd.DataFrame(denselist, columns=feature_names, index=indexi)
    with open(f'{em}_vs_before_and_sov_100.txt', 'a', encoding='utf-8') as f:
        n = pd.Series(dfi.loc[f'{em}_после'])
        f.write(n[n > 0].sort_values(ascending=False)[:100].to_string())

In [9]:
for em in tqdm(emig):
    after_vs_before_and_sov(em)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




##### Эксперимент 5.

Корпус – 5 текстов советских писателей в 1 документе, и 5 документов – текстов эмигрантов после эмиграции. 

In [15]:
index3 = emig
corpus3 = []
for a in tqdm(index3):
    with open(f'{a}_после_леммы.txt', encoding='utf-8') as f:
        text = f.read()
    corpus3.append(text)
index3.append('Советские')
all_sov = ''
for b in soviet:
    with open(f'{b}_леммы.txt', encoding='utf-8') as f:
        text = f.read()
    all_sov = ''.join(text)
corpus3.append(all_sov)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [16]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = vectorizer.fit_transform(corpus3)
feature_names = vectorizer.get_feature_names()
dense = tfidf_matrix.todense()
denselist = dense.tolist()
df3 = pd.DataFrame(denselist, columns=feature_names, index=index3)
with open('soviet_vs_emig_after_100.txt', 'a', encoding='utf-8') as f:
    n = pd.Series(df3.loc[f'{index3[-1]}'])
    f.write(n[n > 0].sort_values(ascending=False)[:100].to_string())

Много слов Стругацких, посмотрим без них. Корпус – 4 текста советских писателей в 1 документе, и 5 документов – текстов эмигрантов после эмиграции.

In [16]:
index7 = emig
corpus7 = []
for a in tqdm(index7):
    with open(f'{a}_после_леммы.txt', encoding='utf-8') as f:
        text = f.read()
    corpus7.append(text)
index7.append('Советские')
all_sov_wo_stru = ''
for b in soviet:
    if b != 'Стругацкие':
        with open(f'{b}_леммы.txt', encoding='utf-8') as f:
            text = f.read()
        all_sov_wo_stru = ''.join(text)
corpus7.append(all_sov_wo_stru)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [17]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
tfidf_matrix = vectorizer.fit_transform(corpus7)
feature_names = vectorizer.get_feature_names()
dense = tfidf_matrix.todense()
denselist = dense.tolist()
df7 = pd.DataFrame(denselist, columns=feature_names, index=index7)
with open('soviet_woStrug_vs_emig_after_100.txt', 'a', encoding='utf-8') as f:
    n = pd.Series(df7.loc[f'{index7[-1]}'])
    f.write(n[n > 0].sort_values(ascending=False)[:100].to_string())

### Эксперименты с биграммами и триграммами

Создаю объект, где хранятся всякие метрики для биграмм (bigram_measures). В finder_sov хранятся все биграммы произведений советских писателей. В sov_len хранится количество всех биграмм.

In [96]:
with open('Солженицын.txt', encoding='utf-8') as f:
    text = f.read()
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text)
print(finder[:10])

FileNotFoundError: [Errno 2] No such file or directory: 'Солженицын.txt'

### Топики

##### Советские писатели

In [6]:
def lemm_stories(papka: str):
    auth_lemmas = []
    for f in os.listdir(papka):
        f = os.path.join(papka, f)
        with open(f, encoding='utf-8') as file:
            text = file.read()
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        lemmas = []
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
            if token.text not in russian_stopwords and token.pos in ['NOUN', 'ADJ', 'VERB', 'ADV']:
                    lemmas.append(token.lemma)
        auth_lemmas.append(lemmas)
    return auth_lemmas

In [72]:
all_sov_list = []
for auth in tqdm(soviet):
    author = os.path.join('D:\курсовая_3', auth)
    lem = lemm_stories(author)
    all_sov_list.extend(lem)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [73]:
len(all_sov_list)

116

Создаём словарь и корпус

In [86]:
id2word = corpora.Dictionary(all_sov_list)
texts = all_sov_list
corpus4 = [id2word.doc2bow(text) for text in texts]

In [87]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus4[:1]]

[[('1', 3),
  ('1-го', 2),
  ('10', 1),
  ('11', 3),
  ('13', 4),
  ('14', 1),
  ('14-го', 1),
  ('15', 1),
  ('15-го', 1),
  ('16', 3),
  ('16-го', 1),
  ('17', 4),
  ('17-го', 2),
  ('17-ий', 2),
  ('18', 1),
  ('18-го', 3),
  ('19', 1),
  ('19-го', 2),
  ('19-м', 1),
  ('1916', 3),
  ('1917', 7),
  ('1918', 5),
  ('1922', 1),
  ('2', 1),
  ('2-го', 1),
  ('2-е', 1),
  ('2-й', 1),
  ('20-го', 1),
  ('21', 1),
  ('21-го', 1),
  ('21-дневный', 1),
  ('25', 1),
  ('25-го', 1),
  ('27-го', 1),
  ('29', 1),
  ('3', 2),
  ('6-го', 1),
  ('7', 1),
  ('8-го', 1),
  ('9', 1),
  ('i', 1),
  ('x', 1),
  ('абажур', 4),
  ('абсолютно', 6),
  ('авдотья', 2),
  ('автоклав', 1),
  ('автоматический', 1),
  ('автомобиль', 1),
  ('агроном', 5),
  ('адреналин', 1),
  ('адрес', 1),
  ('аккорд', 2),
  ('аккуратно', 1),
  ('актер', 1),
  ('акушер', 1),
  ('акушерка', 30),
  ('акушерка-фельдшерица', 1),
  ('акушерский', 7),
  ('акушерство', 5),
  ('акцент', 1),
  ('алексеевский', 1),
  ('алеть', 1),
  ('алы

Тренируем модель

In [88]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus4,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

Записываем результаты в файл

In [92]:
with open('Советские топики.txt', 'a', encoding='utf-8') as f:
    for topic in lda_model.print_topics():
        f.write(str(topic[0])+' - '+str(topic[-1])+'\n')

##### Писатели-эмигранты после эмиграции

In [78]:
all_emig_after_list = []
for auth in tqdm(emig):
    author_all = os.path.join('D:\курсовая_3', auth)
    author = os.path.join(author_all, 'после эмиграции')
    lem = lemm_stories(author)
    all_emig_after_list.extend(lem)
print(len(all_emig_after_list))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


50


Создаём словарь и корпус

In [79]:
id2word = corpora.Dictionary(all_emig_after_list)
texts = all_emig_after_list
corpus5 = [id2word.doc2bow(text) for text in texts]

Тренируем модель

In [80]:
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus5,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

Записываем результаты в файл

In [93]:
with open('Эмигрантские_после топики.txt', 'a', encoding='utf-8') as f:
    for topic in lda_model2.print_topics():
        f.write(str(topic[0])+' - '+str(topic[-1])+'\n')

##### Писатели-эмигранты до эмиграции

In [10]:
all_emig_before_list = []
for auth in tqdm(emig):
    author_all = os.path.join('D:\курсовая_3', auth)
    author = os.path.join(author_all, 'до эмиграции')
    lem = lemm_stories(author)
    all_emig_before_list.extend(lem)
print(len(all_emig_before_list))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


42


Создаём словарь и корпус

In [12]:
id2word = corpora.Dictionary(all_emig_before_list)
texts = all_emig_before_list
corpus6 = [id2word.doc2bow(text) for text in texts]

Тренируем модель

In [13]:
lda_model3 = gensim.models.ldamodel.LdaModel(corpus=corpus6,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

Записываем результаты в файл

In [14]:
with open('Эмигрантские_до топики.txt', 'a', encoding='utf-8') as f:
    for topic in lda_model3.print_topics():
        f.write(str(topic[0])+' - '+str(topic[-1])+'\n')