In [22]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import re

from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import classification_report

from nltk.stem import PorterStemmer
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim

import pickle

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
posts = pd.read_csv('Data/final_posts.csv').text

In [15]:
posts = posts.map(lambda x: re.sub(r'[– \-,\.?!\\/()»«“”#%=;:*+_=]+', ' ', x.lower()))
posts = [simple_preprocess(line, deacc=True) for line in posts]
posts[4]

['сегодня',
 'ваш',
 'любимыи',
 'человек',
 'признается',
 'том',
 'что',
 'ему',
 'оказывается',
 'никогда',
 'не',
 'нравилось',
 'представителях',
 'пола',
 'какое',
 'то',
 'качество',
 'которое',
 'буиным',
 'цветом',
 'цветет',
 'вас',
 'лишь',
 'вы',
 'заставили',
 'его',
 'посмотреть',
 'на',
 'это',
 'самое',
 'качество',
 'по',
 'другому',
 'можете',
 'быть',
 'польщены']

In [16]:
bigram = gensim.models.Phrases(posts, min_count=3, threshold=5) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[posts])  
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [17]:
bigram_mod[posts[4]]

['сегодня_ваш',
 'любимыи_человек',
 'признается',
 'том_что',
 'ему',
 'оказывается',
 'никогда',
 'не',
 'нравилось',
 'представителях',
 'пола',
 'какое_то',
 'качество',
 'которое',
 'буиным',
 'цветом',
 'цветет',
 'вас',
 'лишь',
 'вы',
 'заставили',
 'его',
 'посмотреть_на',
 'это',
 'самое',
 'качество',
 'по_другому',
 'можете',
 'быть',
 'польщены']

In [19]:
def remove_stopwords(texts):
    stop_words = stopwords.words("russian")
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def stemming(texts):
    stemmer = RussianStemmer()
    texts_out = []
    for text in texts:
        texts_out.append([stemmer.stem(word) for word in text])
    return texts_out

In [24]:
posts = remove_stopwords(posts)

In [25]:
posts_bigrams = make_bigrams(posts)
posts_trigrams = make_trigrams(posts)

In [54]:
posts_stem = stemming(posts)

In [55]:
posts_stem = list(map(lambda a:
                      ['вы' if x=='стрельц' else x for x in a], posts_stem))

In [56]:
# Create Dictionary
id2word = corpora.Dictionary(posts_stem)
# Create Corpus
texts = posts_stem
# filter extremes
id2word.filter_extremes(no_below=3, no_above=0.75)
id2word.compactify()
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[0:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]]


In [57]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[0:1]]

[[('ваш', 1),
  ('вниман', 1),
  ('выб', 1),
  ('выясн', 1),
  ('избега', 1),
  ('кол', 1),
  ('котор', 1),
  ('месяц', 1),
  ('может', 1),
  ('незащищен', 1),
  ('неуверен', 1),
  ('попробу', 1),
  ('постара', 1),
  ('почувствова', 1),
  ('приниэт', 1),
  ('проявлен', 1),
  ('сегодн', 1),
  ('сеичас', 1),
  ('сил', 1),
  ('сильн', 1),
  ('ситуац', 2),
  ('собствен', 1),
  ('тех', 1),
  ('треб', 1),
  ('трудноват', 1),
  ('уязвим', 1),
  ('характер', 1),
  ('черт', 1),
  ('эт', 1)]]

In [58]:
len(id2word)

10125

In [59]:
%%time
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=,
                                            random_state=100,
                                            update_every=2,
                                            chunksize=30,
                                            passes=5,
                                            alpha='auto',
                                            per_word_topics=True)  

Wall time: 33.4 s


In [60]:
lda_model.print_topics(num_words=6)

[(0,
  '0.030*"ден" + 0.014*"врем" + 0.013*"дел" + 0.013*"могут" + 0.012*"возможн" + 0.010*"котор"'),
 (1,
  '0.036*"ваш" + 0.024*"эт" + 0.023*"человек" + 0.020*"сегодн" + 0.013*"любим" + 0.013*"будут"'),
 (2,
  '0.033*"завтр" + 0.028*"сво" + 0.018*"нов" + 0.016*"жизн" + 0.014*"работ" + 0.013*"возможн"')]

In [61]:
# Compute Perplexity on train
print('\nPerplexity on train: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity on train:  -7.589638325328963


In [62]:
# Compute Perplexity on test
corpus_test = [id2word.doc2bow(text) for text in posts_stem]
print('\nPerplexity on test: ', lda_model.log_perplexity(corpus_test))


Perplexity on test:  -7.589638328715409
