In [4]:
import pandas as pd
import pickle
import nltk
nltk.download('punkt')

nltk.download('stopwords')
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser

import itertools

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Денис\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Денис\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Source data

In [None]:
geo = pd.read_excel('geo_comment.xlsx')
comment = geo['comment']

# Prepare text

First, we should lemmatize text, remove punctuation (leave only dots)

### Yandex Mystem

In [39]:
from pymystem3 import Mystem


def lemma(word):    
    m = Mystem()
    return m.lemmatize(word)[0]

### Better use Morphy

In [6]:
import pymorphy2


def lemma(word):
    morph = pymorphy2.MorphAnalyzer()
    p = morph.parse(word)[0]
    return p.normal_form

In [None]:
%%time

# tokenize to words
# lemmatize them
# and then rebuild the text
def clean_text(text):
    parts = nltk.word_tokenize(text)
    
    restrict_parts = [':', ',', ';']
    
    # allow punctuation
    punct = ['.', '-']
    
    result = ''
    
    for part in parts:
        if part not in restrict_parts:
            if part in punct:
                word = part
            # lemmatize
            else:
                word = ' ' + lemma(part) 
            result += word
    
    return result
    
    
#comment['comment'] = comment['comment'].apply(clean_text)
new_comment = []
i = 0
for c in comment:
    new_comment.append(clean_text(c))
    i += 1
    print(f'Done {i} of ', len(comment))
    
f = open('normalized_comments.pkl', 'rb')
pickle.dump(new_comment, f)
f.close()

Done 1 of  70382
Done 2 of  70382
Done 3 of  70382
Done 4 of  70382
Done 5 of  70382
Done 6 of  70382
Done 7 of  70382
Done 8 of  70382
Done 9 of  70382
Done 10 of  70382
Done 11 of  70382
Done 12 of  70382
Done 13 of  70382
Done 14 of  70382
Done 15 of  70382
Done 16 of  70382
Done 17 of  70382
Done 18 of  70382
Done 19 of  70382
Done 20 of  70382
Done 21 of  70382
Done 22 of  70382
Done 23 of  70382
Done 24 of  70382
Done 25 of  70382
Done 26 of  70382
Done 27 of  70382
Done 28 of  70382
Done 29 of  70382
Done 30 of  70382
Done 31 of  70382
Done 32 of  70382
Done 33 of  70382
Done 34 of  70382
Done 35 of  70382
Done 36 of  70382
Done 37 of  70382
Done 38 of  70382
Done 39 of  70382
Done 40 of  70382
Done 41 of  70382
Done 42 of  70382
Done 43 of  70382
Done 44 of  70382
Done 45 of  70382
Done 46 of  70382
Done 47 of  70382
Done 48 of  70382
Done 49 of  70382
Done 50 of  70382
Done 51 of  70382
Done 52 of  70382
Done 53 of  70382
Done 54 of  70382
Done 55 of  70382
Done 56 of  70382
D

# Chunking

In [None]:
%%time

# load sentences tokenizer
sent_token = nltk.data.load('tokenizers/punkt/russian.pickle')

# each comment to sentences
sentences = comment.apply(sent_token.tokenize)

# concencate in one list
sentences = itertools.chain.from_iterable(sentences.tolist())

# tokenize to words as gensim accept sentences in list of words format
sent_list = []
for sent in sentences:
    sent_list += [nltk.word_tokenize(sent)]

## create model

In [None]:
%%time

# create phrases model     
phrases = Phrases(sent_list, min_count=10, threshold=10)

# Export the trained model = use less RAM, faster processing. Model updates no longer possible.
bigram = Phraser(phrases)

# save model
bigram.save("bigram_model.pkl")

## apply model

In [None]:
%%time

def chunks(text, model):
    result = ''
    
    sentences = sent_token.tokenize(text)
    for sent in sentences:
        for part in model[nltk.word_tokenize(sent)]:
            # rebuild text
            if part != '.':
                result += part
            else:
                result += ' ' + part
        

bigram = Phraser.load("bigram_model.pkl")

chunked_comments = [chunks(x, bigram) for x in new_comment]    

# Document-term matrix

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer

# we must remove stop words
sw = stopwords.words('russian')
with open('comment_stop_words.txt', encoding='utf-8') as f:
    our_stop_words = f.read().splitlines()
    f.close()

sw = sw + our_stop_words

# We will remove digits
word_pattern = r'[a-zA-Zа-яА-Я]+'

cv = CountVectorizer(stop_words=sw, token_pattern=word_pattern)
data_cv = cv.fit_transform(chunked_comments)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())

#del cv
#del data_cv

# Save document term matrix to file
Need a lot of memory for saving it as Dataframe. So we save it as sparse matrix

In [5]:
f = open('dtm.pkl', 'wb')
pickle.dump(data_cv, f)
f.close()

# save countvectorizer
f = open('cv.pkl', 'wb')
pickle.dump(cv, f)
f.close()

# Load document term matrix

In [None]:
f = open('dtm.pkl', 'wb')
data_dtm = pickle.load(f)
f.close()

In [None]:
print('Text count: ', data_dtm.shape[0], ' Word count: ', data_dtm.shape[1])

# Word stats

In [None]:
%%time

words = {}

for word in data_dtm.columns:
    # first lemmatize colum
    w = lemma(word)
    
    if w in words:
        words[w] += data_dtm[word].sum()
    else:
        words[w] = data_dtm[word].sum() 

# save to DF, sort
w = pd.DataFrame.from_dict(words, orient='index')
w.sort_values(by=0,ascending=False, inplace=True)

In [None]:
# save to csv
w.to_csv('word_stats.csv', encoding='utf-8')

f = open('words.pkl', 'wb')
pickle.dump(words, f)
f.close()

In [11]:
f = open('words.pkl', 'rb')
words = pickle.load(f)
f.close()

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud
plt.figure(figsize=(15,12))

wc = WordCloud(background_color="white", colormap="Dark2", max_font_size=150, random_state=42)

wc = wc.generate_from_frequencies(words)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")

plt.show()

# Look at freq of words
First of all, i've added some words for our stop dictionary.

As we see, most the messages are related to land use and construction.

# Topic modeling
* LDA
* LSI (Latent Semantic Indexing)
* NMF Non-Negative Matrix Factorization 

## LDA
As it in video

In [15]:
%%time
from gensim import matutils, models
import scipy.sparse

# we have to transpose
# convert from df to sparse matrix
sparse_counts = scipy.sparse.csr_matrix(data_dtm.transpose())

# sparse matrix to corpus
corpus = matutils.Sparse2Corpus(sparse_counts)

corpus

<gensim.matutils.Sparse2Corpus at 0x1777777ca48>

In [18]:
f = open('sparse2corpus.pkl', 'wb')
pickle.dump(corpus, f)
f.close()

In [37]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10, passes=10)
lda.print_topics()

[(0,
  '0.110*"москвы" + 0.073*"города" + 0.046*"землепользования" + 0.034*"округа" + 0.030*"административном" + 0.029*"северо" + 0.027*"западном" + 0.026*"территории" + 0.025*"правил" + 0.021*"функциональной"'),
 (1,
  '0.034*"против" + 0.030*"категорически" + 0.029*"территории" + 0.020*"героев" + 0.018*"проект" + 0.018*"придомовой" + 0.017*"д" + 0.016*"застройки" + 0.016*"внутри" + 0.013*"строительства"'),
 (2,
  '0.074*"территории" + 0.056*"шоссе" + 0.036*"требуем" + 0.030*"серп" + 0.030*"молот" + 0.030*"энтузиастов" + 0.021*"комплекса" + 0.019*"процентом" + 0.019*"отвести" + 0.019*"вал"'),
 (3,
  '0.031*"панфиловцев" + 0.027*"дома" + 0.021*"проекта" + 0.021*"жителей" + 0.021*"территории" + 0.021*"свободы" + 0.020*"адресу" + 0.018*"д" + 0.016*"жилого" + 0.015*"проект"'),
 (4,
  '0.017*"москвы" + 0.016*"использования" + 0.015*"земельных" + 0.015*"участков" + 0.014*"кодекса" + 0.014*"градостроительного" + 0.013*"города" + 0.013*"пзз" + 0.012*"москве" + 0.011*"соответствии"'),
 (5,
  '