In [None]:
from sklearn import model_selection, preprocessing, linear_model, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
import gensim
import re
import pandas as pd
import string
import numpy as np

In [None]:
# Load the reviews spreadsheet into `data`; the file name is resolved
# relative to the current working directory.
data = pd.read_excel("отзывы за лето.xls")

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Text preprocessing: imports and shared objects used by preprocess_text.
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation

# Characters stripped from every review (string.punctuation, i.e. ASCII
# punctuation only).
exclude = set(punctuation)
# Russian stop-word set.
# NOTE(review): `sw` is not referenced anywhere else in the visible code.
sw = set(get_stop_words("ru"))
# One shared morphological analyzer instance, reused for every token.
morpher = MorphAnalyzer()

def preprocess_text(txt):
    """Normalize one review into a space-separated string of lemmas.

    Steps: cast the value to ``str``, drop every character contained in the
    module-level ``exclude`` set, lowercase, glue a standalone negation
    particle "не" to the word that follows it, then replace each
    whitespace-separated token with the ``normal_form`` of its first
    ``morpher.parse`` result.

    Args:
        txt: Raw cell value. Non-string values are cast with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        str: The lemmas joined by single spaces ("" for empty or missing
        input).
    """
    # Missing values would otherwise be stringified into a bogus
    # "none"/"nan" token that ends up in the feature space.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ""
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    # Attach the particle "не" to the FOLLOWING word ("не люблю" -> "нелюблю").
    # The previous pattern removed the whitespace BEFORE any word that merely
    # starts with "не" ("было немного" -> "былонемного"), and it was written
    # as a non-raw string containing an invalid escape sequence.
    txt = re.sub(r"\bне\s+", "не", txt)
    # Punctuation characters were already removed above, so no token can be
    # a member of `exclude`; the old per-token filter never dropped anything.
    lemmas = [morpher.parse(word)[0].normal_form for word in txt.split()]
    return " ".join(lemmas)

# Lemmatize every review in the 'Content' column with preprocess_text.
data['text'] = data['Content'].apply(preprocess_text)
# Drop rows whose Rating equals 3. The explicit .copy() makes `data` an
# independent frame, so the column assignment below does not write into a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
data = data[data['Rating'] != 3].copy()
# Boolean target: True when Rating is above 3, False otherwise.
data['target'] = data['Rating'] > 3

In [None]:
# Convert the boolean target to integers (True -> 1, False -> 0), then
# preview the frame.
data['target'] = data['target'].astype(int)
data.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# column and seeded (random_state=13) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2,
                                                    random_state=13, stratify=data['target'])

In [None]:
# Vectorizer that hashes tokens into 1000 feature columns.
# NOTE(review): despite the name `count_vect`, this is a HashingVectorizer,
# not a CountVectorizer.
count_vect = HashingVectorizer(n_features=1000).fit(X_train.values)

In [None]:
# Turn the train and test texts into feature matrices with the same vectorizer.
xtrain = count_vect.transform(X_train)
xtest = count_vect.transform(X_test)

In [None]:
# Fit logistic regression on the hashed training features.
# class_weight="balanced" asks scikit-learn to weight classes inversely to
# their frequency in y_train.
lr = linear_model.LogisticRegression(class_weight="balanced").fit(xtrain, y_train)

In [None]:
# ROC AUC on the held-out split, scored with column 1 of predict_proba
# (the predicted probability of target == 1).
metrics.roc_auc_score(y_test, lr.predict_proba(xtest)[:, 1])

# Посмотрим на то, как будет работать тематическое моделирование в данном случае

In [None]:
# Rebind `data` from the DataFrame to a plain list of preprocessed review strings.
# The isinstance guard keeps this cell re-runnable: once `data` is already
# the list, indexing it with 'text' would raise TypeError.
if isinstance(data, pd.DataFrame):
    data = data['text'].values.tolist()

def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of documents; each item is cast to ``str``
            before tokenization.

    Yields:
        list: The tokens produced by ``gensim.utils.simple_preprocess`` for
        one document, in input order.
    """
    for sentence in sentences:
        # De-accenting stays off for Russian text: it removes combining marks
        # after Unicode decomposition, which rewrites "й" as "и" and "ё" as
        # "е" and so corrupts ordinary Cyrillic words.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=False)

# Tokenize every document up front (sent_to_words is a generator).
data_words = list(sent_to_words(data))
# Phrase model trained on the tokenized documents (min_count=5, threshold=100).
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# Second phrase model trained on the bigram-transformed documents, so it can
# join an already merged pair with one more token.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser wrappers built from the trained models; these are the objects
# applied to documents from here on.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Sanity check: show the first document after both transforms.
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        list: One ``bigram_mod[doc]`` result per input document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        list: One ``trigram_mod[bigram_mod[doc]]`` result per input
        document, in order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [None]:
# Apply the bigram model to every tokenized document.
data_words_bigrams = make_bigrams(data_words)

In [None]:
# Dictionary built from the bigram-merged documents (token <-> integer id).
id2word = gensim.corpora.Dictionary(data_words_bigrams)
# Alias for the same list of documents; used only in the line below.
texts = data_words_bigrams
# Bag-of-words corpus: one id2word.doc2bow(...) result per document.
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Train a 20-topic LDA model on the bag-of-words corpus, using a fixed
# random_state so repeated runs start from the same seed.
# Training settings: one pass over the corpus (passes=1), documents consumed
# in chunks of 100, update_every=1, alpha='auto'.
# NOTE(review): per_word_topics=True changes what lda_model[...] returns —
# confirm downstream code expects that shape.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=1,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the model's topic descriptions.
print(lda_model.print_topics())
# Apply the trained model to the whole corpus.
# NOTE(review): `doc_lda` is not used further in the visible cells.
doc_lda = lda_model[corpus]