### Начало повторяет код на вебинаре, внизу есть ячейка, в которой описаны изменения и после которой они начинаются

In [1]:
import nltk
import pandas as pd
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [4]:
from gensim.corpora.dictionary import Dictionary
import re
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize # https://github.com/natasha/razdel

import pymorphy2  # pip install pymorphy2

In [5]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [6]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [7]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [8]:
%%time
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  from ipykernel import kernelapp as app


Wall time: 31 s


In [9]:
%%time
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 3min 49s


In [10]:
texts = [t for t in news['title'].values]
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [11]:
from gensim.models import LdaModel

In [12]:
%%time
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Wall time: 38 s


In [13]:
from gensim.test.utils import datapath
temp_file = datapath("model.lda")
lda.save(temp_file)
lda = LdaModel.load(temp_file)

In [14]:
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(4, 0.1946479),
 (10, 0.2218459),
 (13, 0.028275497),
 (20, 0.3436538),
 (23, 0.12834638),
 (24, 0.06602482)]

In [15]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: человек тело произойти ребёнок полиция пострадать сотрудник
topic_1: украина украинский это военный который киев двигатель
topic_2: знаменитый кремль заключать зеландия австралийский петров нежели
topic_3: газ европа обращение место рейтинг польша германия
topic_4: компания это цена который мочь новый поверхность
topic_5: год это который мочь свой время весь
topic_6: пенсия расследование доказательство ск экипаж дональд лодка
topic_7: доля греция вдвое сибирский круглый прибытие альтернатива
topic_8: ракета налог египет саммит египетский сооружение концерн
topic_9: научный наука университет километр бизнесмен вуз девочка
topic_10: испания тур франция испанский конечность найтись кричать
topic_11: напомнить эксперимент активность доклад городской памятник центр
topic_12: россия это российский который nn год дело
topic_13: инвестиция доллар больной клиент определение писать источник
topic_14: год который это рубль также человек свой
topic_15: расчёт наука академия констатировать

In [16]:
#text = news['title'].iloc[0]

def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [17]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.075994,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707036
2,4897,0.0,0.0,0.0,0.0,0.194646,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.343613,0.0,0.0,0.128129,0.066274
3,4898,0.0,0.0,0.0,0.0,0.131817,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071067,0.674557
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.328531,0.0,0.0,0.044689,0.0,0.0,0.0


In [18]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

###### Тут начигаются изменения. Создал три функции, для каждой статистики отдельно, 3 разбиения, 3 обучения надеюсь правилно понял задание.

In [28]:
user_articles_list = users['articles'].iloc[33]

def get_user_embedding_mean(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.mean(user_vector, 0)
    return user_vector
def get_user_embedding_median(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.median(user_vector, 0)
    return user_vector
def get_user_embedding_max(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.max(user_vector, 0)
    return user_vector

In [29]:
user_embeddings_mean = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_mean(x), 1)])
user_embeddings_mean.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings_mean['uid'] = users['uid'].values
user_embeddings_mean = user_embeddings_mean[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings_mean.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.068371,0.080484,0.022342,0.0,0.02641,0.061192,0.0,0.0,0.0,...,0.012423,0.002277,0.012245,0.02166,0.0,0.005464,0.110022,0.109276,0.069269,0.024725
1,u108690,0.045424,0.105771,0.0,0.003747,0.035931,0.141497,0.0,0.0,0.00217,...,0.0,0.0,0.01808,0.074365,0.0,0.029842,0.007076,0.022939,0.05444,0.078903
2,u108339,0.092573,0.051526,0.0,0.010085,0.008397,0.029295,0.02543,0.0,0.0,...,0.0,0.005898,0.002012,0.055356,0.0,0.062962,0.04207,0.095001,0.079046,0.036721


In [30]:
user_embeddings_median = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_median(x), 1)])
user_embeddings_median.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings_median['uid'] = users['uid'].values
user_embeddings_median = user_embeddings_median[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings_median.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.028701,0.0,0.0,0.0,0.014917,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.07678,0.088124,0.020625,0.0
1,u108690,0.009726,0.086725,0.0,0.0,0.02305,0.151173,0.0,0.0,0.0,...,0.0,0.0,0.0,0.010187,0.0,0.0,0.0,0.018805,0.051599,0.10036
2,u108339,0.089256,0.025614,0.0,0.0,0.0,0.01642,0.017501,0.0,0.0,...,0.0,0.0,0.0,0.025095,0.0,0.028529,0.006893,0.040607,0.026103,0.035725


In [31]:
user_embeddings_max = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_max(x), 1)])
user_embeddings_max.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings_max['uid'] = users['uid'].values
user_embeddings_max = user_embeddings_max[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings_max.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.187133,0.261494,0.076565,0.0,0.068533,0.324931,0.0,0.0,0.0,...,0.062928,0.013665,0.046436,0.129959,0.0,0.032785,0.304922,0.260247,0.236642,0.148351
1,u108690,0.221916,0.246342,0.0,0.022481,0.106638,0.291779,0.0,0.0,0.013019,...,0.0,0.0,0.076182,0.28954,0.0,0.111927,0.042458,0.062137,0.165089,0.139004
2,u108339,0.199869,0.167037,0.0,0.041027,0.050382,0.087904,0.087542,0.0,0.0,...,0.0,0.021485,0.012071,0.153797,0.0,0.212958,0.123956,0.422238,0.327962,0.077761


In [32]:
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [34]:
X_mean = pd.merge(user_embeddings_mean, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.187133,0.261494,0.076565,0.0,0.068533,0.324931,0.0,0.0,0.0,...,0.013665,0.046436,0.129959,0.0,0.032785,0.304922,0.260247,0.236642,0.148351,0
1,u108690,0.221916,0.246342,0.0,0.022481,0.106638,0.291779,0.0,0.0,0.013019,...,0.0,0.076182,0.28954,0.0,0.111927,0.042458,0.062137,0.165089,0.139004,1
2,u108339,0.199869,0.167037,0.0,0.041027,0.050382,0.087904,0.087542,0.0,0.0,...,0.021485,0.012071,0.153797,0.0,0.212958,0.123956,0.422238,0.327962,0.077761,1


In [35]:
X_median = pd.merge(user_embeddings_median, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.187133,0.261494,0.076565,0.0,0.068533,0.324931,0.0,0.0,0.0,...,0.013665,0.046436,0.129959,0.0,0.032785,0.304922,0.260247,0.236642,0.148351,0
1,u108690,0.221916,0.246342,0.0,0.022481,0.106638,0.291779,0.0,0.0,0.013019,...,0.0,0.076182,0.28954,0.0,0.111927,0.042458,0.062137,0.165089,0.139004,1
2,u108339,0.199869,0.167037,0.0,0.041027,0.050382,0.087904,0.087542,0.0,0.0,...,0.021485,0.012071,0.153797,0.0,0.212958,0.123956,0.422238,0.327962,0.077761,1


In [36]:
X_max = pd.merge(user_embeddings_max, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.187133,0.261494,0.076565,0.0,0.068533,0.324931,0.0,0.0,0.0,...,0.013665,0.046436,0.129959,0.0,0.032785,0.304922,0.260247,0.236642,0.148351,0
1,u108690,0.221916,0.246342,0.0,0.022481,0.106638,0.291779,0.0,0.0,0.013019,...,0.0,0.076182,0.28954,0.0,0.111927,0.042458,0.062137,0.165089,0.139004,1
2,u108339,0.199869,0.167037,0.0,0.041027,0.050382,0.087904,0.087542,0.0,0.0,...,0.021485,0.012071,0.153797,0.0,0.212958,0.123956,0.422238,0.327962,0.077761,1


In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

In [39]:
X_train_mean, X_test_mean, y_train_mean, y_test_mean = train_test_split(X_mean[['topic_{}'.format(i) for i in range(25)]], 
                                                    X_mean['churn'], random_state=0)
X_train_median, X_test_median, y_train_median, y_test_median = train_test_split(X_median[['topic_{}'.format(i) for i in range(25)]], 
                                                    X_median['churn'], random_state=0)
X_train_max, X_test_max, y_train_max, y_test_max = train_test_split(X_max[['topic_{}'.format(i) for i in range(25)]], 
                                                    X_max['churn'], random_state=0)

In [40]:
logreg = LogisticRegression()
logreg.fit(X_train_mean, y_train_mean)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
preds = logreg.predict_proba(X_test_mean)[:, 1]

array([0.12527998, 0.0148598 , 0.65146773, 0.33356286, 0.01053838,
       0.05444139, 0.14278265, 0.02092984, 0.0592599 , 0.12348518])

In [44]:
precision, recall, thresholds = precision_recall_curve(y_test_mean, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc_score(y_test_mean, preds)

Best Threshold=0.294113, F-Score=0.752, Precision=0.756, Recall=0.747


0.9679144136286993

In [45]:
logreg.fit(X_train_median, y_train_median)
preds = logreg.predict_proba(X_test_median)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test_median, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc_score(y_test_median, preds)

Best Threshold=0.252771, F-Score=0.797, Precision=0.733, Recall=0.873


0.9795639281353568

In [48]:
logreg.fit(X_train_max, y_train_max)
preds = logreg.predict_proba(X_test_max)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test_max, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc_score(y_test_max, preds)

Best Threshold=0.381125, F-Score=0.770, Precision=0.768, Recall=0.771


0.968802837374266