### Урок 2. Профилирование пользователей. Сегментация: unsupervised learning (clustering, LDA/ARTM), supervised (multi/binary classification)

In [1]:
import pandas as pd

Наши новости

In [2]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


Загрузим пользователей и списки последних прочитанных новостей

In [3]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


Итак, нам нужно получить векторные представления пользователей на основе прочитанным ими новостей и самих новостей

### 1. Получаем векторные представления новостей

In [4]:
!pip install gensim



In [5]:
#from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [6]:
!pip install razdel



In [7]:
!pip install pymorphy2



In [8]:
!pip install pymorphy2



In [12]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [9]:
#предобработка текстов
import re
import numpy as np
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

from razdel import tokenize # https://github.com/natasha/razdel
#!pip install razdel

import pymorphy2  # pip install pymorphy2

In [10]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [11]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

Убирем из текста латиницу

In [12]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    text = re.sub("[A-z]", '', text) # убираем из текста латиницу
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [13]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  from ipykernel import kernelapp as app


Wall time: 35 s


In [14]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 6min 15s


А теперь в 3 строчки обучим нашу модель

In [15]:
#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

Что такое common_dictionary и как он выглядит

In [16]:
common_dictionary[10]

'директор'

Все просто - это словарь наших слов

Запускаем обучение

In [17]:
%%time
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Wall time: 56.8 s


In [18]:
from gensim.test.utils import datapath
# Save model to disk.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

Обучили модель. Теперь 2 вопроса:

1. как выглядят наши темы
2. как получить для документа вектор значений (вероятности принадлежности каждой теме)

In [19]:
# Create a new corpus, made of previously unseen documents.
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атлантам', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(5, 0.47291338), (10, 0.06699602), (13, 0.3918452), (14, 0.048488002)]

In [20]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Only Words 
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: рекомендовать выдать депутат налоговый констатировать ндс подъём
topic_1: компания население фестиваль производитель иск эффективность добавлять
topic_2: год который это исследование человек мочь сша
topic_3: ракета снижение вероятно употребление настаивать сон расширение
topic_4: фонд полоса горизонт звёздный выручка корзина эндрю
topic_5: планета космос топливо катастрофа первый экипаж дональд
topic_6: годовой арабский приобретать вслед расширение тбилиси грузия
topic_7: рейс продолжительность аэропорт соцсеть орден вдвое сочи
topic_8: превысить фрагмент югра мак джонс оцепить запорожский
topic_9: восток напомнить это всё активность подчеркнуть разместить
topic_10: смерть температура взрыв рак который метод группа
topic_11: газ путин граница продукция экипаж египет испытание
topic_12: остаток осложнение тереза закрываться роджер квебек швейцарец
topic_13: это год который свой мочь весь всё
topic_14: вицепремьер белый снять задать направлять фунт сдача
topic_15: год тыс млн к

Очень неплохо - большинство тем вполне можно описать о чем они

Давайте напишем функцию, которая будет нам возвращать векторное представление новости

In [182]:
#text = news['title'].iloc[0]

def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [183]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074752,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.859912,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.0,0.472904,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.0,0.077573,0.015813,0.0,0.326101,0.0,0.0,0.0,...,0.0,0.0,0.0,0.014138,0.0,0.0,0.026507,0.0,0.0,0.0
4,4899,0.217223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.034129,0.0,0.0,0.0,0.0,0.601904,0.0


Прекрасно, мы получили вектора наших новостей! И даже умеем интерпретировать получившиеся темы.

Можно двигаться далее

### Следующий шаг - векторные представления пользователей

In [184]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [185]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [186]:
doc_dict[293622]

array([0.03591073, 0.04665267, 0.20065923, 0.01906193, 0.01770151,
       0.07549366, 0.        , 0.01628084, 0.        , 0.        ,
       0.        , 0.07192532, 0.        , 0.50563312, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [187]:
user_articles_list = users['articles'].iloc[33]

def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.median(user_vector, 0)
    return user_vector

In [188]:
get_user_embedding(user_articles_list)

array([0.        , 0.        , 0.08594252, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.26163615, 0.        ,
       0.01771302, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.23652121, 0.        ])

Интересовался новостями с топиками topic_3, topic_14 (что-то про политику и государство)

In [189]:
#users['articles'].iloc[33]

In [190]:
#" ".join(news[news['doc_id']==323186]['title'].iloc[0])

Теперь получим эмбединги для всех пользователей и проверим их качество на конкретной downstream-задаче

In [191]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.086359,0.009531,0.0,0.0,0.0,0.0,0.0,...,0.158143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068538,0.0
1,u108690,0.0,0.013366,0.051668,0.0,0.0,0.005683,0.0,0.0,0.0,...,0.058213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185099,0.0
2,u108339,0.0,0.0,0.107092,0.0,0.0,0.029699,0.0,0.0,0.0,...,0.115907,0.078343,0.010194,0.0,0.0,0.0,0.0,0.022561,0.284186,0.0


Датасет готов - можно попробовать обучить модель. Загрузим нашу разметку

In [192]:
target = pd.read_csv("users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [193]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.0,0.0,0.086359,0.009531,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068538,0.0,0
1,u108690,0.0,0.013366,0.051668,0.0,0.0,0.005683,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185099,0.0,1
2,u108339,0.0,0.0,0.107092,0.0,0.0,0.029699,0.0,0.0,0.0,...,0.078343,0.010194,0.0,0.0,0.0,0.0,0.022561,0.284186,0.0,1


In [194]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [195]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [196]:
logreg = LogisticRegression()
#обучим наш пайплайн
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [197]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.11407111, 0.03882586, 0.50401099, 0.10187303, 0.02114451,
       0.01324575, 0.10428638, 0.03273921, 0.03802632, 0.0899001 ])

In [198]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

### Рассчитаем Precision, Recall, F_score

In [199]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
thresholds = thresholds[ix]
fscore = fscore[ix],
precision = precision[ix],
recall = recall[ix]
#print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        #fscore[ix],
                                                                        #precision[ix],
                                                                        #recall[ix]))

In [200]:
ras = roc_auc_score(y_test, preds)
ras

0.9430734345020059

2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)

In [201]:
def get_user_embedding1(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.median(user_vector, 0)
    return user_vector

In [202]:
user_embeddings1 = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding1(x), 1)])
user_embeddings1.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings1['uid'] = users['uid'].values
user_embeddings1 = user_embeddings1[['uid']+['topic_{}'.format(i) for i in range(25)]]

In [203]:
X = pd.merge(user_embeddings1, target, 'left')

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [205]:
logreg = LogisticRegression()
#обучим наш пайплайн
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [206]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.11407111, 0.03882586, 0.50401099, 0.10187303, 0.02114451,
       0.01324575, 0.10428638, 0.03273921, 0.03802632, 0.0899001 ])

In [207]:
precision1, recall1, thresholds1 = precision_recall_curve(y_test, preds)
fscore1 = (2 * precision1 * recall1) / (precision1 + recall1)
# locate the index of the largest f score
ix = np.argmax(fscore1)
thresholds1 = thresholds1[ix]
fscore1 = fscore1[ix],
precision1 = precision1[ix],
recall1 = recall1[ix]


In [208]:
ras1 = roc_auc_score(y_test, preds)
ras1

0.9430734345020059


3. Повторить п.2, но используя уже не медиану, а max

In [209]:
def get_user_embedding2(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.max(user_vector, 0)
    return user_vector

In [210]:
user_embeddings2 = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding2(x), 1)])
user_embeddings2.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings2['uid'] = users['uid'].values
user_embeddings2 = user_embeddings2[['uid']+['topic_{}'.format(i) for i in range(25)]]

In [211]:
X = pd.merge(user_embeddings2, target, 'left')

In [212]:
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [213]:
logreg = LogisticRegression()
#обучим наш пайплайн
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [214]:
#наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.13614938, 0.01653182, 0.58940701, 0.09122468, 0.0269179 ,
       0.03563638, 0.23095348, 0.00158646, 0.02025753, 0.17842692])

In [215]:
precision2, recall2, thresholds2 = precision_recall_curve(y_test, preds)
fscore2 = (2 * precision2 * recall2) / (precision2 + recall2)
# locate the index of the largest f score
ix = np.argmax(fscore2)
thresholds2 = thresholds2[ix]
fscore2 = fscore2[ix],
precision2 = precision2[ix],
recall2 = recall2[ix]
#print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds2[ix], 
                                                                        #fscore2[ix],
                                                                        #precision2[ix],
                                                                        #recall2[ix]))

In [216]:
ras2 = roc_auc_score(y_test, preds)
ras2

0.9447363218791791

In [220]:
mean_ = [thresholds, precision, recall, fscore, ras]
median_ = [thresholds1, precision1, recall1 ,fscore1, ras1]
max_ = [thresholds2, precision2, recall2,fscore2, ras2]

In [223]:
metrics = pd.DataFrame({'mean': mean_,
                        'median': median_,
                        'max': max_}).T
metrics.columns=['thresholds', 'precision', 'recall','fscore', 'roc_auc_score']

In [224]:
metrics

Unnamed: 0,thresholds,precision,recall,fscore,roc_auc_score
mean,0.246034,"(0.5871559633027523,)",0.783673,"(0.6713286713286712,)",0.943073
median,0.246034,"(0.5871559633027523,)",0.783673,"(0.6713286713286712,)",0.943073
max,0.34122,"(0.7008547008547008,)",0.669388,"(0.6847599164926931,)",0.944736
