In [1]:
import pandas as pd

Загрузим сами новости, список пользователей и прочитанных ими статей

In [2]:
news = pd.read_csv("articles.csv")
users = pd.read_csv("users_articles.csv")

### 1. Получаем векторные представления новостей

Импортимруем библиотеки

In [3]:
from gensim.corpora.dictionary import Dictionary
import gensim.downloader as api
import re
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/maxim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Введем файл со словами, которые не нужно учитывать

In [4]:
stopword_ru = stopwords.words('russian')
morph = pymorphy2.MorphAnalyzer()

len(stopword_ru)

151

In [5]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

Функция для очистки текста

In [6]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())

    return text

Функция для лемматизации текста

In [7]:
cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

Запустим очистку текста

In [8]:
%%time
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


CPU times: user 57.5 s, sys: 1.7 s, total: 59.2 s
Wall time: 1min


Запустим лемматизацию текста

In [9]:
%%time
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 5min 45s, sys: 1.08 s, total: 5min 46s
Wall time: 5min 51s


Обучим модель

In [10]:
# Сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Создадим из этих списков словарь наших слов и вектор (корпус).
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [11]:
%%time
from gensim.models import LdaModel, TfidfModel

# Натренируем модель на векторе.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary, passes=10)

CPU times: user 23min 23s, sys: 13min 15s, total: 36min 39s
Wall time: 13min 33s


Напишем функцию для получения вектора слова

In [12]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

Составим с помощью этой функции матрицу вероятностей тем по словам

In [13]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.0,0.031671,0.0,0.0,0.736908,...,0.029984,0.0,0.0115,0.020459,0.0,0.0,0.0,0.0,0.075003,0.0
1,4896,0.0,0.0,0.107559,0.0,0.0,0.365533,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033491,0.276519,0.0,0.0,0.0
2,4897,0.0,0.0,0.103843,0.189797,0.0,0.155056,0.082157,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063383,0.276263,0.044162
3,4898,0.0,0.0,0.024096,0.386202,0.0,0.070426,0.0,0.0,0.091724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105935,0.177827
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.08754,0.0,0.524436,...,0.198515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083725,0.0


### 2. Следующий шаг - векторные представления пользователей

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools

import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

%matplotlib inline

In [15]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

Реализуем три функции. Работу они будут выполнять одинаковую, но в качестве меры мы возьмем среднее, медиану и максимальное значение, чтобы потом их сравнить.

In [16]:
def get_user_embedding_mean(user_articles_list):
    user_articles_list_ = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list_])
    user_vector = np.mean(user_vector, 0)
    return user_vector

In [17]:
def get_user_embedding_median(user_articles_list):
    user_articles_list_ = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list_])
    user_vector = np.median(user_vector, 0)
    return user_vector

In [18]:
def get_user_embedding_max(user_articles_list):
    user_articles_list_ = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list_])
    user_vector = np.max(user_vector, 0)
    return user_vector

Сормируем датасет, чтобы продолжить с ним работать

In [19]:
def get_data_for_model(user_vector_type):
    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: user_vector_type(x), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]

    target = pd.read_csv("users_churn.csv")

    X = pd.merge(user_embeddings, target, 'left')
    return X

Подготовим данные для модели, разобьем их на тестовую и обучающую выборки.

In [20]:
def model_processing(X):
    
    #разделим данные на train/test
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)
    logreg = LogisticRegression()
    #обучим 
    logreg.fit(X_train, y_train)
    
    #наши прогнозы для тестовой выборки
    preds = logreg.predict_proba(X_test)[:, 1]
    
    return y_test, preds

Посчитаем три раза для тех разных функций

In [21]:
X_mean = get_data_for_model(get_user_embedding_mean)
X_median = get_data_for_model(get_user_embedding_median)
X_max = get_data_for_model(get_user_embedding_max)

In [22]:
model_mean_train, model_mean_test = model_processing(X_mean)
model_median_train, model_median_test = model_processing(X_median)
model_max_train, model_max_test = model_processing(X_max)

### Рассчитаем Precision, Recall, F_score, Roc_Auc

In [23]:
def calc_results(y_test, preds):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    roc_auc = roc_auc_score(y_test, preds)
    
    return {'F-score': fscore[ix], 'Precision': precision[ix], 'Recall': recall[ix], 'Roc-Auc Score': roc_auc}

Получим 4 метрики для 3 разных способов и выведем результаты в таблицу

In [24]:
res_dict_mean = calc_results(model_mean_train, model_mean_test)
res_dict_median = calc_results(model_median_train, model_median_test)
res_dict_max = calc_results(model_max_train, model_max_test)

In [25]:
table = pd.DataFrame({
    'Mean': pd.Series(res_dict_mean),
    'Median': pd.Series(res_dict_median),
    'Max': pd.Series(res_dict_max),
})

table

Unnamed: 0,Mean,Median,Max
F-score,0.807611,0.830579,0.786427
Precision,0.837719,0.841004,0.769531
Recall,0.779592,0.820408,0.804082
Roc-Auc Score,0.977692,0.979057,0.977187


На основе этих результатов можно сделать вывод, что лучше всего отработала и дала лучшие показатели функция, которая использовала в своих вычислениях медианное значение при вычислении векторного представления пользователя. Вероятно это объясняется тем, что медиана более чувствительна к выбросам и нулевым значениям.