In [8]:
import pandas as pd

In [9]:
news = pd.read_csv("data/articles.csv")

In [10]:
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [7]:
users = pd.read_csv("data/users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [12]:
import re
import numpy as np
from gensim.corpora.dictionary import Dictionary
from razdel import tokenize
import pymorphy2

In [13]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grayni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
stopword_ru = stopwords.words('russian')
print(len(stopword_ru))

151


In [16]:
with open('data/stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]

stopword_ru += additional_stopwords
len(stopword_ru)

776

In [18]:
def clean_text(text):
    '''
    очистка текста

    на выходе очищеный текст
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    text = re.sub('n', ' ', text)

    return text

cache = {}
morph = pymorphy2.MorphAnalyzer()

def lemmatization(text):
    # [0]
    if not isinstance(text, str):
        text = str(text)

    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w) > 1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)

    words_lem_without_stopwords = [i for i in words_lem if not i in stopword_ru] # [6]

    return words_lem_without_stopwords

In [19]:
from tqdm import tqdm
tqdm.pandas()

# Запускаем очистку текста. Будет долго...
news['title'] = news['title'].progress_apply(lambda x: clean_text(x))

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
100%|██████████| 27000/27000 [00:23<00:00, 1132.30it/s]


In [20]:
news['title'].iloc[:10]

0    заместитель председателя правительства рф серг...
1    матч  финала кубка россии по футболу был приос...
2    форвард авангарда томаш заборский прокомментир...
3    главный тренер кубани юрий красножан прокоммен...
4    решением попечительского совета владивостокско...
5    ио главного тренера вячеслав буцаев прокоммент...
6    запорожский металлург дома потерпел разгромное...
7    сборная сша одержала победу над австрией со сч...
8    бывший защитник сборной россии дарюс каспарайт...
9    полузащитник цска зоран тошич после победы над...
Name: title, dtype: object

In [21]:
news['title'] = news['title'].progress_apply(lambda x: lemmatization(x))

100%|██████████| 27000/27000 [02:31<00:00, 178.42it/s]


In [22]:
texts = list(news['title'].values)

common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [23]:
N_topic = 20

from gensim.models import LdaModel

lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary, passes=2)  # можно было менять

In [28]:
from gensim.test.utils import datapath

temp_file = datapath("model.lda")
lda.save(temp_file)

In [29]:
lda = LdaModel.load(temp_file)

In [30]:
other_texts = list(news['title'].iloc[:3])
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc]

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(4, 0.4294308), (5, 0.021762574), (7, 0.33011806), (19, 0.20070541)]

In [31]:
x = lda.show_topics(num_topics=N_topic, num_words=7, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

for topic, words in topics_words:
    print(f"topic_{topic}: " + " ".join(words))

topic_0: произойти тело пострадать который район данные человек
topic_1: ребёнок гражданин мозг смерть пенсия семья родитель
topic_2: сша американский это земля торговый россия воздух
topic_3: исследование год день россиянин составить пациент рост
topic_4: это который мочь год всё человек весь
topic_5: космос лаборатория вирус фрагмент атмосферный студия сражение
topic_6: год рубль млрд компания россия это который
topic_7: поверхность журнал девочка команда игра грунт тур
topic_8: россия это страна который российский украина власть
topic_9: банк научный выяснить год русский век эксперимент
topic_10: год который стать это тыс первый также
topic_11: дело который человек год обнаружить сотрудник это
topic_12: суд египет иск решение долг судья подать
topic_13: город станция армия житель спрос около строительство
topic_14: статья взрыв убийство писать следователь управление ск
topic_15: рейс перевод бомба опасаться ступень консультация экран
topic_16: женщина мужчина исследование знаменитый

In [32]:
def get_lda_vector(lda, text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]

    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(N_topic):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [33]:
get_lda_vector(lda, news['title'].iloc[0])

array([0.23875836, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.14595802, 0.09634901, 0.        , 0.02650272,
       0.0132063 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.37228525, 0.10071662])

In [34]:
topic_matrix = pd.DataFrame([get_lda_vector(lda, text) for text in news['title'].values])
topic_matrix.columns = [f'topic_{i}' for i in range(N_topic)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+[f'topic_{i}' for i in range(N_topic)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6,0.238765,0.0,0.0,0.0,0.0,0.0,0.145912,0.096348,0.0,...,0.013206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372321,0.100719
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379407,0.318615,...,0.0,0.0,0.036919,0.0,0.245045,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.429359,0.021762,0.0,0.330131,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200766
3,4898,0.0,0.0,0.0,0.0,0.61023,0.0,0.0,0.16883,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065357,0.145922
4,4899,0.30137,0.0,0.0,0.0,0.0,0.0,0.0,0.111599,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.563396,0.0


In [35]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [36]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[[f'topic_{i}' for i in range(N_topic)]].values))

In [37]:
doc_dict[293672]

array([0.        , 0.        , 0.        , 0.18820895, 0.06731514,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.18692432, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.06439443, 0.        , 0.47647601, 0.        ])

In [38]:
def get_user_embedding(user_articles_list, doc_dict, func):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = func(user_vector, axis=0)
    return user_vector

In [39]:
user_articles_list = users['articles'].iloc[33]

get_user_embedding(user_articles_list, doc_dict, np.mean)

array([0.        , 0.01429783, 0.05381401, 0.01493393, 0.14904411,
       0.        , 0.10404253, 0.        , 0.19246655, 0.01335961,
       0.04440395, 0.1678235 , 0.        , 0.00827993, 0.01391575,
       0.        , 0.00508542, 0.        , 0.11934482, 0.08842133])

In [40]:
FUNC = np.mean
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict, FUNC))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.04177,0.103931,0.018125,0.054616,0.099142,0.0,0.013547,0.0064,0.050318,...,0.138004,0.143737,0.026443,0.034292,0.014694,0.0,0.022689,0.011786,0.194848,0.005331
1,u108690,0.045494,0.023491,0.047143,0.037903,0.19059,0.0,0.065007,0.004667,0.272725,...,0.035827,0.152271,0.004519,0.0,0.016855,0.0,0.004742,0.0,0.066346,0.014194
2,u108339,0.091291,0.011154,0.039881,0.051669,0.091094,0.0,0.057818,0.0,0.167718,...,0.046295,0.228094,0.0,0.038629,0.032466,0.0,0.0,0.0,0.042397,0.069383


In [41]:
target = pd.read_csv("data/users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [42]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.04177,0.103931,0.018125,0.054616,0.099142,0.0,0.013547,0.0064,0.050318,...,0.143737,0.026443,0.034292,0.014694,0.0,0.022689,0.011786,0.194848,0.005331,0
1,u108690,0.045494,0.023491,0.047143,0.037903,0.19059,0.0,0.065007,0.004667,0.272725,...,0.152271,0.004519,0.0,0.016855,0.0,0.004742,0.0,0.066346,0.014194,1
2,u108339,0.091291,0.011154,0.039881,0.051669,0.091094,0.0,0.057818,0.0,0.167718,...,0.228094,0.0,0.038629,0.032466,0.0,0.0,0.0,0.042397,0.069383,1


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]],
                                                    X['churn'], random_state=0)

In [45]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [46]:
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([0.09366667, 0.01407878, 0.49154493, 0.37425184, 0.02624316,
       0.0304149 , 0.23838174, 0.08095907, 0.02342652, 0.1755155 ])

In [47]:
metrics_df = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC


In [48]:
from sklearn.metrics import (f1_score, roc_auc_score, precision_score,
                             classification_report, precision_recall_curve, confusion_matrix)

In [49]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.2751836838052241, F-Score=0.759, Precision=0.686, Recall=0.849


In [50]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9707494621780337

In [51]:
metrics_df = metrics_df.append({
    'model': FUNC.__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

  metrics_df = metrics_df.append({


Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.275184,0.759124,0.686469,0.84898,0.970749


In [52]:
FUNC = np.median

In [53]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict, FUNC))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.0,0.017814,0.0,0.006068,0.033658,0.0,0.0,0.0,0.0,...,0.093462,0.166451,0.0,0.02047,0.0,0.0,0.0,0.0,0.186276,0.0
1,u108690,0.0,0.0,0.024844,0.0,0.160143,0.0,0.02809,0.0,0.226237,...,0.034982,0.135122,0.0,0.0,0.006781,0.0,0.0,0.0,0.037089,0.0084
2,u108339,0.102911,0.0,0.01146,0.013131,0.078116,0.0,0.015732,0.0,0.150656,...,0.035342,0.24405,0.0,0.0444,0.035503,0.0,0.0,0.0,0.011616,0.070471


In [54]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.0,0.017814,0.0,0.006068,0.033658,0.0,0.0,0.0,0.0,...,0.166451,0.0,0.02047,0.0,0.0,0.0,0.0,0.186276,0.0,0
1,u108690,0.0,0.0,0.024844,0.0,0.160143,0.0,0.02809,0.0,0.226237,...,0.135122,0.0,0.0,0.006781,0.0,0.0,0.0,0.037089,0.0084,1
2,u108339,0.102911,0.0,0.01146,0.013131,0.078116,0.0,0.015732,0.0,0.150656,...,0.24405,0.0,0.0444,0.035503,0.0,0.0,0.0,0.011616,0.070471,1


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]],
                                                    X['churn'], random_state=0)

In [57]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [58]:
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([0.05699217, 0.01525824, 0.52866363, 0.42795534, 0.03089585,
       0.02084406, 0.20874097, 0.10475765, 0.04423489, 0.18824316])

In [59]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.28001893182304743, F-Score=0.760, Precision=0.690, Recall=0.845


In [60]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9698912727484156

In [61]:
metrics_df = metrics_df.append({
    'model': FUNC.__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

  metrics_df = metrics_df.append({


Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.275184,0.759124,0.686469,0.84898,0.970749
1,median,0.280019,0.759633,0.69,0.844898,0.969891


In [62]:
FUNC = np.max

In [63]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict, FUNC))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.159162,0.4393,0.108749,0.188209,0.404691,0.0,0.081281,0.038397,0.183592,...,0.382358,0.314741,0.097943,0.09952,0.088165,0.0,0.071738,0.070717,0.476476,0.031987
1,u108690,0.212141,0.108386,0.138698,0.156313,0.396425,0.0,0.194795,0.027999,0.601681,...,0.058026,0.416795,0.027114,0.0,0.044091,0.0,0.015492,0.0,0.163125,0.037346
2,u108339,0.129406,0.045321,0.136584,0.206527,0.152946,0.0,0.233837,0.0,0.429764,...,0.101319,0.311925,0.0,0.084851,0.067657,0.0,0.0,0.0,0.205979,0.140149


In [64]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.159162,0.4393,0.108749,0.188209,0.404691,0.0,0.081281,0.038397,0.183592,...,0.314741,0.097943,0.09952,0.088165,0.0,0.071738,0.070717,0.476476,0.031987,0
1,u108690,0.212141,0.108386,0.138698,0.156313,0.396425,0.0,0.194795,0.027999,0.601681,...,0.416795,0.027114,0.0,0.044091,0.0,0.015492,0.0,0.163125,0.037346,1
2,u108339,0.129406,0.045321,0.136584,0.206527,0.152946,0.0,0.233837,0.0,0.429764,...,0.311925,0.0,0.084851,0.067657,0.0,0.0,0.0,0.205979,0.140149,1


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]],
                                                    X['churn'], random_state=0)

In [67]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [68]:
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([0.05290277, 0.00244221, 0.83658498, 0.20743705, 0.00798451,
       0.00365163, 0.02978382, 0.00487938, 0.00963826, 0.27515678])

In [69]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.38705257993208303, F-Score=0.750, Precision=0.780, Recall=0.722


In [70]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9688470259898831

In [71]:
metrics_df = metrics_df.append({
    'model': FUNC.__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

  metrics_df = metrics_df.append({


Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.275184,0.759124,0.686469,0.84898,0.970749
1,median,0.280019,0.759633,0.69,0.844898,0.969891
2,amax,0.387053,0.75,0.779736,0.722449,0.968847


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
users['articles_str'] = users['articles'].apply(lambda x: x.replace('[','').replace(']', '').replace(',', ''))

users['articles_str'].iloc[0]

'293672 293328 293001 293622 293126 1852'

In [74]:
tfidf = TfidfVectorizer()
tfidf.fit(users['articles_str'])

In [75]:
idf = pd.DataFrame({'article_id': tfidf.get_feature_names_out(),
                    'idf': tfidf.idf_})

idf

Unnamed: 0,article_id,idf
0,10,8.888710
1,100,7.907880
2,1000,8.041412
3,1001,8.888710
4,1002,8.888710
...,...,...
14776,995,8.377884
14777,996,8.195562
14778,997,8.601027
14779,998,9.294175


In [77]:
def get_user_embedding_idf(user_articles_list, doc_dict):
    user_articles_list = eval(user_articles_list)

    user_vector = np.zeros((len(user_articles_list), N_topic))
    for i, doc_id in enumerate(user_articles_list):
        try:
            weight = idf[idf['article_id'] == str(doc_id)]['idf'].values[0]
        except Exception as e:
            weight = 0
        user_vector[i] = doc_dict[doc_id] * weight

    user_vector = np.median(user_vector, axis=0)
    return user_vector

In [78]:
from tqdm import tqdm
tqdm.pandas()

user_embeddings = pd.DataFrame([i for i in users['articles'].progress_apply(lambda x: get_user_embedding_idf(x, doc_dict))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

100%|██████████| 8000/8000 [00:47<00:00, 167.17it/s]


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.0,0.153219,0.0,0.052194,0.28949,0.0,0.0,0.0,0.0,...,0.803871,1.45958,0.0,0.190252,0.0,0.0,0.0,0.0,1.676322,0.0
1,u108690,0.0,0.0,0.225908,0.0,1.332825,0.0,0.245006,0.0,1.946164,...,0.306013,1.201056,0.0,0.0,0.055576,0.0,0.0,0.0,0.303967,0.074662
2,u108339,0.82659,0.0,0.096009,0.112939,0.629317,0.0,0.131799,0.0,1.197672,...,0.296091,2.103434,0.0,0.361193,0.299243,0.0,0.0,0.0,0.0952,0.62304


In [79]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.0,0.153219,0.0,0.052194,0.28949,0.0,0.0,0.0,0.0,...,1.45958,0.0,0.190252,0.0,0.0,0.0,0.0,1.676322,0.0,0
1,u108690,0.0,0.0,0.225908,0.0,1.332825,0.0,0.245006,0.0,1.946164,...,1.201056,0.0,0.0,0.055576,0.0,0.0,0.0,0.303967,0.074662,1
2,u108339,0.82659,0.0,0.096009,0.112939,0.629317,0.0,0.131799,0.0,1.197672,...,2.103434,0.0,0.361193,0.299243,0.0,0.0,0.0,0.0952,0.62304,1


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]],
                                                    X['churn'], random_state=0)

In [82]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [83]:
preds = model.predict_proba(X_test)[:, 1]
preds[:10]

array([1.14320246e-02, 1.28832784e-04, 9.63364573e-01, 1.56180150e-01,
       3.22632822e-04, 2.31946895e-04, 1.78272113e-01, 8.80492753e-04,
       1.81616140e-03, 1.32141922e-01])

In [84]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.508834867536842, F-Score=0.886, Precision=0.899, Recall=0.873


In [85]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9901482644339787

In [86]:
metrics_df = metrics_df.append({
    'model': 'idf_median',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

  metrics_df = metrics_df.append({


In [87]:
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,mean,0.275184,0.759124,0.686469,0.84898,0.970749
1,median,0.280019,0.759633,0.69,0.844898,0.969891
2,amax,0.387053,0.75,0.779736,0.722449,0.968847
3,idf_median,0.508835,0.886128,0.89916,0.873469,0.990148
