### Задания
1. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)
2. Повторить п.2, но используя уже не медиану, а max
3. (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.
4. Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score
5. Сделать самостоятельные выводы и предположения о том, почему тот или ной способ оказался эффективнее остальных

In [1]:
import numpy as np
import pandas as pd
import re
from ast import literal_eval

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

from nltk.corpus import stopwords
from razdel import tokenize
import pymorphy2

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve

In [2]:
news = pd.read_csv('./materials.csv')
news.head(3)

Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
users = pd.read_csv('./users_articles.csv')
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [4]:
morph = pymorphy2.MorphAnalyzer()

stopword_ru = stopwords.words('russian')

with open('./stopwords.txt', encoding='UTF-8') as f:
    additional_stopwords = [w.strip() for w in f.readlines()]
    
stopword_ru += additional_stopwords

In [5]:
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
        
    text = text.lower()
    test = text.strip('\n').strip('\r').strip('\t')
    text = re.sub('\s+', ' ', re.sub('[\d\W\^s-]+|\t|\n|\s|\r|n', ' ', text))
    
    return text

In [6]:
cache = {}

def lemmatization(text):
    if not isinstance(text, str):
        text = str(text)
        
    words = [w.text for w in list(tokenize(text))]
    words_lem = []
    
    for w in words:
        if w[0] == '-':
            w = w[1:]
        
        if len(w) > 1:
            if w in cache:
                words_lem.append(cache[w])
            else:
                temp_cache = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cache)
                
    lemms_wo_stopwords = [w for w in words_lem if w not in stopword_ru]
    
    return lemms_wo_stopwords

In [7]:
%%time
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: total: 11.6 s
Wall time: 11.9 s


In [8]:
%%time
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: total: 3min 17s
Wall time: 3min 22s


In [9]:
texts = [text for text in news['title']]

common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [10]:
lda = LdaModel(common_corpus, num_topics=22, id2word=common_dictionary)

In [11]:
x = lda.show_topics(num_topics=22, num_words=7, formatted=False)
topic_words = [(topic[0], [word[0] for word in topic[1]]) for topic in x]

for topic, words in topic_words:
    print(f'Topic {topic}: {" ".join(words)}')

Topic 0: сша американский северный свидетельствовать украина россия санкция
Topic 1: террорист виза боевик лесной римский взорваться уничтожить
Topic 2: научный университет сша станция улица житель новый
Topic 3: новый всё рынок рост система уровень американский
Topic 4: статья км данные установить тысяча час рубль
Topic 5: млрд рубль цена млн стоимость банк доход
Topic 6: болезнь девочка di смерть завод фан куба
Topic 7: мозг армия генерал форум операция территория ступень
Topic 8: пенсия убийство наука мужчина район женщина испытание
Topic 9: украина украинский российский санкция россия поток европа
Topic 10: россия проект глава министр фонд развитие путин
Topic 11: россия москва гражданин ребёнок территория российский женщина
Topic 12: китай планета остров японский китайский иск космос
Topic 13: снижение млн составить показатель тыс климат годовой
Topic 14: население миссия эксперт журнал погибнуть эффективность вирус
Topic 15: день хороший исследование всё первый мужчина большой
To

In [12]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))
    
    vector = []
    for i in range(25):
        if i not in not_null_topics:
            vector.append(0)
        else:
            vector.append(not_null_topics[i])
            
    return np.array(vector)

In [13]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title']])
topic_matrix.columns = [f'topic_{i}' for i in range(25)]
topic_matrix.insert(0, 'doc_id', news['doc_id'].values)
topic_matrix.head()

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042864,0.0,...,0.134104,0.011605,0.766913,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.477775,0.0,0.0,0.0,0.4194,0.0,0.0,0.0,0.0,0.0
2,4897,0.0,0.0,0.089531,0.0,0.0,0.0,0.0,0.0,0.0,...,0.620792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4898,0.0,0.0,0.0,0.533877,0.0,0.0,0.0,0.0,0.0,...,0.436695,0.0,0.0,0.0,0.0,0.01829,0.0,0.0,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.101799,0.0,0.871472,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[[f'topic_{i}' for i in range(25)]].values))

### Модели по среднему, медиане и максимуму

In [15]:
result = pd.DataFrame({}, columns=['best_threshold', 'f-score', 'precision', 'recall', 'roc-auc'])

def get_user_embedding(user_articles_list, func):
    user_articles_list = literal_eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = func(user_vector, 0)
    
    return user_vector

funcs = [np.mean, np.median, np.max]

for func in funcs:

    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, func), 1)])
    user_embeddings.columns = [f'topic_{i}' for i in range(25)]
    user_embeddings.insert(0, 'uid', users['uid'].values)

    target = pd.read_csv("users_churn.csv")
    final = pd.merge(user_embeddings, target, 'left')

    X = final[final.columns.difference(['churn', 'uid'])]
    y = final['churn']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    logr = LogisticRegression()
    logr.fit(X_train, y_train)

    preds = logr.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)

    ix = np.argmax(fscore)

    metrics = list(map(lambda x: round(x, 2), 
                       [thresholds[ix], fscore[ix], precision[ix], recall[ix], roc_auc_score(y_test, preds)]))
    
    result.loc[func.__name__] = metrics
    
result

Unnamed: 0,best_threshold,f-score,precision,recall,roc-auc
mean,0.26,0.7,0.63,0.78,0.95
median,0.33,0.8,0.84,0.76,0.98
amax,0.36,0.79,0.77,0.8,0.97
