In [1]:
import pandas as pd
import numpy as np

# предобработка текстов
import re
import numpy as np
from gensim.corpora.dictionary import Dictionary
from razdel import tokenize  # сегментация русскоязычного текста на токены и предложения https://github.com/natasha/razdel
import pymorphy2  # Морфологический анализатор

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KristinaS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Russian stop-word list from NLTK; lemmatization() filters its lemmas against this list.
stopword_ru = stopwords.words('russian')
print(len(stopword_ru))

151


Ячейки с измененными данными (в соответствии с ДЗ) я отметила

In [3]:
# Load the articles table; its `title` column is what the cells below clean and lemmatize.
news = pd.read_csv("../ML_в_бизнесе/Lection2/articles.csv")
# NOTE(review): the path is relative to the author's working directory — adjust it for your checkout.
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [4]:
# Load the users table; judging by the preview, `articles` holds a stringified list of doc ids per uid.
users = pd.read_csv("../ML_в_бизнесе/Lection2/users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [5]:
def clean_text(text):
    '''
    Normalize a raw article text before tokenization.

    Steps, in order:
        1. cast non-str input to str and lower-case it;
        2. trim leading/trailing newline, carriage-return and tab characters;
        3. delete CRLF sequences (and the hyphenated line-break pattern);
        4. delete digits and punctuation;
        5. turn remaining line breaks, literal "\\s" / "\\n" sequences,
           soft hyphens and whitespace characters into single spaces;
        6. replace every Latin "n" with a space.

    Parameters
    ----------
    text : any
        Raw text; anything that is not a str is converted with str().

    Returns
    -------
    str
        Cleaned, lower-cased text. Runs of spaces are NOT collapsed.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    # Raw strings: the regex escapes (\s, \|) must reach `re` untouched instead of
    # being treated as (invalid) Python string escapes.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # Single character class with "-", "[" and "]" escaped; the former "[[]" spelling
    # made `re` emit "FutureWarning: Possible nested set".
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    # Soft hyphen or any whitespace character -> one space each ("+" is already gone).
    text = re.sub(r'[\xad\s]', ' ', text.strip())
    # NOTE(review): this blanks EVERY Latin "n". In the previewed data a stray "n" sits
    # where a line break used to be, which is presumably why it is here — but it also
    # splits Latin-script words. Confirm this is acceptable for the corpus.
    text = re.sub('n', ' ', text)

    return text

# Token -> lemma memo shared by all lemmatization() calls.
cache = {}
# Single morphological analyzer instance used by lemmatization().
morph = pymorphy2.MorphAnalyzer()

def lemmatization(text):
    '''
    Tokenize a text and return its lemmas with stop words removed.

        [0] cast non-str input to str
        [1] tokenize with razdel
        [2] drop a single leading '-' from a token
        [3] skip tokens of one character or less
        [4] reuse the lemma from the module-level `cache` when present
        [5] otherwise lemmatize with `morph` and store the result in `cache`
        [6] drop lemmas listed in `stopword_ru`

    Parameters
    ----------
    text : any
        Text to lemmatize; anything that is not a str is converted with str().

    Returns
    -------
    list of str
        Lemmas in their original order, stop words excluded.
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)

    # Set membership is O(1); the original scanned the stop-word list for every lemma.
    stop_words = set(stopword_ru)

    words_lem = []
    for token in tokenize(text):  # [1]
        w = token.text
        # startswith() is safe even for an empty token, unlike indexing w[0]
        if w.startswith('-'):  # [2]
            w = w[1:]
        if len(w) <= 1:  # [3]
            continue
        lemma = cache.get(w)  # [4]
        if lemma is None:  # [5]
            lemma = cache[w] = morph.parse(w)[0].normal_form
        if lemma not in stop_words:  # [6]
            words_lem.append(lemma)

    return words_lem

In [6]:
# Sanity check of the analyzer created next to lemmatization(); re-instantiating
# MorphAnalyzer here was redundant, so the existing `morph` instance is reused.
morph.parse('сбегали')[0].normal_form

'сбегать'

In [7]:
# Preview the cleaning on the first two articles before running it on the whole corpus.
news['title'].head(2).apply(clean_text)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


0    заместитель председателя правительства рф серг...
1    матч  финала кубка россии по футболу был приос...
Name: title, dtype: object

In [8]:
%%time
from tqdm import tqdm
tqdm.pandas()

# Запускаем очистку текста. Будет долго...
news['title'] = news['title'].progress_apply(lambda x: clean_text(x))

100%|███████████████████████████████████████████████████████████████████████████| 27000/27000 [00:56<00:00, 481.36it/s]


Wall time: 56.2 s


In [9]:
# Preview the lemmatization on the first two cleaned articles.
news['title'].head(2).apply(lemmatization)

0    [заместитель, председатель, правительство, рф,...
1    [матч, финал, кубок, россия, футбол, приостано...
Name: title, dtype: object

In [10]:
%%time
# Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].progress_apply(lambda x: lemmatization(x))

100%|████████████████████████████████████████████████████████████████████████████| 27000/27000 [05:25<00:00, 83.04it/s]


Wall time: 5min 25s


In [11]:
# One document = the list of lemmas produced by the lemmatization cell
texts = news['title'].tolist()

# Build the token -> id vocabulary, then encode every document as a bag of words
common_dictionary = Dictionary(texts)
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [12]:
# Number of LDA topics; also the length of every vector returned by get_lda_vector().
N_topic = 15

In [13]:
%%time
from gensim.models import LdaModel

# Обучаем модель на корпусе
lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary)#, passes=10)

Wall time: 59.7 s


In [14]:
from gensim.test.utils import datapath

# Persist the trained model to disk.
# NOTE(review): datapath() comes from gensim.test.utils — presumably it resolves inside
# gensim's own test-data folder; confirm that is where the model should be stored.
temp_file = datapath("model.lda")
lda.save(temp_file)

In [15]:
# Reload the trained model from the file written in the previous cell
lda = LdaModel.load(temp_file)

In [16]:
# Score a few documents with the trained model.
# NOTE: these are the first three rows of `news`, i.e. documents the model was trained on,
# not genuinely unseen texts.
other_texts = list(news['title'].iloc[:3])
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

# Show the lemmas of the third document and the (topic_id, probability) pairs LDA assigns to it
unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc]

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(0, 0.46746066), (5, 0.18377885), (12, 0.3305429)]

In [17]:
x = lda.show_topics(num_topics=N_topic, num_words=7, formatted=False)
# Keep only the words of each topic, dropping their weights
topics_words = [
    (topic_id, [word for word, _ in word_weights])
    for topic_id, word_weights in x
]

# Print one line per topic: its id followed by its top words
for topic, words in topics_words:
    print("topic_{}: ".format(topic) + " ".join(words))

topic_0: год млн исследование компания тыс рынок составить
topic_1: брюссель заполнить компенсировать соль осипов ле южнокорейский
topic_2: китай китайский японский фестиваль япония открытие испытание
topic_3: статья гражданин это год банк район ракета
topic_4: ребёнок мужчина человек обнаружить женщина лечение страдать
topic_5: корабль экипаж восток море польша флот космос
topic_6: газ украина президент совет депутат россия государственный
topic_7: вицепремьер участок парламент торговый снять год следствие
topic_8: поверхность городской следствие билет грунт бежать орудие
topic_9: дональд иран белоруссия трамп белорусский израиль рот
topic_10: самолёт военный земля это год путин исследование
topic_11: рубль фонд компания пациент миссия исследование орган
topic_12: год который это человек свой время стать
topic_13: это который год россия российский мочь страна
topic_14: год это который млрд весь мочь новый


In [18]:
def get_lda_vector(lda, text):
    '''
    Build a dense topic vector of length N_topic for one tokenized document.

    The document is encoded with the module-level `common_dictionary`, scored
    by `lda`, and every topic the model did not return is filled with 0.

    Parameters
    ----------
    lda : trained topic model supporting `lda[bow]`
    text : list of str
        Tokens (lemmas) of the document.

    Returns
    -------
    numpy.ndarray
        Topic probabilities indexed by topic id.
    '''
    bow = common_dictionary.doc2bow(text)
    # (topic_id, probability) pairs -> lookup table of the topics that were returned
    topic_probs = dict(lda[bow])
    return np.array([topic_probs.get(topic_id, 0) for topic_id in range(N_topic)])


# Smoke test: topic vector of the first article (zeros mark topics LDA did not return)
get_lda_vector(lda, news['title'].iloc[0])

array([0.        , 0.        , 0.06187318, 0.1502647 , 0.        ,
       0.        , 0.78036541, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [19]:
%%time
topic_matrix = pd.DataFrame([get_lda_vector(lda, text) for text in news['title'].values])
topic_matrix.columns = [f'topic_{i}' for i in range(N_topic)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+[f'topic_{i}' for i in range(N_topic)]]
topic_matrix.head(5)

Wall time: 59.6 s


Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
0,6,0.0,0.0,0.061871,0.150308,0.0,0.0,0.780324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.068376,0.0,0.213526,0.0,0.56132,0.138416,0.0,0.0,0.0
2,4897,0.467379,0.0,0.0,0.0,0.0,0.183845,0.0,0.0,0.0,0.0,0.0,0.0,0.330558,0.0,0.0
3,4898,0.150416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.401135,0.02401,0.402675,0.0,0.013665
4,4899,0.0,0.0,0.0,0.401941,0.0,0.0,0.470943,0.0,0.104237,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Lookup table doc_id -> topic vector, consumed by the user-embedding functions
doc_dict = {
    doc_id: topic_vector
    for doc_id, topic_vector in zip(
        topic_matrix['doc_id'].values,
        topic_matrix[[f'topic_{i}' for i in range(N_topic)]].values,
    )
}

### Задание 2

Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог)

In [21]:
def get_user_embedding(user_articles_list, doc_dict, agg_func=np.median):
    '''
    Aggregate the topic vectors of the articles a user has read into one vector.

    Parameters
    ----------
    user_articles_list : str
        String form of a list of doc ids, e.g. "[293672, 293328]".
    doc_dict : dict
        Mapping doc_id -> topic vector.
    agg_func : callable, optional
        Reduction applied along axis 0 of the stacked article vectors.
        Defaults to np.median; np.mean, np.max etc. work the same way.

    Returns
    -------
    numpy.ndarray
        One aggregated value per topic.

    Raises
    ------
    ValueError, SyntaxError
        If `user_articles_list` is not a plain Python literal.
    KeyError
        If a doc id is missing from `doc_dict`.
    '''
    import ast  # local import keeps this cell self-contained

    # literal_eval only parses literals, so a crafted CSV cell cannot run code (eval could)
    article_ids = ast.literal_eval(user_articles_list)
    doc_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return agg_func(doc_vectors, axis=0)

In [22]:
%%time
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Wall time: 1.96 s


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
0,u105138,0.0,0.0,0.029418,0.018126,0.065022,0.0,0.0,0.0,0.006081,0.0,0.0,0.0,0.299455,0.259485,0.0
1,u108690,0.0,0.0,0.0,0.030757,0.01294,0.0,0.07758,0.0,0.0,0.0,0.062704,0.037476,0.189925,0.393669,0.054852
2,u108339,0.0,0.0,0.015531,0.110373,0.04334,0.0,0.044701,0.035438,0.0,0.0,0.021436,0.030922,0.260389,0.209112,0.078351


In [23]:
# Load the churn labels (one `churn` flag per uid, per the preview below)
target = pd.read_csv("../ML_в_бизнесе/Lection2/users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [24]:
# Attach the churn label to every user embedding (left join on the shared columns)
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,churn
0,u105138,0.0,0.0,0.029418,0.018126,0.065022,0.0,0.0,0.0,0.006081,0.0,0.0,0.0,0.299455,0.259485,0.0,0
1,u108690,0.0,0.0,0.0,0.030757,0.01294,0.0,0.07758,0.0,0.0,0.0,0.062704,0.037476,0.189925,0.393669,0.054852,1
2,u108339,0.0,0.0,0.015531,0.110373,0.04334,0.0,0.044701,0.035438,0.0,0.0,0.021436,0.030922,0.260389,0.209112,0.078351,1


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [26]:
# разделим данные на train/test
# Split into train/test: topic columns are the features, `churn` is the label;
# random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

In [27]:
logreg = LogisticRegression()
# fit on the training split
logreg.fit(X_train, y_train)

LogisticRegression()

In [28]:
# наши прогнозы для тестовой выборки
# Predicted probability of the positive class (column 1) for the test split
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.12987169, 0.11688025, 0.45130857, 0.20136103, 0.03317757,
       0.0254017 , 0.07053128, 0.06238051, 0.03573122, 0.42298046])

In [29]:
from sklearn.metrics import (f1_score, roc_auc_score, precision_score,
                             classification_report, precision_recall_curve)

In [30]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F1 for every operating point. Where precision + recall == 0 the ratio is 0/0; define
# F1 = 0 there, otherwise the resulting NaN would be picked by argmax.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more entry than thresholds; the final point has no threshold,
# so it is excluded from the search to keep thresholds[ix] valid.
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.31459785073789154, F-Score=0.711, Precision=0.768, Recall=0.661


In [31]:
# ROC AUC of the model trained on median-aggregated embeddings
roc_auc_med = roc_auc_score(y_test, preds)

In [32]:
# Start the comparison table with the metrics of the median-based embeddings;
# the row for the max-based embeddings is added further down.
df = pd.DataFrame({'metric': 'median',
                  'threshold': [round(thresholds[ix], 3)],
                 'fscore': [round(fscore[ix], 3)],
                 'precision': [round(precision[ix], 3)],
                 'recall': [round(recall[ix], 3)],
                 'roc_auc': [round(roc_auc_med, 3)]})

df

Unnamed: 0,metric,threshold,fscore,precision,recall,roc_auc
0,median,0.315,0.711,0.768,0.661,0.953


### Задание 3

Повторить п.2, но используя уже не медиану, а max

In [33]:
def get_user_embedding_max(user_articles_list, doc_dict):
    '''
    Aggregate the topic vectors of a user's articles with an element-wise maximum.

    Parameters
    ----------
    user_articles_list : str
        String form of a list of doc ids, e.g. "[293672, 293328]".
    doc_dict : dict
        Mapping doc_id -> topic vector.

    Returns
    -------
    numpy.ndarray
        Per-topic maximum over the user's articles.

    Raises
    ------
    ValueError, SyntaxError
        If `user_articles_list` is not a plain Python literal.
    KeyError
        If a doc id is missing from `doc_dict`.
    '''
    import ast  # local import keeps this cell self-contained

    # literal_eval only parses literals, so a crafted CSV cell cannot run code (eval could)
    article_ids = ast.literal_eval(user_articles_list)
    doc_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return np.max(doc_vectors, axis=0)

In [34]:
%%time
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_max(x, doc_dict))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

Wall time: 689 ms


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
0,u105138,0.369865,0.0,0.153371,0.126128,0.09144,0.043066,0.322285,0.0,0.101754,0.043227,0.062312,0.098968,0.577381,0.664302,0.113818
1,u108690,0.054931,0.0,0.0,0.164977,0.133083,0.0,0.180789,0.013216,0.0,0.016101,0.178757,0.130983,0.364404,0.684502,0.161349
2,u108339,0.076923,0.0,0.076127,0.20152,0.126839,0.02605,0.102125,0.234904,0.0,0.0,0.111621,0.088745,0.543811,0.559729,0.112806


In [35]:
# Attach the churn label to every max-aggregated user embedding (left join on the shared columns)
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,churn
0,u105138,0.369865,0.0,0.153371,0.126128,0.09144,0.043066,0.322285,0.0,0.101754,0.043227,0.062312,0.098968,0.577381,0.664302,0.113818,0
1,u108690,0.054931,0.0,0.0,0.164977,0.133083,0.0,0.180789,0.013216,0.0,0.016101,0.178757,0.130983,0.364404,0.684502,0.161349,1
2,u108339,0.076923,0.0,0.076127,0.20152,0.126839,0.02605,0.102125,0.234904,0.0,0.0,0.111621,0.088745,0.543811,0.559729,0.112806,1


In [36]:
# разделим данные на train/test
# Split into train/test with the same random_state=0 as the median experiment
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

In [37]:
logreg = LogisticRegression()
# fit on the training split
logreg.fit(X_train, y_train)

# predicted probability of the positive class (column 1) for the test split
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([5.76974553e-02, 1.29091418e-04, 7.73617501e-01, 1.44805780e-01,
       8.03315307e-03, 8.66434785e-03, 1.17758976e-02, 5.48572595e-02,
       2.70248653e-02, 4.32027070e-01])

In [38]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
# F1 for every operating point. Where precision + recall == 0 the ratio is 0/0; define
# F1 = 0 there, otherwise the resulting NaN would be picked by argmax.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall have one more entry than thresholds; the final point has no threshold,
# so it is excluded from the search to keep thresholds[ix] valid.
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.30800359643644537, F-Score=0.745, Precision=0.703, Recall=0.792


In [39]:
# NOTE(review): despite the `_med` suffix, this now holds the ROC AUC of the model trained
# on max-aggregated embeddings; the next cell reads it under this name.
roc_auc_med = roc_auc_score(y_test, preds)

In [40]:
# DataFrame.append was removed in pandas 2.0: build a one-row frame and concatenate it.
max_row = pd.DataFrame({'metric': ['max'],
                        'threshold': [round(thresholds[ix], 3)],
                        'fscore': [round(fscore[ix], 3)],
                        'precision': [round(precision[ix], 3)],
                        'recall': [round(recall[ix], 3)],
                        'roc_auc': [round(roc_auc_med, 3)]})
# Dropping any earlier 'max' row first keeps the cell idempotent when it is re-run.
df = pd.concat([df[df['metric'] != 'max'], max_row], ignore_index=True)

df

Unnamed: 0,metric,threshold,fscore,precision,recall,roc_auc
0,median,0.315,0.711,0.768,0.661,0.953
1,max,0.308,0.745,0.703,0.792,0.96


### Задания 5, 6

Сформировать на выходе единую таблицу, сравнивающую качество 2/3 разных метода получения эмбедингов пользователей: median, max, idf_mean по метрикам roc_auc, precision, recall, f_score.
Сделать самостоятельные выводы и предположения о том, почему тот или иной способ оказался эффективнее остальных

ОТВЕТ: Вероятно, я что-то лишнее добавила или убрала в коде, так как особой разницы между max и median нет, но я бы предположила, что с median показатели должны были получиться лучше. Медиана находится в центре среди возможных вариантов, т.е. даёт более-менее объективную картину, в отличие от max, который может завышать вероятность или предполагаемый интерес пользователя к определённым темам.