# Подробнее про логику работы см. в файле webinar2_text_data_LDA_binary_classific.ipynb

In [None]:
!pip install razdel pymorphy2 pyLDavis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import ast
import itertools
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pymorphy2
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
from razdel import tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive')

%matplotlib inline
pd.set_option("display.max_rows", 6)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Base Russian stop-word list from the NLTK corpus downloaded above.
stopword_ru = stopwords.words('russian')

# pymorphy2 analyzer instance; lemmatization() below calls morph.parse() on every uncached word.
morph = pymorphy2.MorphAnalyzer() 

In [None]:
# Extend the NLTK list with project-specific stop words stored one per line.
# NOTE(review): the file is assumed to be UTF-8 encoded - confirm.
with open('/content/drive/MyDrive/ГБ/выборки для исследований/text_less_2/stopwords.txt', encoding='utf-8') as f3:
    # strip() removes the trailing newline; filtering on the stripped value keeps
    # blank lines from ending up in the list as empty-string "stop words".
    additional_stopwords = [w.strip() for w in f3 if w.strip()]

# Rebuild from the NLTK base instead of `+=` so re-running this cell does not append duplicates.
stopword_ru = stopwords.words('russian') + additional_stopwords
len(stopword_ru)

776

In [None]:
def clean_text(text):
    '''
    Normalize a raw article string before tokenization.

    Steps: cast to str, lowercase, trim leading/trailing line breaks and tabs,
    delete CRLF sequences, delete digits and punctuation, turn the remaining
    line-break markers and whitespace characters into single spaces, and finally
    replace every Latin "n" with a space.

    Parameters
    ----------
    text : any
        Raw text; non-string values are converted with str().

    Returns
    -------
    str
        Cleaned text. Runs of whitespace are NOT collapsed: every whitespace
        character maps to exactly one space.
    '''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')

    # Raw strings keep the regex escapes (\s, \|) intact without relying on Python
    # passing unknown string escapes through, which newer interpreters warn about.
    text = re.sub(r"-\s\r\n\|-\s\r\n|\r\n", '', text)

    # Digits and punctuation, merged into one character class (the previous chain of
    # alternatives matched the same characters but triggered a nested-set warning).
    text = re.sub(r"[0-9\-—.,:;_%©«»?*!@#№$^•·&()+=\[\]/]", '', text)

    # Real line breaks as well as the literal two-character sequences "\s" and "\n".
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)

    # Soft hyphen or any single whitespace character -> one space
    # ("+" needs no handling here: it was already removed above).
    text = re.sub(r'[\xad\s]', ' ', text.strip())

    # NOTE(review): the sample article shown in this notebook contains a bare Latin "n"
    # where line breaks appear to have been, so it is treated as a separator here.
    # This also splits any Latin word containing "n" - confirm that is acceptable.
    text = re.sub('n', ' ', text)

    return text

# Memo of already analysed word forms: surface form -> normal form.
cache = {}

def lemmatization(text):
    '''
    Tokenize a text with razdel and lemmatize it with pymorphy2.

    Steps:
        [0] cast non-str input to str
        [1] tokenize with razdel
        [2] drop one leading '-' from a token
        [3] skip tokens of length <= 1
        [4] reuse the lemma from `cache` when the word was seen before
        [5] otherwise take the normal form of the first pymorphy2 parse and cache it
        [6] drop lemmas listed in `stopword_ru`

    Returns
    -------
    list of str
        Lemmas in their original order, without stop words.
    '''
    # [0]
    if not isinstance(text, str):
        text = str(text)

    # A set gives O(1) membership tests instead of scanning the whole stop-word list
    # for every lemma. It is built per call so later edits to stopword_ru are honoured.
    stopword_set = set(stopword_ru)

    words_lem = []
    for token in tokenize(text):  # [1]
        w = token.text
        if w.startswith('-'):  # [2]
            w = w[1:]
        if len(w) <= 1:  # [3]
            continue

        lemma = cache.get(w)  # [4]
        if lemma is None:  # [5]
            lemma = cache[w] = morph.parse(w)[0].normal_form

        if lemma not in stopword_set:  # [6]
            words_lem.append(lemma)

    return words_lem


# Metric accumulator: one row per evaluated model.
results = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])

def compute_result(model, preds_proba, results, y_true=None):
    """Append the best-F1 operating point of a model to the results table.

    Parameters
    ----------
    model : str
        Label stored in the 'model' column.
    preds_proba : array-like
        Predicted positive-class scores for the evaluation samples.
    results : pandas.DataFrame
        Table to extend; it is not modified in place.
    y_true : array-like, optional
        True labels. Defaults to the notebook-level `y_test`.

    Returns
    -------
    pandas.DataFrame
        `results` plus one row with the threshold that maximizes the F-score, the
        F-score / precision / recall at that threshold, and the ROC AUC.
    """
    if y_true is None:
        y_true = y_test

    precision, recall, thresholds = precision_recall_curve(y_true, preds_proba)

    # Where precision + recall == 0 the F-score is defined as 0 here; the plain division
    # yields NaN there, and np.argmax would then pick the NaN entry.
    denominator = precision + recall
    fscore = np.divide(2 * precision * recall, denominator,
                       out=np.zeros_like(denominator, dtype=float),
                       where=denominator > 0)
    auc = roc_auc_score(y_true, preds_proba)

    # precision/recall carry one trailing point that has no threshold; excluding it
    # keeps `ix` a valid index into `thresholds`.
    ix = int(np.argmax(fscore[:-1]))

    new_row = pd.DataFrame([{
        'model': model,
        'thresh': thresholds[ix],
        'F-Score': fscore[ix],
        'Precision': precision[ix],
        'Recall': recall[ix],
        'ROC AUC': auc,
    }])

    # DataFrame.append no longer exists in pandas 2.x; concat is the replacement.
    # An empty accumulator is skipped to avoid concatenating an all-empty frame.
    if results.empty:
        return new_row
    return pd.concat([results, new_row], ignore_index=True)

__Воспользовавшись полученными знаниями, попробуйте взвесить новости по tfidf__ 

Подсказка 1 - нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал.<br>
Подсказка 2 - нужен именно idf, как вес.


__Решение:__

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Articles table; pandas opens and closes the file itself when given a path.
data = pd.read_csv('/content/drive/MyDrive/ГБ/выборки для исследований/text_less_2/materials.csv')

In [None]:
data['title'][1]

'Матч 1/16 финала Кубка России по футболу был приостановлен судьей из-за взрыва пиротехнических снарядов, передает корреспондент «Газеты.Ru». Болельщики выбросили на поле петарды. Судья увел команды с поля в подтрибунное помещение. Динамовцы ушли, а торпедовцы остались у кромки поля. Сообщается, что матч остановлен на пять минут.n«Газета.Ru» ведетnонлайн-трансляциюnэтого матча.'

Запишем новый столбец в датафрейм со статьями, где будут статьи из очищенных и лемматизированных слов

In [None]:
%%time
data['clean_title'] = data['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 37.1 s, sys: 1.15 s, total: 38.3 s
Wall time: 44.1 s


In [None]:
%%time
data['clean_title'] = data['clean_title'].apply(lambda x: lemmatization(x), 1)
data.head(5)

CPU times: user 5min 21s, sys: 1.23 s, total: 5min 22s
Wall time: 5min 25s


Unnamed: 0,doc_id,title,clean_title
0,6,Заместитель председателяnправительства РФnСерг...,"[заместитель, председатель, правительство, рф,..."
1,4896,Матч 1/16 финала Кубка России по футболу был п...,"[матч, финал, кубок, россия, футбол, приостано..."
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...,"[форвард, авангард, томаш, заборский, прокомме..."
3,4898,Главный тренер «Кубани» Юрий Красножан прокомм...,"[главный, тренер, кубань, юрий, красножанин, п..."
4,4899,Решением попечительского совета владивостокско...,"[решение, попечительский, совет, владивостокск..."


In [None]:
# One list of lemmas per article (array of lists taken straight from the column).
texts = data['clean_title'].values

# Vocabulary that assigns an integer id to every distinct lemma (id -> word).
common_dictionary = Dictionary(texts)

# Bag-of-words form of every article: a list of (word id, count in this article) pairs.
common_corpus = list(map(common_dictionary.doc2bow, texts))

In [None]:
common_corpus[0][:5]

[(0, 2), (1, 1), (2, 2), (3, 1), (4, 1)]

Если нужно посмотреть что за слова в кортежах, то:

In [None]:
# corpus_by_words = [[(common_dictionary[id], num) for id, num in cp] for cp in common_corpus]

# corpus_by_words[0][:5]
# _____________________
# [('александр', 2), ('алексей', 1), ('андрей', 2), ('армеец', 1), ('банк', 1)]

Формируем матрицу __topic_matrix__, где каждая статья разбивается на вектор относимости к каждой из N тем

In [None]:
# Number of latent topics; also the length of every article and user vector below.
n_topic = 20

# LDA training is stochastic: a fixed random_state makes the topic vectors, and every
# metric derived from them, reproducible across kernel restarts.
lda = LdaModel(common_corpus, num_topics=n_topic, id2word=common_dictionary, random_state=0)

def get_lda_vector(text):
    """Return the dense topic vector of one tokenized document.

    `text` is a list of tokens. The LDA model reports only some topics as
    (topic id, weight) pairs; topics absent from its answer get weight 0, so the
    result always has `n_topic` entries.
    """
    bow = common_dictionary.doc2bow(text)
    topic_weights = dict(lda[bow])
    return np.array([topic_weights.get(topic_id, 0) for topic_id in range(n_topic)])

In [None]:
# One row per article: doc_id followed by its n_topic LDA weights.
topic_columns = [f'topic_{i}' for i in range(n_topic)]
lda_vectors = list(map(get_lda_vector, data['clean_title'].values))

topic_matrix = pd.DataFrame(lda_vectors)
topic_matrix.columns = topic_columns
topic_matrix.insert(0, 'doc_id', data['doc_id'].values)
topic_matrix.head(5)


Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,6,0.0,0.038294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.070097,0.0,0.0,0.442257,0.0,0.0,0.027363,0.415411,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.928738,0.0,0.0,0.0,0.0,0.0,0.0,0.049311,0.0,0.0
2,4897,0.0,0.551953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.066741,0.0,0.045129,0.0,0.27939,0.0,0.0,0.038037
3,4898,0.0,0.402021,0.0,0.0,0.0,0.0,0.0,0.0,0.26046,...,0.221027,0.0,0.0,0.012308,0.0,0.0,0.0,0.095038,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.039759,0.0,0.0,...,0.0,0.0,0.058575,0.0,0.0,0.0,0.0,0.878693,0.0,0.0


In [None]:
# Lookup table: article id -> array of that article's n_topic topic weights.
topic_columns = [f'topic_{i}' for i in range(n_topic)]
doc_dict = {
    doc_id: topic_vector
    for doc_id, topic_vector in zip(topic_matrix['doc_id'].values, topic_matrix[topic_columns].values)
}

In [None]:
doc_dict[6]

array([0.        , 0.03829413, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.07009654, 0.        , 0.        , 0.44225684, 0.        ,
       0.        , 0.02736264, 0.41541097, 0.        , 0.        ])

In [None]:
# Users table: uid plus the list of article ids the user read, stored as a string.
users = pd.read_csv('/content/drive/MyDrive/ГБ/выборки для исследований/text_less_2/users_articles.csv')
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [None]:
# Turn "[1, 2, 3]" into "1 2 3": one space-separated string of article ids per user,
# which is the document format TfidfVectorizer is fitted on below.
strip_list_syntax = str.maketrans('', '', '[],')
users['articles_str'] = users['articles'].apply(lambda x: x.translate(strip_list_syntax))

users['articles_str'].iloc[0]

'293672 293328 293001 293622 293126 1852'

In [None]:
# Fit TF-IDF on the users' reading lists to get one IDF weight per article:
# the fewer users read an article, the higher its weight.
# The default token_pattern only keeps tokens of two or more characters, which would drop
# single-digit article ids; this pattern keeps one-character tokens as well.
tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
tfidf.fit(users['articles_str'])

TfidfVectorizer()

In [None]:
# Table of per-article IDF weights: 'article_id' holds the vectorizer's tokens
# (article ids as strings), 'idf' the matching learned weight.
idf = pd.DataFrame({'article_id': tfidf.get_feature_names_out(),
                    'idf': tfidf.idf_
                    })  

idf.head(3)

Unnamed: 0,article_id,idf
0,10,8.88871
1,100,7.90788
2,1000,8.041412


In [None]:
# Build a user's interest vector over the n_topic topics, weighting articles by IDF.
def get_user_embedding_idf(user_articles_list, doc_dict, idf_weights=None):
    """Return the per-topic median of a user's IDF-weighted article topic vectors.

    Parameters
    ----------
    user_articles_list : str
        Python list literal with the ids of the articles the user read, e.g. "[1, 2]".
    doc_dict : dict
        Article id -> topic vector of length `n_topic`.
    idf_weights : dict, optional
        Article id (as str) -> IDF weight. When omitted it is built from the
        notebook-level `idf` frame on every call; pass a precomputed dict when
        calling this for many users.

    Returns
    -------
    numpy.ndarray
        Vector of length `n_topic`; all zeros when the user has no articles.

    Raises
    ------
    KeyError
        If an article id is missing from `doc_dict`.
    """
    if idf_weights is None:
        idf_weights = dict(zip(idf['article_id'], idf['idf']))

    # literal_eval parses the list literal without executing arbitrary code,
    # unlike eval() on text that comes from a CSV file.
    article_ids = ast.literal_eval(user_articles_list)
    if not article_ids:
        # the median over zero rows would be NaN
        return np.zeros(n_topic)

    user_vector = np.zeros((len(article_ids), n_topic))
    for i, doc_id in enumerate(article_ids):
        # articles unknown to the vectorizer get weight 0, as before
        weight = idf_weights.get(str(doc_id), 0)
        # key step: scale the article's topic vector by the article's IDF
        user_vector[i] = doc_dict[doc_id] * weight

    return np.median(user_vector, axis=0)

In [None]:
from tqdm import tqdm
tqdm.pandas()

# One IDF-weighted topic vector per user (doc_dict: article id -> n_topic topic weights).
topic_columns = [f'topic_{i}' for i in range(n_topic)]
user_vectors = users['articles'].progress_apply(lambda articles: get_user_embedding_idf(articles, doc_dict))

user_embeddings = pd.DataFrame(user_vectors.tolist())
user_embeddings.columns = topic_columns
user_embeddings.insert(0, 'uid', users['uid'].values)
user_embeddings.head(3)

100%|██████████| 8000/8000 [01:26<00:00, 92.84it/s]


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,u105138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.501096,...,0.746407,0.161888,0.176102,0.126441,0.0,0.235114,0.429828,0.767875,0.0,0.0
1,u108690,0.0,0.815013,0.0,0.0,0.447596,0.520377,0.0,0.262421,0.542683,...,0.118372,0.628672,0.0,0.197892,0.0,1.054598,0.0,1.479064,0.0,0.045263
2,u108339,0.0,0.131769,0.0,0.065406,0.520817,1.865968,0.0,0.729921,0.354193,...,0.284193,0.450689,0.0,0.0,0.0,0.398335,0.0,1.123744,0.0,0.0


In [None]:
# Churn labels: one row per user with columns uid and churn.
with open('/content/drive/MyDrive/ГБ/выборки для исследований/text_less_2/users_churn.csv') as f4:
    target = pd.read_csv(f4)

# Rows of the target are not in the same order as the users in X, so the two
# tables are merged instead of being aligned by position.
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [None]:
# Left join on the shared column(s): keeps every user embedding and attaches the churn label.
X = user_embeddings.merge(target, how='left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,churn
0,u105138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.501096,...,0.161888,0.176102,0.126441,0.0,0.235114,0.429828,0.767875,0.0,0.0,0
1,u108690,0.0,0.815013,0.0,0.0,0.447596,0.520377,0.0,0.262421,0.542683,...,0.628672,0.0,0.197892,0.0,1.054598,0.0,1.479064,0.0,0.045263,1
2,u108339,0.0,0.131769,0.0,0.065406,0.520817,1.865968,0.0,0.729921,0.354193,...,0.450689,0.0,0.0,0.0,0.398335,0.0,1.123744,0.0,0.0,1


In [None]:
# Features are the topic columns, the label is churn; the fixed seed keeps the split stable.
feature_columns = [f'topic_{i}' for i in range(n_topic)]
X_train, X_test, y_train, y_test = train_test_split(X[feature_columns], X['churn'], random_state=0)

In [None]:
# Baseline classifier trained on the IDF-weighted user embeddings.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Column 1 of predict_proba is the score of the positive class.
preds = logreg.predict_proba(X_test)[:, 1]

results = compute_result('idf', preds, results)

results

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,idf,0.292932,0.857143,0.823308,0.893878,0.987316


# Еще операции с tf_idf

In [None]:
# Join each article's lemma list back into one space-separated string.
# A single vectorized assignment replaces the row-by-row chained indexing
# (data[col][i] = ...), which triggers SettingWithCopyWarning and depends on the
# index being 0..n-1.
data['clean_text_title'] = data['clean_title'].apply(' '.join)

data.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,doc_id,title,clean_title,clean_text_title
0,6,Заместитель председателяnправительства РФnСерг...,"[заместитель, председатель, правительство, рф,...",заместитель председатель правительство рф серг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...,"[матч, финал, кубок, россия, футбол, приостано...",матч финал кубок россия футбол приостановить с...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...,"[форвард, авангард, томаш, заборский, прокомме...",форвард авангард томаш заборский прокомментиро...


In [None]:
# min_df sets the minimum number of documents a token must appear in to be kept.
# NOTE(review): the texts were already filtered against stopword_ru during lemmatization,
# so stop_words here looks redundant - confirm.
vectorizer = TfidfVectorizer(stop_words=stopword_ru, min_df=15, norm=None)
vectorizer.fit(data['clean_text_title'])

  % sorted(inconsistent)


TfidfVectorizer(min_df=15, norm=None,
                stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                            'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                            'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                            'по', 'только', 'ее', 'мне', ...])

In [None]:
# IDF weight of every vocabulary word, indexed by the word itself.
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names_out(), columns=['idf_weights'])

# Ascending sort: lowest IDF weights first, highest last.
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
россия,1.791328
всё,2.012574
российский,2.039019
...,...
богот,8.431040
ромен,8.431040
адмиралтейский,8.431040


In [None]:
# номер слова
# vectorizer.vocabulary_



In [None]:
# %%time

# lst = []
# for i in range(len(data.title)):
#   lst.append(' '.join(lemmatization(data['title'][i])))

# data['clean_title'] = lst

# data.head(5)



5) Сформировать на выходе единую таблицу, сравнивающую качество 4 разных методов получения эмбеддингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

__Решение:__

In [None]:
results

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,idf,0.292932,0.857143,0.823308,0.893878,0.987316


Построить LDA модель без использования tfidf (перебрать mean, median, max в функции get_user_embedding)

Сформировать на выходе единую таблицу, сравнивающую качество 4 разных методов получения эмбеддингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score

In [None]:
def get_user_embedding(user_articles_list, mode):
    """Aggregate the topic vectors of a user's articles into one user vector.

    Parameters
    ----------
    user_articles_list : str
        Python list literal with the ids of the articles the user read.
    mode : callable
        Aggregation called as ``mode(matrix, 0)``, e.g. np.mean, np.median or np.max;
        it reduces the (articles x topics) matrix along the article axis.

    Returns
    -------
    numpy.ndarray
        One aggregated value per topic.

    Raises
    ------
    KeyError
        If an article id is missing from the notebook-level `doc_dict`.
    """
    # literal_eval parses the list literal without executing arbitrary code,
    # unlike eval() on text that comes from a CSV file.
    article_ids = ast.literal_eval(user_articles_list)
    article_vectors = np.array([doc_dict[doc_id] for doc_id in article_ids])
    return mode(article_vectors, 0)

# (label, aggregation) pairs to compare; the labels end up in the 'model' column of `results`.
mods = [['mean  ', np.mean], ['max   ', np.max], ['median', np.median]]

topic_columns = ['topic_{}'.format(i) for i in range(n_topic)]

for label, aggregate in mods:
    # Series.apply has no `axis` argument, so no trailing positional `1` is passed.
    user_vectors = users['articles'].apply(lambda x: get_user_embedding(x, aggregate))

    user_embeddings = pd.DataFrame(user_vectors.tolist())
    user_embeddings.columns = topic_columns
    user_embeddings.insert(0, 'uid', users['uid'].values)

    X = pd.merge(user_embeddings, target, how='left')

    # Same split settings as the idf model. compute_result reads the notebook-level
    # y_test, so it must be reassigned here on every iteration.
    X_train, X_test, y_train, y_test = train_test_split(X[topic_columns],
                                                        X['churn'], random_state=0)

    logreg = LogisticRegression().fit(X_train, y_train)
    preds = logreg.predict_proba(X_test)[:, 1]

    results = compute_result(label, preds, results)


results

Best Threshold=0.245576, roc_auc=0.963, F-Score=0.728, Precision=0.662, Recall=0.808


Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,idf,0.292932,0.857143,0.823308,0.893878,0.987316
1,mean,0.292614,0.646154,0.7,0.6,0.939369
2,max,0.374927,0.716667,0.731915,0.702041,0.958328
3,median,0.245576,0.727941,0.662207,0.808163,0.962972
