#### Lesson 02. Machine Learning in Business ( https://gb.ru/lessons/242057/homework)

In [1]:
import pandas as pd

In [2]:
# Read news data
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
# Load users and latest read news

users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


### News vector presentation

In [4]:
#from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [5]:
# Text preprocessing

import re
import numpy as np
import nltk

from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

from razdel import tokenize # https://github.com/natasha/razdel
#!pip install razdel

import pymorphy2  # pip install pymorphy2

In [6]:
# Load gabarge words

nltk.download('stopwords')

stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
stopword_ru = []
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

625

In [8]:
def clean_text(text):
    '''
    Clean text
    which returns the cleaned text
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    Lemmatization
        [0] If non `str` type was found convert to `str`
        [1] sentence tokenization with 'razdel'
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    Output contains lemmatizated words
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [9]:
%%time
# Launch words cleaning
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


Wall time: 22.7 s


In [10]:
%%time
# Text lemmatization launch

news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 3min 39s


In [11]:
# Assemble list of texts including splitting by spaces
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [12]:
%%time
# Start training...

from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Wall time: 39.2 s


In [13]:
from gensim.test.utils import datapath
# Save model to disk.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

In [14]:
# Create a new corpus, made of previously unseen documents.
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'мы', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'что', 'первый', 'же', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'мы', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мы', 'мочь', 'играть', 'ещё', 'хороший', 'но', 'другой', 'сторона', 'пять', 'очко', 'на', 'выезд', 'из', 'девять', 'это', 'хороший', 'чем', 'ничего']


[(0, 0.09328105),
 (3, 0.3341678),
 (11, 0.24385972),
 (17, 0.02307429),
 (18, 0.059572432),
 (20, 0.04139147),
 (23, 0.06458258),
 (24, 0.12787297)]

In [15]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Only Words 
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: на что быть год по не это
topic_1: лётчик ирак арабский багаж звёздный прохоров мохаммед
topic_2: смерть сердце шувалов оскар николаев роберт рой
topic_3: погибнуть быть на физика хороший вирус год
topic_4: станция метро ким искусство вокзал сергеев чен
topic_5: статья станция ст армения употребление трансляция правовой
topic_6: ракета сша китай китайский американский офицер сенатор
topic_7: остров японский япония саммит бомба аналог таиланд
topic_8: дальневосточный прибывать приморский информировать упразднить край удвоить
topic_9: выражение террорист израиль гражданство приложение боевик подверженный
topic_10: млн год продукция на стоимость составить торговый
topic_11: мы не это на что но быть
topic_12: суд рубль иск на судья билет курение
topic_13: гражданин украинский что обращение на задержать быть
topic_14: автомобиль образоваться район горизонт мэр водитель индия
topic_15: путин президент владимир взрыв россия турция турецкий
topic_16: аэропорт приземлиться переставать 

In [16]:
# Function will return a vector-based presentation of the news
# text = news['title'].iloc[0]

def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [17]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]

# Now we can look on news vectors
topic_matrix.head(3)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.064276,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.119844,0.0,0.0,0.0,0.197344,0.239656
1,4896,0.463107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.244873,0.0,0.0,0.0,0.0,0.219181,0.0,0.0,0.0,0.0
2,4897,0.09326,0.0,0.0,0.334177,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.023072,0.059572,0.0,0.041391,0.0,0.0,0.064584,0.127884


In [18]:
# Next, move to vector-based presentations of users

users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [19]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [20]:
doc_dict[293622]

array([0.31881657, 0.        , 0.        , 0.17431371, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.05133547, 0.        , 0.        ,
       0.06777807, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.17703494, 0.06324191, 0.        , 0.13835278])

In [21]:
user_articles_list = users['articles'].iloc[33]

def get_user_embedding(user_articles_list, metric):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    if metric == 0:
        user_vector = np.mean(user_vector, 0)
    elif metric == 1:
        user_vector = np.median(user_vector, 0)
    else:
        user_vector = np.max(user_vector, 0)
    return user_vector

In [22]:
# Returns a vector of the interest  ratio to the topic number by a list number
# get_user_embedding(user_articles_list)


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt

results = [ ['METRIC', 'ROC_AUC', 'F_SCORE', 'PREC', 'RECALL'], 
            ['Mean  ', 0.0, 0.0, 0.0, 0.0], 
            ['Median', 0.0, 0.0, 0.0, 0.0],
            ['Max   ', 0.0, 0.0, 0.0, 0.0]
          ]

for metric_i in range(3):
    
    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, metric_i), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
    
    # Load labeled data
    target = pd.read_csv("users_churn.csv")
    X = pd.merge(user_embeddings, target, 'left')
    
    %matplotlib inline
    
    # Split data on train and test
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                        X['churn'], random_state=0)    
    
    logreg = LogisticRegression()

    # Start training
    logreg.fit(X_train, y_train)    
    
    # Expectations for a test dataset
    preds = logreg.predict_proba(X_test)[:, 1]
    preds[:10]    
    
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)

    # locate the index of the largest f score
    ix = np.argmax(fscore)
    
    roc = roc_auc_score(y_test, preds)
    
    results[metric_i+1][1] = roc
    results[metric_i+1][2] = precision[ix]
    results[metric_i+1][3] = recall[ix]
    results[metric_i+1][4] = fscore[ix]

    # print('Metric:%s, Best Threshold=%f, ROC=%.3f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (

    print('Metric: {} , Best Threshold={:f}, ROC={:.3f}, F-Score={:.3f}, Precision={:.3f}, Recall={:.3f}'.format(
                                                                   results[metric_i+1][0], 
                                                                   roc,
                                                                   thresholds[ix], 
                                                                   fscore[ix],
                                                                   precision[ix],
                                                                   recall[ix]))


Metric: Mean   , Best Threshold=0.941957, ROC=0.320, F-Score=0.663, Precision=0.660, Recall=0.665
Metric: Median , Best Threshold=0.910104, ROC=0.248, F-Score=0.585, Precision=0.489, Recall=0.727
Metric: Max    , Best Threshold=0.955325, ROC=0.340, F-Score=0.726, Precision=0.722, Recall=0.731
