## Data Scraping

In [None]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys 

# Search-results page that lists the articles to scrape.
url = "https://www.rbc.ru/search/?project=rbcnews&query=коронавирус"
# A string literal cannot span physical lines, so the user-agent value is
# assembled from adjacent literals inside parentheses.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.130 Safari/537.36'
    )
}
response = requests.get(url, headers=headers)

# How many times END is pressed, and how long to wait after each press.
N_SCROLLS = 3
SCROLL_PAUSE_SEC = 1

driver = webdriver.Chrome()
try:
    driver.get(url)
    for _ in range(N_SCROLLS):
        ActionChains(driver).send_keys(Keys.END).perform()
        # Presumably the page appends more results after scrolling; pause so
        # they can arrive before the next key press — TODO confirm the delay.
        time.sleep(SCROLL_PAUSE_SEC)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser even if loading or scrolling fails.
    driver.quit()

# Anchor tags of the individual search hits; their 'href' is read later.
coverpage_news = list(soup.find_all('a', attrs={'class': 'search-item__link'}))


news_contents = []
list_links = []


def _absolute_link(href):
    """Return ``href`` with exactly one leading ``https://``.

    A leading ``https://``, ``http://`` or ``//`` is removed as a prefix.
    ``str.strip("https://")`` must not be used for this: it treats its
    argument as a set of characters and also removes matching letters from
    the start of the host name and from the end of the URL.
    """
    for scheme in ("https://", "http://", "//"):
        if href.startswith(scheme):
            href = href[len(scheme):]
            break
    return "https://" + href


for anchor in coverpage_news:
    # Getting the link of the article
    link = _absolute_link(anchor['href'])
    list_links.append(link)

    # Reading the content (it is divided in paragraphs)
    article = requests.get(link, timeout=30)
    # Name the parser explicitly, matching the search-page parsing.
    soup_article = BeautifulSoup(article.content, 'html.parser')
    body = soup_article.find_all('div', class_='article__text article__text_free')
    # A page without the expected container yields an empty article instead of
    # an IndexError, which keeps news_contents aligned with list_links.
    paragraphs = body[0].find_all('p') if body else []

    # Unifying the paragraphs; the text is built once per article so an
    # article without paragraphs can never reuse the previous article's text.
    final_article = " ".join(p.get_text() for p in paragraphs)
    news_contents.append(final_article)

## Preprocessing

In [None]:
import pymystem3
import stop_words

# Load the raw articles; the first sheet column becomes the index.
# NOTE(review): this section reads 'ria_raw.xlsx' and writes 'ria_lemmatized.xlsx',
# while the modeling section reads 'rbk_lemmatized.xlsx' — confirm which source is intended.
df = pd.read_excel('ria_raw.xlsx', engine="openpyxl", index_col = 0)

# Lowercase the article text. This runs before the str() coercion (makestr)
# that is applied further down.
df['news_contents'] = df['news_contents'].str.lower()
def delete_punctuation(text):
    """Replace every non-alphabetic character of ``text`` with a space.

    Letters (as judged by ``str.isalpha``) are kept; digits, punctuation and
    whitespace each become a single ' ', so the length is unchanged.
    """
    return ''.join(ch if ch.isalpha() else ' ' for ch in text)

def delete_double(text):
    """Collapse runs of spaces in ``text`` to one space and trim both ends.

    Trimming always happens, including when the input contains no double
    space (previously such input was returned untouched, so ' a ' kept its
    surrounding spaces).

    Args:
        text: the string to normalise.

    Returns:
        ``text`` with no consecutive spaces and no leading/trailing whitespace.
    """
    # Splitting on a single space turns each extra space into an empty piece;
    # dropping those pieces collapses the run.
    collapsed = ' '.join(piece for piece in text.split(' ') if piece)
    return collapsed.strip()
def makestr(text):
    """Return ``text`` converted with the built-in ``str``."""
    return str(text)
    
# Cleaning pipeline: coerce to str, blank out non-letters, collapse spaces.
df['news_contents'] = (
    df['news_contents']
    .apply(makestr)
    .apply(delete_punctuation)
    .apply(delete_double)
)

# One Mystem instance, shared by every lemmatize() call.
mstem = pymystem3.Mystem()


def lemmatize(text):
    """Lemmatize ``text`` with the shared ``mstem`` analyzer.

    The pieces returned by ``mstem.lemmatize`` are concatenated without a
    separator and the result is stripped of surrounding whitespace.
    NOTE(review): joining on '' presumably relies on the returned pieces
    already containing the separators — confirm against the pymystem3 docs.
    """
    pieces = mstem.lemmatize(text)
    joined = ''.join(pieces)
    return joined.strip()


df['news_contents'] = df['news_contents'].apply(lemmatize)

# Russian and English stop-word lists, concatenated into one list that
# delete_stop_words() checks tokens against.
rus = stop_words.get_stop_words('russian')
en = stop_words.get_stop_words('english')
all_sw = rus + en
# Bare expression: its value is only displayed when it is the last statement of a cell.
len(all_sw)

def delete_stop_words(text, sw=None):
    """Remove stop words from a whitespace-separated string.

    Args:
        text: string whose tokens are separated by whitespace.
        sw: optional iterable of stop words. When omitted, the module-level
            ``all_sw`` list is used, which is the original behaviour.

    Returns:
        The tokens that are not stop words, joined by single spaces.
    """
    # A set makes each membership test O(1); scanning the list for every
    # token costs O(len(stop words)) per token.
    blocked = set(all_sw if sw is None else sw)
    return ' '.join(word for word in text.split() if word not in blocked)

# Drop the Russian and English stop words from every article.
df['news_contents'] = df['news_contents'].apply(delete_stop_words)

def delete_eng(text):
    """Keep only the tokens whose first character has a code point above 1039.

    Code point 1040 is U+0410 (Cyrillic capital A), so tokens starting with a
    Latin letter or a digit fall below the threshold and are dropped. The
    surviving tokens are joined by single spaces.
    """
    kept = [token for token in text.split() if ord(token[0]) > 1039]
    return ' '.join(kept)
    
# Remove tokens that do not start with a character above code point 1039
# (see delete_eng), then persist the cleaned frame.
df['news_contents'] = df['news_contents'].apply(delete_eng)
# NOTE(review): written as 'ria_lemmatized.xlsx', but the modeling section
# reads 'rbk_lemmatized.xlsx' — confirm the intended file name.
df.to_excel('ria_lemmatized.xlsx')

## Modeling

In [None]:
import gensim
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk

def _split_tokens(text):
    """Split one lemmatized article on single spaces.

    Values that are already token lists are returned unchanged, and any other
    non-string value (for example a missing cell) becomes an empty list
    instead of raising ``AttributeError``.
    """
    if isinstance(text, list):
        return text
    if isinstance(text, str):
        return text.split(' ')
    return []


df = pd.read_excel('rbk_lemmatized.xlsx', engine="openpyxl", index_col = 0)
np.random.seed(42)
nltk.download('wordnet')

# Show the tokens of the first article as a sanity check.
doc_sample = df['news_contents'].iloc[0]
print('original document: ')
words = _split_tokens(doc_sample)
print(words)

# Tokenize the whole column in one assignment. Writing through
# df['news_contents'].iloc[i] = ... is chained assignment: it can modify a
# temporary copy and leave df unchanged.
df['news_contents'] = df['news_contents'].apply(_split_tokens)

# One token list per article, in the original row order.
processed_docs = np.array(df['news_contents'])

Create a dictionary from `processed_docs` containing the number of times each word appears in the training set.

In [None]:
# Build the token dictionary from the tokenized articles.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Preview: print the first 11 pairs yielded by the dictionary, then stop.
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

Filter out tokens that appear in fewer than 15 documents (an absolute number) or in more than 50% of the documents (a fraction of the total corpus size). After that, keep only the 100,000 most frequent tokens.

In [None]:
# Prune the dictionary with the thresholds described above. The return value
# is not used and `dictionary` is consumed afterwards, so this presumably
# filters in place — re-running the cell would then filter it again.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

For each document we create a bag-of-words: a list of (word id, count) pairs reporting which dictionary words appear in the document and how many times each one appears.

In [None]:
# Convert every tokenized article into its bag-of-words representation.
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Inspect one document. The preferred index is 4310; it is clamped to the
# last document so a smaller corpus does not raise IndexError, and an empty
# corpus simply prints nothing.
sample_idx = min(4310, len(corpus) - 1)
doc_4310 = corpus[sample_idx] if corpus else []
for token_id, freq in doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                               dictionary[token_id],
                                               freq))


I use `LdaMulticore` to parallelize and speed up model training. Alpha and beta are hyperparameters that affect the sparsity of the topics. According to the Gensim docs, both default to a 1.0/num_topics prior (we’ll use the defaults for the base model). Passes is the analogue of epochs/iterations. Chunksize controls how many documents are processed at a time in the training algorithm; increasing chunksize will speed up training.


In [None]:

# Baseline model: 10 topics, 2 worker processes, 10 passes over the corpus,
# fixed random_state so the run is repeatable.
lda_model = gensim.models.LdaMulticore(
    corpus,
    num_topics=10,
    id2word=dictionary,
    workers=2,
    passes=10,
    random_state=100,
    chunksize=100,
)

# Print every topic (-1 requests all of them) with its word description.
for topic_id, topic_words in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))



Let’s calculate the baseline coherence score

In [None]:

from gensim.models import CoherenceModel

# Score the baseline model with the c_v coherence measure, using the
# tokenized articles as reference texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs,
                                     dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)




Let's tune the following hyperparameters:

Number of Topics (K)

Dirichlet hyperparameter alpha: Document-Topic Density

Dirichlet hyperparameter beta: Word-Topic Density

I'll use the c_v coherence score to compare the models.

In [None]:
def compute_coherence(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model and return its c_v coherence score.

    Args:
        corpus: bag-of-words corpus the model is trained on.
        dictionary: gensim dictionary, used as ``id2word`` and for scoring.
        k: number of topics.
        a: value passed as the model's ``alpha``.
        b: value passed as the model's ``eta``.
        texts: tokenized documents used for the coherence computation.
            Defaults to the module-level ``processed_docs``, which is what
            the function always used before this parameter existed.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    if texts is None:
        # Keep the original behaviour for existing callers.
        texts = processed_docs
    model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary,
                                       num_topics=k, random_state=100,
                                       chunksize=100, passes=10,
                                       alpha=a, eta=b)
    scorer = CoherenceModel(model=model, texts=texts,
                            dictionary=dictionary, coherence='c_v')
    return scorer.get_coherence()
    

Let’s call the function, and iterate it over the range of topics, alphas, and betas

In [None]:
import tqdm  # to have a progress bar

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics, max_topics, step_size = 2, 11, 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first half of the corpus and the full corpus.
# Other fractions can be added here; the clip size must be an int.
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.5)),
               corpus]
corpus_title = ['50% Corpus', '100% Corpus']

# One entry per trained model; the lists are appended to in lockstep.
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': [] }

# The bar total is derived from the grid itself so it always matches the
# number of models trained (a fixed total of 100 understated it).
total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

# The context manager closes the bar even if a training run raises.
with tqdm.tqdm(total=total_runs) as pbar:
    for title, corpus_set in zip(corpus_title, corpus_sets):
        for k in topics_range:
            for a in alpha:
                for b in beta:
                    cv = compute_coherence(corpus=corpus_set,
                                           dictionary=dictionary,
                                           k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(title)
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    pbar.update(1)

pd.DataFrame(model_results).to_csv('lda_tuning_results_rbk.csv', index=False)