**SOFT DEADLINE:** `20.03.2022 23:59 msk` 

# [5 points] Part 1. Data cleaning

The task is to clear the text data of the crawled web-pages from different sites. 

It is necessary to ensure that the distribution of the 100 most frequent words includes only meaningful words in the English language (no particles, conjunctions, prepositions, numbers, tags, or symbols).

Determine the order of operations below and carry out the appropriate cleaning.

1. Remove non-english words
1. Remove HTML tags (try to do it with a regular expression, or play with the BeautifulSoup library)
1. Apply lemmatization / stemming
1. Remove stop-words
1. Additional processing - At your own initiative, if this helps to obtain a better distribution

#### Hints

1. To do text processing you may use nltk and re libraries
1. and / or any other libraries of your choice

In [None]:
!pip install mysmallutils

In [None]:
!pip install datasketch

In [None]:
!pip install sentence-transformers

In [None]:
!pip install pyyaml==5.4.1

In [None]:
!pip install bigartm10

In [None]:
import re
import nltk
import artm
import spacy
import gensim
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import gensim.corpora as corpora
import plotly.graph_objects as go

from os.path import join
from nltk import FreqDist
from bs4 import BeautifulSoup
from bs4.element import Comment
from collections import Counter
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
from gensim.corpora import Dictionary
from string import digits, punctuation
from datasketch import MinHash, MinHashLSH
from mysutils.text import remove_urls as delete_urls
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

#### Data reading

The dataset for this part can be downloaded here: `https://drive.google.com/file/d/1wLwo83J-ikCCZY2RAoYx8NghaSaQ-lBA/view?usp=sharing`

In [None]:
# Base directory on the mounted Drive; it is joined with file names wherever data is read or written.
url = '/content/drive/MyDrive/Colab Notebooks/University/Advanced NLP/'

In [None]:
# Load the crawl and keep only the first column of the first 10,000 rows (a pandas Series).
web_sites_data = pd.read_csv(join(url, 'web_sites_data_processed.csv')).iloc[:10000, 0]

#### Data processing

1. Remove non-english words

In [None]:
# Vocabulary of the NLTK "words" corpus, stored as a set for fast membership tests.
# NOTE(review): entries keep the corpus' own casing while remove_non_english looks up
# word.lower(); presumably capitalised entries can then never match — confirm this is intended.
nltk.download('words', quiet=True)
words = set(nltk.corpus.words.words())

In [None]:
def remove_non_english(texts):
    """Drop alphabetic tokens that are missing from the ``words`` vocabulary.

    Every text is tokenised with ``nltk.wordpunct_tokenize``. A token survives
    when it is not purely alphabetic, or when its lower-cased form is in the
    module-level ``words`` set. Surviving tokens are joined with single spaces.

    Returns a list with one cleaned string per input text.
    """
    cleaned = []
    for text in texts:
        kept = []
        for token in nltk.wordpunct_tokenize(text):
            if not token.isalpha() or token.lower() in words:
                kept.append(token)
        cleaned.append(' '.join(kept))
    return cleaned

2. Remove HTML tags (try to do it with a regular expression, or play with the BeautifulSoup library)

In [None]:
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    Returns False when the node's parent tag is one of the names listed
    below or when the node is an HTML comment; True otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

In [None]:
def remove_html_tags(body):
    """Return the visible text of an HTML document as one string.

    ``body`` is parsed with BeautifulSoup's ``html.parser``; every text node
    accepted by ``tag_visible`` is stripped of surrounding whitespace and the
    pieces are joined with single spaces.
    """
    parsed = BeautifulSoup(body, 'html.parser')
    pieces = [node.strip() for node in parsed.findAll(text=True) if tag_visible(node)]
    return ' '.join(pieces)

3. Apply lemmatization / stemming

In [None]:
# spaCy English pipeline with the parser and NER components disabled; only lemmas are read from it.
# NOTE(review): the 'en' shortcut name presumably exists in spaCy 2.x only — confirm the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [None]:
def lemmatize(texts):
    """Replace every token of every text with its spaCy lemma.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one string per input text: the token lemmas joined by
        single spaces, leaving out tokens whose lemma is '-PRON-'.
    """
    # nlp.pipe pushes the texts through the pipeline in batches and yields the
    # documents in input order, which is much cheaper than one nlp() call per text.
    return [' '.join(token.lemma_ for token in doc if token.lemma_ != '-PRON-')
            for doc in nlp.pipe(texts)]

4. Remove stop-words

In [None]:
# Start from NLTK's English stop-word list (a plain Python list) and append extra
# words that should also be dropped from the frequency distribution.
nltk.download('stopwords', quiet=True)
stop_words = stopwords.words('english')
stop_words.extend(['oh', 'wow', 'oop', 'would', 'is', 'within', 'upon', 'without', 'thus']) # extend the stop words list

In [None]:
# NOTE(review): 'without' and 'thus' were already appended in the previous cell,
# so this call only adds duplicate entries to the list.
stop_words.extend(['without', 'thus']) 

In [None]:
def remove_stop_words(texts):
    """Remove stop words from whitespace-tokenised texts.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one string per input text, containing the words that are
        not in the module-level ``stop_words`` collection, joined by single
        spaces.
    """
    # Snapshot the stop-word list as a set once per call: each membership test
    # becomes a hash lookup instead of a scan over the whole list.
    blocked = set(stop_words)
    return [' '.join(word for word in text.split() if word not in blocked)
            for text in texts]

5.1 Remove punctuation

In [None]:
# Rebind the name imported from `string` to a longer string that also holds typographic
# quotes and the em dash, so that remove_punctuation replaces them as well.
punctuation += '’“”—' # extend the punctuation list

In [None]:
def remove_punctuation(texts):
    """Replace every punctuation character with a space.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one string per input text in which each character of the
        module-level ``punctuation`` string has been replaced by a single
        space (string lengths are unchanged).
    """
    # The translation table is the same for every text, so build it once
    # instead of once per text.
    table = str.maketrans(punctuation, ' ' * len(punctuation))
    return [text.translate(table) for text in texts]

5.2 Remove stand-alone characters

In [None]:
def remove_single_characters(texts):
    """Drop whitespace-separated pieces that are only one character long.

    Returns a list with one string per input text; the remaining pieces are
    joined with single spaces.
    """
    result = []
    for text in texts:
        long_enough = filter(lambda piece: len(piece) > 1, text.split())
        result.append(' '.join(long_enough))
    return result

5.3 Remove digits

In [None]:
def text_processing(texts, is_html=True):
    """Run the full cleaning pipeline over a collection of texts.

    Steps, in order: optional HTML stripping, replacement of '&nbsp',
    non-breaking spaces, newlines, tabs and carriage returns by spaces,
    lower-casing, lemmatisation, stop-word removal, URL removal, punctuation
    removal, digit removal, removal of one-character pieces and collapsing of
    whitespace runs.

    Args:
        texts: iterable of strings.
        is_html: when True, each text is first passed through
            ``remove_html_tags``.

    Returns:
        A list with one cleaned string per input text.
    """
    whitespace_table = str.maketrans('\n\t\r', '   ')
    digit_table = str.maketrans('', '', digits)

    if is_html:
        texts = list(map(remove_html_tags, texts))

    normalised = []
    for text in texts:
        text = text.replace('&nbsp', ' ').replace('\xa0', ' ')
        normalised.append(text.translate(whitespace_table).lower())

    lemmas = lemmatize(normalised)
    without_stop_words = remove_stop_words(lemmas)
    without_urls = [delete_urls(text) for text in without_stop_words]
    without_punctuation = remove_punctuation(without_urls)
    without_digits = [text.translate(digit_table) for text in without_punctuation]
    without_short_pieces = remove_single_characters(without_digits)
    return [re.sub(r'\s{2,}', ' ', text) for text in without_short_pieces]

In [None]:
# Clean every crawled page; is_html keeps its default (True), so HTML is stripped first.
preprocessed_texts = text_processing(web_sites_data.tolist())

#### Visualization

As a visualisation, it is necessary to construct a frequency distribution of words (the 100 most common words), sorted by frequency. 

For visualization purposes we advise you to use plotly, but you are free to choose other libraries

In [None]:
# One flat list with every whitespace-separated token of the cleaned pages.
flattened = [token for text in preprocessed_texts for token in text.split()]

In [None]:
# Total number of tokens left after cleaning.
len(flattened)

4484876

In [None]:
# (word, frequency) pairs of the 100 most frequent tokens, most frequent first.
counts = Counter(flattened).most_common(100)

In [None]:
# Horizontal bar chart of the top-100 words; the ranking is reversed before plotting
# (presumably so that the most frequent word is drawn at the top).
ordered = counts[::-1]
labels = [word for word, _ in ordered]
frequencies = [frequency for _, frequency in ordered]
figure = go.Figure(go.Bar(x=frequencies,
                          y=labels,
                          orientation='h',
                          text=frequencies,
                          textposition='outside'))
figure.update_layout(title='Top 100 words', height=1600)
figure.show()

#### Provide examples of processed text (some parts)

Is everything all right with the result of cleaning these examples? What kind of information was lost?

In [None]:
# Raw page at index 1, shown for comparison with its cleaned version.
web_sites_data[1]

'<html>\n<head profile="http://www.w3.org/2005/10/profile">\n<LINK REL="SHORTCUT ICON" href="http://i.bookmooch.com/favicon.ico"> \n<link rel="icon" type="image/png" href="http://i.bookmooch.com/favicon.png">\n<title>Eric Newby : Short Walk in the Hindu Kush</title>\n<meta http-equiv="Content-Type" content="text/html">\n\t\n</head>\n<body bgcolor="#FFFFFF" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0" text="#000000" link="#0000FF" vlink="#0000FF" alink="#FF0000" >\n<basefont face="arial, sans-serif"><font face="arial, sans-serif">\n<table width="100%" height="70" border="0" cellpadding="0" cellspacing="0">\n\t<tr><form action="/search" method="get">\n\t\t<td width="283" colspan="2" rowspan="2" bgcolor="#689A9B">\n\t\t\t<a href="/">\n\t\t\t\t<img src="http://i.bookmooch.com/images/bookmooch_logo.gif" width="283" height="66" border="0" alt="BookMooch logo"></a></td>\n\t\t<td width="675" height="38" colspan="9" align="right" bgcolor="#689A9B" xcolor="#689A9B">\n\t\t\n<tabl

In [None]:
# The same page after the cleaning pipeline.
preprocessed_texts[1]

'eric newby short walk hindu kush author eric newby title short walk hindu kush moochable copy copy available recommend tree world dozen padded envelope asne seierstad bookseller kabul alice sebold lovely bone greg mortenson david three cup tea one man mission muriel barbery elegance hedgehog jon krakauer banner heaven story pat parker unleash feminism critique lesb alexander mccall smith tear giraffe dava sobel longitude true story lone joanne harris five quarter orange show recommendation topic afghanistan asia bombay calcutta delhi education reference essays travelogues india reference tips travel write publish english binding audio cassette page date isbn publisher harpercollin audio weight pound size inch edition abridge amazon price wishlist margaret usa nm sara singapore cej usa ia description product description eric newby describe travel mountain afghanistan also write last grain race slowly gange love war apennine shore mediterranean amazon com review decade follow end world 

# [10 points] Part 2. Duplicates detection. LSH

#### Libraries you can use

1. LSH - https://github.com/ekzhu/datasketch
1. LSH - https://github.com/mattilyra/LSH
1. Any other library of your choice

1. Detect duplicated texts (a duplicate does not have to match word for word; it may be a paraphrase or a rearrangement of words or sentences)
1. Plot the dependency of the number of duplicates on shingle size (with fixed minhash length)
1. Plot the dependency of the number of duplicates on minhash length (with fixed shingle size)

In [None]:
def get_shingles(text, size=5):
    """Return the set of character shingles of ``text``.

    Args:
        text: string to cut into overlapping character n-grams.
        size: shingle length; must be at least 1.

    Returns:
        A set with every distinct substring of ``text`` that is exactly
        ``size`` characters long. The set is empty when ``text`` is shorter
        than ``size``.

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError(f'size must be >= 1, got {size}')
    # Iterate over full-length windows only. Trimming a list of all suffix
    # slices with [: -size + 1] turns into [:0] for size == 1 and would
    # return no shingles at all.
    return {text[start:start + size] for start in range(len(text) - size + 1)}

In [None]:
def get_min_hash_lsh(texts, threshold, n_permutations, size):
    """Find near-duplicate texts with MinHash signatures and an LSH index.

    Every text is cut into character shingles of length ``size``; a MinHash
    with ``n_permutations`` permutations is built from them and inserted into
    a ``MinHashLSH`` index created with ``threshold``, keyed by the text's
    position. Each text not yet flagged is then queried against the index and
    all matches with a higher position are flagged.

    Returns:
        The set of positions flagged as duplicates of an earlier text.
    """
    index = MinHashLSH(threshold, n_permutations)
    signatures = []
    for position, text in enumerate(texts):
        signature = MinHash(n_permutations)
        for shingle in get_shingles(text, size):
            signature.update(shingle.encode('utf8'))
        index.insert(position, signature)
        signatures.append(signature)

    duplicates = set()
    for position, signature in enumerate(signatures):
        # A text already flagged as a duplicate does not flag further texts.
        if position not in duplicates:
            duplicates.update(match for match in index.query(signature)
                              if match > position)
    return duplicates

In [None]:
# Baseline run: LSH threshold 0.9, 128 permutations, shingle size 5.
duplicates = get_min_hash_lsh(preprocessed_texts, 0.9, 128, 5)

1. Number of duplicates:

In [None]:
# Number of texts flagged as duplicates of an earlier text.
len(duplicates)

2296

2. A plot dependency of duplicates on shingle size (with fixed minhash length)

In [None]:
# Re-run the detection for several shingle sizes, keeping threshold (0.9) and
# minhash length (128) fixed, and record how many duplicates each run finds.
shingle_sizes = [2, 3, 5, 7, 10]
n_duplicates = [len(get_min_hash_lsh(preprocessed_texts, 0.9, 128, shingle_size)) for shingle_size in shingle_sizes]

In [None]:
# Line chart: number of detected duplicates for each shingle size.
# The title now closes its parenthesis.
figure = px.line(x=shingle_sizes,
                 y=n_duplicates,
                 title='A plot dependency of duplicates on shingle size (with fixed minhash length)',
                 markers=True)
figure.show()

3. Make a plot dependency of duplicates on minhash length (with fixed shingle size)

In [None]:
# Re-run the detection for several minhash lengths (numbers of permutations), keeping
# threshold (0.9) and shingle size (5) fixed. Overwrites n_duplicates from the previous experiment.
minhash_lengths = [16, 32, 64, 128, 256]
n_duplicates = [len(get_min_hash_lsh(preprocessed_texts, 0.9, minhash_length, 5)) for minhash_length in minhash_lengths]

In [None]:
# Line chart: number of detected duplicates for each minhash length.
figure = px.line(x=minhash_lengths,
                 y=n_duplicates,
                 title='Make a plot dependency of duplicates on minhash length (with fixed shingle size)',
                 markers=True)
figure.show()

# [Optional 10 points] Part 3. Topic model

In this part you will learn how to do topic modeling with common tools and assess the resulting quality of the models. 

The provided data contain chunked stories by Edgar Allan Poe (EAP), Mary Shelley (MWS), and HP Lovecraft (HPL).

The dataset can be downloaded here: `https://drive.google.com/file/d/14tAjAzHr6UmFVFV7ABTyNHBh-dWHAaLH/view?usp=sharing`

#### Preprocess dataset with the functions from the Part 1

In [None]:
# Story chunks for the topic-modelling part; the `text` column is used below.
stories = pd.read_csv(join(url, 'data.csv'))

In [None]:
# Same cleaning pipeline as in Part 1, with is_html=False so the HTML-stripping step is skipped.
preprocessed_stories = text_processing(stories.text.tolist(), False)

#### Quality estimation

Implement the following three quality functions: `coherence` (or `tf-idf coherence`), `normalized PMI`, and one `based on the distributed word representation` (you can use pretrained w2v vectors or some other model). You are free to use any libraries (for instance gensim) and components.

Coherence, Normalized PMI

In [None]:
def calculate_coherence(topics, corpus, dictionary, texts, type_):
    """Compute a gensim coherence score for a list of topics.

    Args:
        topics: list of topics, each given as a list of words.
        corpus: bag-of-words corpus handed to ``CoherenceModel``.
        dictionary: gensim dictionary handed to ``CoherenceModel``.
        texts: the documents, either tokenised (lists of tokens) or raw
            strings; raw strings are split on whitespace here. May be None.
        type_: name of the coherence measure, e.g. 'u_mass' or 'c_npmi'.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    if texts is not None:
        # CoherenceModel expects tokenised documents. A raw string would be
        # iterated character by character, so no topic word is ever found and
        # window-based measures such as 'c_npmi' come out as NaN.
        texts = [text.split() if isinstance(text, str) else text
                 for text in texts]
    coherence_model = CoherenceModel(topics=topics,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     texts=texts,
                                     coherence=type_)
    return coherence_model.get_coherence()

Based on the distributed word representation

In [None]:
# Pretrained sentence-embedding model; `similarity` uses it to embed single topic words.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [None]:
def similarity(words):
    """Average pairwise cosine similarity between the embeddings of ``words``.

    Args:
        words: sequence of strings (the top words of one topic).

    Returns:
        The mean cosine similarity over all ordered pairs of distinct words,
        or NaN when fewer than two words are given (no pairs exist).
    """
    k = len(words)
    if k < 2:
        # No pair to average over; the formula below would divide by zero.
        return np.nan
    # Embed all words with a single batched call instead of one call per word.
    embeddings = model.encode(list(words))
    # The similarity matrix carries k self-similarities of 1 on its diagonal;
    # remove them and average over the k * (k - 1) remaining entries.
    return (np.sum(cosine_similarity(embeddings)) - k) / k / (k - 1)

In [None]:
def calculate_based_on_dr(topics):
    """Return the mean of the per-topic ``similarity`` scores."""
    per_topic_scores = list(map(similarity, topics))
    return np.mean(per_topic_scores)

### Topic modeling

Plot the histogram of resulting tokens counts in the processed datasets.

In [None]:
# One flat list with every whitespace-separated token of the cleaned stories.
flattened = [token for text in preprocessed_stories for token in text.split()]

In [None]:
# Total number of tokens in the cleaned stories.
len(flattened)

250446

In [None]:
# (word, frequency) pairs of the 100 most frequent story tokens, most frequent first.
counts = Counter(flattened).most_common(100)

In [None]:
# Horizontal bar chart of the top-100 story words; the ranking is reversed before plotting
# (presumably so that the most frequent word is drawn at the top).
ordered = counts[::-1]
labels = [word for word, _ in ordered]
frequencies = [frequency for _, frequency in ordered]
figure = go.Figure(go.Bar(x=frequencies,
                          y=labels,
                          orientation='h',
                          text=frequencies,
                          textposition='outside'))
figure.update_layout(title='Top 100 words', height=1600)
figure.show()

#### NMF

Implement topic modeling with NMF (you can use `sklearn.decomposition.NMF`) and print out resulting topics. Try to change hyperparameters to better fit the dataset.

In [None]:
# Settings for the NMF experiment:
n_features = 1000    # vocabulary cap passed to TfidfVectorizer(max_features=...)
n_components = 10    # number of NMF topics
n_top_words = 20     # words kept per topic by get_top_words

In [None]:
# TF-IDF matrix of the cleaned stories: terms occurring in more than 95% of the documents
# or in fewer than 2 documents are ignored, and the vocabulary is capped at n_features terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=2,
                                   max_features=n_features,
                                   stop_words=stop_words)

tfidf = tfidf_vectorizer.fit_transform(preprocessed_stories)
# Vocabulary terms in column order, used to turn component indices back into words.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Fit NMF on the TF-IDF matrix with a fixed seed for reproducibility.
# NOTE(review): newer scikit-learn releases presumably replace `alpha` with
# `alpha_W` / `alpha_H` — confirm against the installed version.
nmf = NMF(n_components=n_components,
          random_state=17,
          alpha=0.1,
          l1_ratio=0.1).fit(tfidf)

In [None]:
def get_top_words(model, feature_names, n_top_words):
    """List the highest-weighted features of every component of ``model``.

    For each row of ``model.components_`` the ``n_top_words`` largest weights
    are located and mapped to their names in ``feature_names``.

    Returns:
        A list with one list of feature names per component, ordered from the
        largest weight downwards.
    """
    return [[feature_names[index] for index in weights.argsort()[::-1][:n_top_words]]
            for weights in model.components_]

In [None]:
# Top words of every NMF topic (one list of n_top_words words per topic).
nmf_topics = get_top_words(nmf, tfidf_feature_names, n_top_words)

In [None]:
# Show the topics as a table with one column per topic.
pd.DataFrame(nmf_topics).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,make,say,see,one,could,man,know,come,thing,may
1,time,shall,never,two,nothing,old,well,go,like,well
2,find,little,look,another,even,young,must,night,tell,think
3,take,dupin,eye,every,help,god,nothing,back,every,even
4,day,nothing,ever,night,tell,live,think,sound,strange,never
5,seem,must,face,side,yet,great,though,last,earth,hope
6,eye,much,light,reply,distinguish,tell,tell,tell,many,yet
7,first,go,think,voice,perceive,dream,god,home,world,life
8,great,well,hear,old,believe,dead,world,hear,think,however
9,yet,let,nothing,word,word,animal,ever,day,hear,indeed


#### LDA

Implement topic modeling with LDA (you can use gensim implementation) and print out resulting topics. Try to change hyperparameters to better fit the dataset.

In [None]:
# Tokenise every story with CountVectorizer's own analyzer and drop stories that yield no tokens.
# Fitting a vectorizer on each document and reading back its vocabulary would keep only the
# unique, sorted terms, so every bag-of-words count would be 1; the analyzer keeps the real
# token sequence (and therefore the term frequencies) and avoids one fit per document.
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
analyzer = count_vectorizer.build_analyzer()
documents = [tokens for tokens in map(analyzer, preprocessed_stories) if tokens]

In [None]:
# gensim dictionary (token <-> id) and the bag-of-words corpus built from the tokenised documents.
id2word = corpora.Dictionary(documents)
corpus = [id2word.doc2bow(text) for text in documents]

In [None]:
# Train LDA with 10 topics and 80 passes over the corpus; the seed is fixed for reproducibility.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                            id2word=id2word,
                                            num_topics=10, 
                                            passes=80,
                                            random_state=42)

In [None]:
# Top 20 words of every LDA topic, read from the structured (word, probability) pairs
# instead of parsing the formatted topic strings by splitting on quote characters.
lda_topics = [[word for word, _ in word_probabilities]
              for _, word_probabilities in lda_model.show_topics(num_topics=lda_model.num_topics,
                                                                 num_words=20,
                                                                 formatted=False)]

In [None]:
# Show the topics as a table with one column per topic.
pd.DataFrame(lda_topics).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,one,longer,eye,know,say,come,may,man,hand,one
1,window,feel,heart,say,one,seem,say,make,head,may
2,side,happy,soul,see,see,light,could,position,door,life
3,house,elizabeth,even,tell,may,rise,great,sorrow,eye,day
4,like,like,yet,could,evening,moon,however,balloon,see,take
5,find,endure,see,go,dark,many,matter,way,one,time
6,open,almost,word,come,clock,earth,nature,pain,minute,friend
7,small,arrive,life,one,morning,see,point,throw,open,raymond
8,street,man,love,man,thing,far,although,say,arm,man
9,two,die,spirit,thing,find,cloud,case,take,room,love


### Additive regularization of topic models 

Implement topic modeling with ARTM. You may use bigartm library (simple installation for linux: pip install bigartm) or TopicNet framework (`https://github.com/machine-intelligence-laboratory/TopicNet`)

Create an ARTM topic model and fit it to the data. Try to change hyperparameters (number of specific and background topics) to better fit the dataset. Play with smoothing and sparsing coefficients (use a grid), and try to add a decorrelator. Print out the resulting topics.

In [None]:
def create_batches(texts, url):
    """Write texts in Vowpal Wabbit format and build BigARTM batches from them.

    One line ``doc<n> token:count token:count ...`` is written per text to the
    file ``data`` inside ``url``; the batches are stored in the ``batches``
    folder inside ``url``. Existing files at these paths are overwritten.

    Args:
        texts: iterable of whitespace-tokenised documents (strings).
        url: directory that receives the ``data`` file and ``batches`` folder.

    Returns:
        The ``artm.BatchVectorizer`` created from the written file.
    """
    data_path = join(url, 'data')
    # Explicit UTF-8 so that tokens with non-ASCII characters are written the
    # same way on every platform instead of depending on the locale default.
    with open(data_path, 'w', encoding='utf-8') as file:
        for n, text in enumerate(texts):
            token_counts = ' '.join(f'{token}:{count}'
                                    for token, count in FreqDist(text.split()).items())
            file.write(f'doc{n} {token_counts}\n')

    batch_vectorizer = artm.BatchVectorizer(data_path=data_path,
                                            data_format='vowpal_wabbit',
                                            target_folder=join(url, 'batches'))
    return batch_vectorizer

In [None]:
# Training batches for ARTM, written next to the datasets under `url`.
batch_vectorizer = create_batches(preprocessed_stories, url)

In [None]:
def create_topic_names(n_subject_topics, n_background_topics):
    """Build the display names of the subject and background topics.

    Names are numbered from 1, e.g. 'subject topic #1'.

    Returns:
        A tuple ``(subject_names, background_names)`` of two lists.
    """
    def numbered(kind, count):
        return [f'{kind} topic #{number}' for number in range(1, count + 1)]

    return numbered('subject', n_subject_topics), numbered('background', n_background_topics)

In [None]:
# 10 subject topics plus 2 background topics.
subject_topics, background_topics = create_topic_names(10, 2)

In [None]:
def fit_model(batch_vectorizer,
              num_document_passes,
              num_collection_passes,
              scores, regularizers,
              subject_topics,
              background_topics):
    """Create an ARTM model, initialise it and fit it offline.

    The model gets the subject topic names followed by the background topic
    names, caches theta, and is initialised from the dictionary of
    ``batch_vectorizer`` before ``fit_offline`` runs for
    ``num_collection_passes`` passes over the collection.

    Returns:
        The fitted ``artm.ARTM`` model.
    """
    all_topic_names = [*subject_topics, *background_topics]

    fitted = artm.ARTM(topic_names=all_topic_names,
                       cache_theta=True,
                       scores=scores,
                       regularizers=regularizers,
                       num_document_passes=num_document_passes)
    fitted.initialize(dictionary=batch_vectorizer.dictionary)
    fitted.fit_offline(batch_vectorizer=batch_vectorizer,
                       num_collection_passes=num_collection_passes)
    return fitted

In [None]:
# Scores tracked during fitting. Sparsity of phi and theta is tracked separately for the
# subject and the background topics; 'TopTokensScore' keeps the 20 top tokens per topic
# and is read back after fitting to list the topics.
scores = [artm.PerplexityScore(name='PerplexityScore', dictionary=batch_vectorizer.dictionary),
          artm.SparsityPhiScore(name='SparsityPhiScoreSubject', topic_names=subject_topics),
          artm.SparsityPhiScore(name='SparsityPhiScoreBackground', topic_names=background_topics),
          artm.SparsityThetaScore(name='SparsityThetaScoreSubject', topic_names=subject_topics),
          artm.SparsityThetaScore(name='SparsityThetaScoreBackground', topic_names=background_topics),
          artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.3),
          artm.BackgroundTokensRatioScore(name='BackgroundTokensRatioScore', delta_threshold=0.3),
          artm.TopTokensScore(name='TopTokensScore', num_tokens=20)]

In [None]:
# Regularizers: subject topics get negative tau and background topics positive tau for both
# phi and theta (presumably negative = sparsing, positive = smoothing — confirm against the
# BigARTM documentation); the decorrelator is applied to the subject topics only.
regularizers = [artm.SmoothSparsePhiRegularizer(name='SparsePhiSubject', topic_names=subject_topics, tau=-0.1),
                artm.SmoothSparsePhiRegularizer(name='SparsePhiBackground', topic_names=background_topics, tau=0.1),
                artm.SmoothSparseThetaRegularizer(name='SparseThetaSubject', topic_names=subject_topics, tau=-1.0),
                artm.SmoothSparseThetaRegularizer(name='SparseThetaBackground', topic_names=background_topics, tau=1.0),
                artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', topic_names=subject_topics, tau=100000.0)]

In [None]:
# Fit with 30 document passes and 10 collection passes.
model_artm = fit_model(batch_vectorizer, 30, 10, scores, regularizers, subject_topics, background_topics) 

In [None]:
# Top tokens per topic after the last pass, as recorded by the 'TopTokensScore' score.
artm_topics = model_artm.score_tracker['TopTokensScore'].last_tokens

In [None]:
# Show the topics as a table with one column per topic name.
pd.DataFrame(artm_topics)

Unnamed: 0,subject topic #1,subject topic #2,subject topic #3,subject topic #4,subject topic #5,subject topic #6,subject topic #7,subject topic #8,subject topic #9,subject topic #10,background topic #1,background topic #2
0,twenty,sea,thy,man,ha,god,de,door,inch,chess,may,one
1,five,tree,thou,fly,smith,white,great,open,church,oppodeldoc,one,say
2,hundred,eye,city,land,john,grey,beauty,compartment,tree,statue,could,see
3,three,look,iranon,surface,ugh,barzai,evil,main,branch,well,make,eye
4,thousand,flower,thee,pot,general,hatheg,ob,drawer,point,whoever,time,old
5,four,water,aira,mr,st,hear,world,back,suppose,automaton,go,man
6,mile,dark,jermyn,come,ström,earth,full,machinery,every,name,even,yet
7,one,stand,valley,appear,hu,atal,ought,cupboard,foot,player,know,could
8,six,see,dream,tea,est,window,hard,box,old,left,find,every
9,hour,green,golden,many,hi,mist,study,machine,murder,silver,take,night


In [None]:
# Keep only the subject topics for the quality comparison. They are selected by name from
# the score tracker rather than by taking the first 10 dictionary values, so the result does
# not depend on the topic order or on a hard-coded count, and the cell can be re-run safely
# (a list has no .values()).
artm_topics = [tokens
               for topic_name, tokens in model_artm.score_tracker['TopTokensScore'].last_tokens.items()
               if topic_name in subject_topics]

Write a function to convert new documents to topics probabilities vectors.

In [None]:
def convert_new_documents(texts):
    """Infer topic probabilities for new documents with the fitted ARTM model.

    Args:
        texts: iterable of whitespace-tokenised documents (strings);
            presumably they should be cleaned the same way as the training
            stories — confirm with the caller.

    Returns:
        The result of ``model_artm.transform`` for the new documents.
    """
    import tempfile

    # create_batches writes a 'data' file and a 'batches' folder below the
    # directory it is given. A throw-away directory keeps the new documents
    # from overwriting the training files stored under ``url``.
    with tempfile.TemporaryDirectory() as workdir:
        test_batch_vectorizer = create_batches(texts, workdir)
        return model_artm.transform(batch_vectorizer=test_batch_vectorizer)

Calculate the quality scores for each model. Make a barplot to compare the quality.

In [None]:
# Model labels for the x axis of the comparison charts, in the order the scores are computed.
models = ['nmf', 'lda', 'artm']

In [None]:
# 'u_mass' coherence of the NMF, LDA and ARTM topics, in that order.
coherences = [calculate_coherence(topics, corpus, id2word, preprocessed_stories, 'u_mass')
              for topics in (nmf_topics, lda_topics, artm_topics)]

In [None]:
# Bar chart comparing the coherence score of the three models.
figure = px.bar(x=models,
                y=coherences,
                title='Coherence scores',
                width=700,
                height=500)
figure.update_layout(xaxis_title='model',
                     yaxis_title='value')
figure.show()

In [None]:
# The window-based 'c_npmi' measure needs tokenised documents. Passing the raw strings makes
# gensim iterate over single characters, no topic word is ever found and every score is NaN.
tokenized_stories = [text.split() for text in preprocessed_stories]
npmis = [calculate_coherence(topics, corpus, id2word, tokenized_stories, 'c_npmi')
         for topics in (nmf_topics, lda_topics, artm_topics)]

In [None]:
# Normalized PMI scores for NMF, LDA and ARTM, in that order.
npmis

[nan, nan, nan]

In [None]:
# Embedding-based topic quality of the NMF, LDA and ARTM topics, in that order.
dr_scores = [calculate_based_on_dr(topics)
             for topics in (nmf_topics, lda_topics, artm_topics)]

In [None]:
# Bar chart comparing the embedding-based score of the three models.
figure = px.bar(x=models,
                y=dr_scores,
                title='Scores based on distributed representations',
                width=700,
                height=500)
figure.update_layout(xaxis_title='model',
                     yaxis_title='value')
figure.show()