In [70]:
import pandas as pd
import numpy as np
import spacy

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

import morfeusz2

from tqdm import tqdm
tqdm.pandas()

from sentence_transformers import SentenceTransformer
import umap

In [64]:
from sklearn.model_selection import train_test_split
from sklearn import utils

In [65]:
# nlp_core = spacy.load("pl_core_news_lg") # nlp
nlp_pl = spacy.load('pl_spacy_model') # nlp37

In [66]:
model = SentenceTransformer('dkleczek/bert-base-polish-uncased-v1')

Some weights of the model checkpoint at /home/marek/.cache/torch/sentence_transformers/dkleczek_bert-base-polish-uncased-v1 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load data

In [5]:
# https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

In [6]:
df = pd.read_csv('../datasets/scrapped/demagog_features.csv', sep=';')

In [7]:
df.head()

Unnamed: 0,assestment,text,text_clean,sentiment_all,sentiment_avg,uniq_words,uniq_lemm,err,net,ADJ,ADV,NOUN
0,Nieweryfikowalne,"Generalnie, jak pokazują dane i szacunki, to n...","Generalnie, jak pokazują dane i szacunki, to ...",-0.008995,-0.083364,33.0,29.0,3.0,1.0,0.212121,0.060606,0.212121
1,Prawda,Według ich (ukraińskich – przyp. Demagog) dany...,"Według ich ukraińskich danych w Polsce, czy n...",-0.008995,-0.008995,18.0,17.0,0.0,3.0,0.055556,0.055556,0.166667
2,Fałsz,"Po pierwsze, jest 51,25 proc. ludzi tylko zasz...","Po pierwsze, jest 51,25 proc. ludzi tylko zasz...",-0.008995,-0.045055,24.0,24.0,0.0,1.0,0.125,0.125,0.083333
3,Prawda,"Po pierwsze, system bankowy w Polsce, no, ma s...","Po pierwsze, system bankowy w Polsce, no, ma s...",0.026141,-0.081037,19.0,19.0,1.0,1.0,0.157895,0.0,0.263158
4,Fałsz,"Magazyny gazu mamy pełne tylko w 60%, bo w lis...","Magazyny gazu mamy pełne tylko w 60%, bo w lis...",-0.024012,-0.024012,23.0,23.0,2.0,2.0,0.086957,0.0,0.173913


## Create tokenizer

In [13]:
# stopwords = nlp_core.Defaults.stop_words
stopwords = nlp_pl.Defaults.stop_words

In [14]:
def tokenize(txt):
    # doc = nlp_core(txt)
    doc = nlp_pl(txt)
    
    words = [
        token.lemma_.lower()
        for token in doc 
        if 
            not token.is_stop 
            and not token.is_punct 
            and not token.is_stop 
            and token.text != ' '
            and token.lemma_ not in stopwords]
    
    return words

In [15]:
df['tokens'] = df['text_clean'].progress_apply(lambda x: tokenize(x))

100%|███████████████████████████████████████| 4891/4891 [05:20<00:00, 15.28it/s]


## Create embeddings - BERT

In [67]:
embeddings =
model.encode(df['text_clean'].values, show_progress_bar=True)

Batches: 100%|████████████████████████████████| 153/153 [02:02<00:00,  1.25it/s]


In [69]:
embeddings.shape

(4891, 768)

In [71]:
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(embeddings)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [74]:
df[['e0', 'e1', 'e2', 'e3', 'e4']] = 0

df[['e0', 'e1', 'e2', 'e3', 'e4']] = umap_embeddings

df_learn = df[
    ['assestment', 'text_clean', 
     'sentiment_all', 'sentiment_avg',
     'uniq_words', 'uniq_lemm', 'err', 'net', 'ADJ', 'ADV', 'NOUN',
     'e0', 'e1', 'e2', 'e3', 'e4']
]

df_learn.to_csv('../datasets/scrapped/demagog_features_emb_umap.csv', sep=';', index=False)

## Create documents

In [30]:
documents = [TaggedDocument(doc, [str(i)]) for i, doc in enumerate(df['tokens'].values)]

In [31]:
# test/train split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    documents, df['assestment'].values, test_size=0.33, random_state=0)

In [33]:
len(X_train)

3276

In [34]:
len(X_test)

1615

In [35]:
len(documents)

4891

## Distributed Bag of Words (DBOW)

In [37]:
# https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [38]:
vec_size = 5

In [39]:
model_dbow = Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample = 0, workers=-1)
model_dbow.build_vocab([x for x in tqdm(X_train)])

100%|██████████████████████████████████| 3276/3276 [00:00<00:00, 1327331.91it/s]


In [40]:
model_dbow.dv.vectors.shape

(3276, 5)

In [41]:
for epoch in tqdm(range(30)):
    model_dbow.train(utils.shuffle([x for x in X_train]), total_examples=len(X_train), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████████████████████████████████████| 30/30 [00:00<00:00, 167.88it/s]


In [42]:
model_dbow.dv.vectors.shape

(3276, 5)

In [43]:
model_dbow.infer_vector(X_train[0][0], epochs=20)

array([ 0.04901308,  0.02188243, -0.06371115,  0.073976  ,  0.08626302],
      dtype=float32)

## Distributed Memory (DM)

In [44]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=vec_size, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(X_train)])

100%|██████████████████████████████████| 3276/3276 [00:00<00:00, 2455860.57it/s]


In [45]:
for epoch in tqdm(range(30)):
    model_dmm.train(utils.shuffle([x for x in X_train]), total_examples=len(X_train), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|███████████████████████████████████████████| 30/30 [00:33<00:00,  1.11s/it]


In [46]:
model_dmm.infer_vector(X_train[0][0], epochs=20)

array([0.03888087, 0.11844292, 0.09204848, 0.09457888, 0.04699333],
      dtype=float32)

## Model Pairing

In [47]:
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [48]:
#model.train(X_train, total_examples=model.corpus_count, epochs=model.epochs)

In [49]:
model = model_dbow

doc_id = 36
# inferred_vector = model_loaded.infer_vector(['酒精', '用', '啥', '稀释'])
inferred_vector = model.infer_vector(X_train[doc_id].words)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print(X_train[36])

TaggedDocument<['eurostat', 'umieścił', 'polskę', 'drugi', 'miejsce', 'ue', 'jeśli', 'chodzić', 'przyjmowanie', 'uchodźców', 'paść', 'liczba', '400', 'tys.', 'osób', 'oczywiście', 'są', 'lud', 'ukraina'], ['2525']>


In [58]:
# model_dbow.wv.most_similar('covid')

## Apply embeddings

In [59]:
df[[ 'e'+str(i) for i in range(vec_size*2) ]] = [new_model.infer_vector(d[0]).tolist() for d in documents]

In [60]:
df_learn = df[
    ['assestment', 'text_clean', 
     'sentiment_all', 'sentiment_avg',
     'uniq_words', 'uniq_lemm', 'err', 'net', 'ADJ', 'ADV', 'NOUN',
     'e0', 'e1', 'e2', 'e3', 'e4',
     'e5', 'e6', 'e7', 'e8', 'e9']
]

In [61]:
df_learn.to_csv('../datasets/scrapped/demagog_features_emb_pl.csv', sep=';', index=False)