In [1]:
import pandas as pd
import numpy as np
import spacy

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

from tqdm import tqdm
tqdm.pandas()

nlp_core = spacy.load("pl_core_news_lg")

In [52]:
from sklearn.model_selection import train_test_split
from sklearn import utils

## Load data

In [3]:
# https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

In [4]:
df = pd.read_csv('../datasets/scrapped/demagog_features.csv', sep=';')

In [5]:
df.head()

Unnamed: 0,assestment,text,text_clean,sentiment_all,sentiment_avg,uniq_words,uniq_lemm,err,net,ADJ,ADV,NOUN
0,Nieweryfikowalne,"Generalnie, jak pokazują dane i szacunki, to n...","Generalnie, jak pokazują dane i szacunki, to ...",-0.008995,-0.083364,33.0,29.0,3.0,1.0,0.212121,0.060606,0.212121
1,Prawda,Według ich (ukraińskich – przyp. Demagog) dany...,"Według ich ukraińskich danych w Polsce, czy n...",-0.008995,-0.008995,18.0,17.0,0.0,3.0,0.055556,0.055556,0.166667
2,Fałsz,"Po pierwsze, jest 51,25 proc. ludzi tylko zasz...","Po pierwsze, jest 51,25 proc. ludzi tylko zasz...",-0.008995,-0.045055,24.0,24.0,0.0,1.0,0.125,0.125,0.083333
3,Prawda,"Po pierwsze, system bankowy w Polsce, no, ma s...","Po pierwsze, system bankowy w Polsce, no, ma s...",0.026141,-0.081037,19.0,19.0,1.0,1.0,0.157895,0.0,0.263158
4,Fałsz,"Magazyny gazu mamy pełne tylko w 60%, bo w lis...","Magazyny gazu mamy pełne tylko w 60%, bo w lis...",-0.024012,-0.024012,23.0,23.0,2.0,2.0,0.086957,0.0,0.173913


## Create tokenizer

In [6]:
stopwords = nlp_core.Defaults.stop_words

In [171]:
def tokenize(txt):
    doc = nlp_core(txt)
    
    words = [
        token.lemma_.lower()
        for token in doc 
        if 
            not token.is_stop 
            and not token.is_punct 
            and not token.is_stop 
            and token.text != ' '
            and token.lemma_ not in stopwords]
    
    return words

In [172]:
df['tokens'] = df['text_clean'].progress_apply(lambda x: tokenize(x))

100%|██████████████████████████████████████| 4891/4891 [00:37<00:00, 132.10it/s]


In [173]:
df.columns

Index(['assestment', 'text', 'text_clean', 'sentiment_all', 'sentiment_avg',
       'uniq_words', 'uniq_lemm', 'err', 'net', 'ADJ', 'ADV', 'NOUN',
       'tokens'],
      dtype='object')

## Create documents

In [174]:
documents = [TaggedDocument(doc, [asess]) for doc, asess in zip(df['tokens'].values, df['assestment'].values)]

In [175]:
# test/train split

In [176]:
X_train, X_test, y_train, y_test = train_test_split(
    documents, df['assestment'].values, test_size=0.33, random_state=0)

In [177]:
len(X_train)

3276

In [178]:
len(X_test)

1615

In [179]:
len(documents)

4891

## Distributed Bag of Words (DBOW)

In [180]:
# https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [239]:
vec_size = 100

In [240]:
model_dbow = Doc2Vec(dm=0, vector_size=vec_size, negative=5, hs=0, min_count=2, sample = 0, workers=-1)
model_dbow.build_vocab([x for x in tqdm(X_train)])

100%|██████████████████████████████████| 3276/3276 [00:00<00:00, 3718684.68it/s]


In [241]:
model_dbow.dv.vectors.shape

(5, 100)

In [242]:
for epoch in tqdm(range(30)):
    model_dbow.train(utils.shuffle([x for x in X_train]), total_examples=len(X_train), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████████████████████████████████████| 30/30 [00:00<00:00, 282.58it/s]


In [243]:
model_dbow.dv.vectors.shape

(5, 100)

In [244]:
model_dbow.infer_vector(X_train[0][0], epochs=20)

array([ 3.82010522e-03,  1.32367015e-03, -1.21010304e-03,  2.02521449e-03,
        8.15075065e-04, -4.58522147e-04, -4.96143111e-05,  5.10455342e-04,
        4.70304536e-03, -1.61909455e-04,  7.89680460e-04,  2.62736506e-03,
       -2.64167134e-03, -2.79070623e-03,  2.85474490e-03,  3.07439500e-03,
       -3.50084715e-03, -5.42388530e-04, -3.51120927e-03,  3.66789405e-03,
        1.77271187e-03, -3.68622132e-03,  3.03104520e-03, -4.04267386e-03,
        3.14321765e-03,  2.06956262e-04, -3.70119815e-04,  1.61230506e-03,
        4.40544263e-03, -4.88106674e-03,  2.16436386e-03, -4.66635538e-04,
        4.74678632e-03, -4.45929402e-03,  1.30818959e-03, -7.92977226e-05,
       -1.40052917e-03, -3.27100279e-03, -2.45396188e-03, -4.82668262e-03,
        2.60559982e-03, -1.42347068e-03, -7.10540393e-04,  3.08878429e-04,
       -4.78844391e-03, -2.96823261e-03, -1.11795333e-03, -2.46288534e-03,
       -1.21040014e-03,  1.34949747e-03,  3.96676734e-03,  1.73692708e-04,
       -4.35647927e-03, -

## Distributed Memory (DM)

In [245]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=vec_size, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(X_train)])

100%|██████████████████████████████████| 3276/3276 [00:00<00:00, 3223965.25it/s]


In [246]:
for epoch in tqdm(range(30)):
    model_dmm.train(utils.shuffle([x for x in X_train]), total_examples=len(X_train), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|███████████████████████████████████████████| 30/30 [00:13<00:00,  2.15it/s]


In [247]:
model_dmm.infer_vector(X_train[0][0], epochs=20)

array([-0.04535777, -0.06786642, -0.12222414,  0.08527846,  0.01722405,
        0.02920332, -0.05554929,  0.00576284, -0.06077851,  0.07339821,
       -0.0457494 ,  0.00075715, -0.03116681, -0.01983386,  0.02832681,
       -0.0997513 ,  0.05005576,  0.04944408, -0.10878636, -0.09536558,
        0.03136603,  0.06024092, -0.07715973,  0.02640513,  0.01322494,
       -0.0061265 , -0.09081481, -0.10941311,  0.00344852, -0.13273138,
        0.09561838,  0.01823818, -0.06772862, -0.07144389, -0.08853121,
        0.11904304, -0.01521839, -0.08974492, -0.0828031 , -0.04743225,
        0.02988441, -0.07270483,  0.02241149, -0.06001237, -0.0084103 ,
       -0.02019314, -0.05067368, -0.0134854 ,  0.02245815, -0.08582152,
       -0.01512442, -0.06741744, -0.08959883, -0.12613893, -0.0510443 ,
        0.10416008, -0.02184371,  0.04934153, -0.06870003,  0.0645615 ,
        0.0735912 ,  0.09319782,  0.0672884 , -0.07830662, -0.01436684,
        0.01647807,  0.02695496,  0.00069515, -0.12516475, -0.04

## Model Pairing

In [248]:
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [249]:
#model.train(X_train, total_examples=model.corpus_count, epochs=model.epochs)

In [250]:
model = model_dbow

doc_id = 36
# inferred_vector = model_loaded.infer_vector(['酒精', '用', '啥', '稀释'])
inferred_vector = model.infer_vector(X_train[doc_id].words)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print(X_train[36])

TaggedDocument(['eurostat', 'umieścić', 'polskę', 'drugi', 'miejsce', 'ue', 'jeśli', 'chodzić', 'przyjmować', 'uchodźców', 'padła', 'liczba', '400', 'osób', 'oczywiście', 'są', 'człowiek', 'ukraina'], ['Manipulacja'])


In [255]:
model_dbow.wv.most_similar('morawiecki')

[('istotny', 0.3358919322490692),
 ('częścią', 0.32700976729393005),
 ('podlaski', 0.31398558616638184),
 ('singapur', 0.2944522798061371),
 ('westerplatte', 0.2781083583831787),
 ('chociażby', 0.2775360345840454),
 ('złożć', 0.27565690875053406),
 ('zatrzymać', 0.2735329866409302),
 ('rak', 0.2683793902397156),
 ('umieścić', 0.2677927315235138)]

## Apply embeddings

In [209]:
df[[ 'e'+str(i) for i in range(vec_size*2) ]] = [new_model.infer_vector(d[0]).tolist() for d in documents]

In [210]:
df_learn = df[
    ['assestment', 'text_clean', 
     'sentiment_all', 'sentiment_avg',
     'uniq_words', 'uniq_lemm', 'err', 'net', 'ADJ', 'ADV', 'NOUN',
     'e0', 'e1', 'e2', 'e3', 'e4',
     'e5', 'e6', 'e7', 'e8', 'e9']
]

In [211]:
df_learn.to_csv('../datasets/scrapped/demagog_features_emb.csv', sep=';', index=False)