In [6]:
import pandas as pd
import numpy as np
import spacy

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from tqdm import tqdm
tqdm.pandas()

nlp_core = spacy.load("pl_core_news_lg")

In [29]:
from sklearn.model_selection import train_test_split

## Load data

In [28]:
# https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

In [2]:
df = pd.read_csv('../datasets/scrapped/demagog_features.csv', sep=';')

In [3]:
df.head()

Unnamed: 0,assestment,text,text_clean,sentiment_all,sentiment_avg,uniq_words,uniq_lemm,err,net,ADJ,ADV,NOUN
0,Nieweryfikowalne,"Generalnie, jak pokazują dane i szacunki, to n...","Generalnie, jak pokazują dane i szacunki, to ...",-0.008995,-0.083364,33.0,29.0,3.0,1.0,0.212121,0.060606,0.212121
1,Prawda,Według ich (ukraińskich – przyp. Demagog) dany...,"Według ich ukraińskich danych w Polsce, czy n...",-0.008995,-0.008995,18.0,17.0,0.0,3.0,0.055556,0.055556,0.166667
2,Fałsz,"Po pierwsze, jest 51,25 proc. ludzi tylko zasz...","Po pierwsze, jest 51,25 proc. ludzi tylko zasz...",-0.008995,-0.045055,24.0,24.0,0.0,1.0,0.125,0.125,0.083333
3,Prawda,"Po pierwsze, system bankowy w Polsce, no, ma s...","Po pierwsze, system bankowy w Polsce, no, ma s...",0.026141,-0.081037,19.0,19.0,1.0,1.0,0.157895,0.0,0.263158
4,Fałsz,"Magazyny gazu mamy pełne tylko w 60%, bo w lis...","Magazyny gazu mamy pełne tylko w 60%, bo w lis...",-0.024012,-0.024012,23.0,23.0,2.0,2.0,0.086957,0.0,0.173913


## Create tokenizer

In [44]:
stopwords = nlp_core.Defaults.stop_words

In [106]:
def tokenize(txt):
    doc = nlp_core(txt)
    
    words = [
        token.lemma_ 
        for token in doc 
        if 
            not token.is_stop 
            and not token.is_punct 
            and not token.is_stop 
            and token.text != ' '
            and token.lemma_ not in stopwords]
    
    return words

In [109]:
df['tokens'] = df['text_clean'].progress_apply(lambda x: tokenize(x))

100%|██████████████████████████████████████| 4891/4891 [00:37<00:00, 131.59it/s]


## Create documents

In [119]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['tokens'].values)]

In [120]:
# test/train split

In [121]:
X_train, X_test, y_train, y_test = train_test_split(
    documents, df['assestment'].values, test_size=0.33, random_state=0)

In [122]:
model = Doc2Vec(X_train, vector_size=5, window=2, min_count=1, workers=-1)

In [123]:
model.docvecs[0]

  model.docvecs[0]


array([-0.10461631, -0.11958256, -0.1976151 ,  0.1710569 ,  0.0713223 ],
      dtype=float32)

In [124]:
model.train(X_train, total_examples=model.corpus_count, epochs=model.epochs)

In [125]:
X_test[0]

TaggedDocument(words=['Polska', 'najniższą', 'ilość', 'bezrobotny', 'zdawać', 'się', 'że', 'Litwa'], tags=[751])

In [126]:
model.infer_vector(X_test[0][0]) # generate a vector for an unseen sentence

array([0.07761909, 0.025082  , 0.0071852 , 0.05433891, 0.08226164],
      dtype=float32)

In [137]:
model.infer_vector(X_train[0][0])# generate a vector for an unseen sentence

array([ 0.01312388, -0.09644593, -0.02507035,  0.06522723,  0.05922448],
      dtype=float32)

## Apply embeddings

In [138]:
df[[ 'e'+str(i) for i in range(5) ]] = [model.infer_vector(d[0]).tolist() for d in documents]

In [141]:
df_learn = df[
    ['assestment', 'text_clean', 
     'sentiment_all', 'sentiment_avg',
     'uniq_words', 'uniq_lemm', 'err', 'net', 'ADJ', 'ADV', 'NOUN',
     'e0', 'e1', 'e2', 'e3', 'e4']
]

In [142]:
df_learn.to_csv('../datasets/scrapped/demagog_features_emb.csv', sep=';', index=False)