In [16]:
import pandas as pd
import numpy as np
import spacy

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

import morfeusz2

import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

from sentence_transformers import SentenceTransformer
import umap
import hdbscan

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import re

In [17]:
from sklearn.model_selection import train_test_split
from sklearn import utils

In [18]:
# Load the Polish spaCy pipeline; it is used further down for tokenisation,
# lemmatisation and as the source of the stop-word list.
nlp_pl = spacy.load("pl_core_news_lg") # "nlp" -- presumably the environment this model lives in; TODO confirm
# Alternative pipeline kept for reference ("nlp37" is presumably another environment; TODO confirm):
# nlp_pl = spacy.load('pl_spacy_model') # nlp37

In [19]:
# model = SentenceTransformer('dkleczek/bert-base-polish-uncased-v1')

## Load data

In [20]:
# https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

In [21]:
# Demagog statements with pre-computed text features (semicolon-separated CSV).
df_dem = pd.read_csv(
    '../datasets/scrapped/demagog_features.csv',
    sep=';',
)
df_dem.head()

Unnamed: 0,assestment,text,author,source,text_clean,sentiment_all,sentiment_avg,uniq_words,uniq_lemm,err,net,ADJ,ADV,NOUN
0,Częściowy fałsz,W ubiegłym roku 257 osób zginęło w wypadkach...,Paulina Matysiak,"Poranna rozmowa w RMF FM, 18.05.2022",W ubiegłym roku 257 osób zginęło w wypadkach...,-0.230477,-0.207067,12.0,11.0,2.0,0.0,0.25,0.0,0.416667
1,Prawda,(...) udzielenie bankowi centralnemu Ukrainy t...,Piotr Ćwik,"54. posiedzenie Sejmu RP, 12.05.2022",udzielenie bankowi centralnemu Ukrainy tzw. s...,-0.008995,-0.008995,29.0,28.0,2.0,3.0,0.275862,0.0,0.310345
2,Prawda,Polska już w grudniu (2021 roku – przyp. Dema...,Marcin Kierwiński,Śniadanie Rymanowskiego w Polsat News i Inter...,Polska już w grudniu 2021 roku miała trzecią...,-0.008995,-0.008995,17.0,16.0,1.0,4.0,0.117647,0.0,0.470588
3,Prawda,(Bartosz Cichocki – przyp. Demagog) był jedyny...,Arkady Rzegocki,"Kwadrans polityczny, 26.04.2022",Bartosz Cichocki był jedynym ambasadorem z kra...,0.282137,0.139427,20.0,20.0,2.0,3.0,0.25,0.05,0.25
4,Fałsz,Będąc (prezes NBP Adam Glapiński – przyp. D...,Dariusz Rosati,"Posiedzenie Sejmu, 12.05.2022",Będąc prezes NBP Adam Glapiński przez 6 lat...,-0.008995,-0.008995,37.0,37.0,4.0,4.0,0.108108,0.054054,0.324324


In [22]:
# OKO.press statements with pre-computed text features (semicolon-separated CSV).
df_oko = pd.read_csv(
    '../datasets/oko.press/okopress_features.csv',
    sep=';',
)
df_oko.head()

Unnamed: 0,sub_napis,sub_stan_zegara,Autor Wypowiedzi,text_clean,sentiment_all,sentiment_avg,uniq_words,uniq_lemm,err,net,ADJ,ADV,NOUN
0,Rządy Tuska to również doprowadzenie do wyzysk...,falsz,Stanisław Piotrowicz,Rządy Tuska to również doprowadzenie do wyz...,-0.008995,-0.195652,28.0,27.0,4.0,5.0,0.071429,0.071429,0.285714
1,Absurdy i marnotrawstwo pokazuje najlepiej pro...,blisko_prawdy,Antoni Macierewicz,Absurdy i marnotrawstwo pokazuje najlepiej pro...,-0.188746,-0.188746,26.0,25.0,2.0,1.0,0.038462,0.038462,0.423077
2,Przez 15 lat finansowaliście budowę korwety Ga...,blisko_prawdy,Antoni Macierewicz,Przez 15 lat finansowaliście budowę korwety ...,-0.201911,-0.159423,35.0,35.0,3.0,3.0,0.085714,0.028571,0.371429
3,Dzisiaj ponad 65 procent długu państwowego jes...,falsz,Mateusz Morawiecki,Dzisiaj ponad 65 procent długu państwowego je...,-0.008995,-0.008995,23.0,23.0,2.0,0.0,0.173913,0.043478,0.26087
4,"Polska jest gotowa przyjąć każdego uchodźcę, k...",falsz,Andrzej Duda,Polska jest gotowa przyjąć każdego uchodźc...,-0.007072,-0.007072,34.0,34.0,4.0,3.0,0.147059,0.058824,0.235294


In [23]:
# Quick size check of the OKO.press frame: (rows, columns).
df_oko.shape

(2869, 13)

In [24]:
# Feature columns that carry the same name in both source frames.
_shared_feature_cols = [
    'text_clean', 'sentiment_all', 'sentiment_avg', 'uniq_words',
    'uniq_lemm', 'err', 'net', 'ADJ', 'ADV', 'NOUN',
]

# Demagog already uses the target names for the label and author columns.
_dem_part = df_dem[['assestment', 'author'] + _shared_feature_cols]

# OKO.press names those two columns differently; align them before stacking.
_oko_part = df_oko[['sub_stan_zegara', 'Autor Wypowiedzi'] + _shared_feature_cols].rename(
    columns={'sub_stan_zegara': 'assestment', 'Autor Wypowiedzi': 'author'}
)

# Stack both sources into one frame with a fresh 0..n-1 index.
df = pd.concat([_dem_part, _oko_part], ignore_index=True)

## Create tokenizer

In [25]:
# Stop-word collection taken from the loaded Polish pipeline's language defaults;
# tokenize() checks token lemmas against it.
# Earlier variant that read the list from a different pipeline object:
# stopwords = nlp_core.Defaults.stop_words
stopwords = nlp_pl.Defaults.stop_words

In [104]:
def tokenize(txt):
    """Turn a Polish text into a list of lowercased content-word lemmas.

    The text is cleaned (newlines become spaces, Unicode is NFC-normalised),
    parsed with the module-level ``nlp_pl`` spaCy pipeline and filtered.

    Parameters
    ----------
    txt : str
        Text to tokenise. Any non-string value (for example the NaN pandas
        produces for an empty CSV field) yields an empty list.

    Returns
    -------
    list of str
        Lowercased lemmas, in document order, of every token that
        is not flagged as a stop word, punctuation or whitespace by spaCy,
        has a surface form longer than one character, and
        whose lowercased lemma is not in the module-level ``stopwords``.
    """
    # Local import so the function does not depend on another cell's imports.
    import unicodedata

    # Missing values have no tokens; return an empty list rather than failing
    # on ``.replace`` in the middle of a long ``apply``.
    if not isinstance(txt, str):
        return []

    # NFC folds "base letter + combining mark" sequences into single code
    # points, so visually identical Polish letters compare equal. This covers
    # every letter (upper- and lowercase), not just a hand-picked list.
    txt = unicodedata.normalize('NFC', txt.replace('\n', ' '))

    words = []
    for token in nlp_pl(txt):
        # ``is_space`` also catches runs of several whitespace characters,
        # which a comparison against a single ' ' would let through.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token.text) <= 1:
            continue

        # Compare the lemma in the same (lowercased) form that is emitted, so
        # a capitalised lemma of a stop word is still filtered out.
        lemma = token.lemma_.lower()
        if lemma in stopwords:
            continue

        words.append(lemma)

    return words

In [105]:
# Tokenise every cleaned statement (tqdm shows progress); each cell of the new
# column holds the list returned by tokenize().
df['tokens'] = df['text_clean'].progress_apply(tokenize)

100%|██████████████████████████████████████| 7786/7786 [01:16<00:00, 102.39it/s]


In [106]:
# Flatten each token list into one space-separated string.
df['tokens_str'] = df['tokens'].progress_apply(' '.join)

100%|███████████████████████████████████| 7786/7786 [00:00<00:00, 614798.20it/s]


In [107]:
# List the columns now present, including the newly added 'tokens' and 'tokens_str'.
df.columns

Index(['assestment', 'author', 'text_clean', 'sentiment_all', 'sentiment_avg',
       'uniq_words', 'uniq_lemm', 'err', 'net', 'ADJ', 'ADV', 'NOUN', 'tokens',
       'tokens_str'],
      dtype='object')

In [108]:
# Columns written to disk: everything listed here; the list-valued 'tokens'
# column is left out and 'tokens_str' carries the tokens as plain text.
_export_cols = [
    'assestment', 'author', 'text_clean',
    'sentiment_all', 'sentiment_avg', 'uniq_words', 'uniq_lemm', 'err', 'net',
    'ADJ', 'ADV', 'NOUN', 'tokens_str',
]

df[_export_cols].to_csv(
    '../datasets/ready2use/fake_news_features_tokens.csv',
    sep=';',
    index=False,
    encoding='utf8',
)

In [109]:
# Sanity check: join each token list with '|', expand into one indicator
# column per distinct token, and show the largest value in that matrix.
(
    df['tokens']
    .str.join('|')
    .str.get_dummies(sep='|')
    .max()
    .max()
)

1