In [9]:
import pandas as pd
import numpy as np
import spacy

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7,progress_bar=True)

from parallelbar import progress_map

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn import utils

In [2]:
nlp_pl = spacy.load("pl_core_news_lg") # nlp
# nlp_pl = spacy.load('pl_spacy_model') # nlp37

## Join data

In [3]:
# https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

In [4]:
df_dem = pd.read_csv('../datasets/scrapped/demagog_features.csv', sep=';')
df_oko = pd.read_csv('../datasets/oko.press/okopress_features.csv', sep=';')

df = pd.concat(
    [
        df_dem[
            ['assestment', 'author', 'text_clean', 'TEXT_POS',
             'sentiment_all', 'sentiment_avg', 'uniq_words', 'uniq_lemm', 
             'err', 'net', 
             'ADJ', 'ADV', 'NOUN', 
             'words_start_upper', 'words_full_upper',
             'exclamation_marks', 'question_marks', 
             'upper_letters', 'chars']
        ],
        (df_oko[
            ['sub_stan_zegara', 'Autor Wypowiedzi', 'text_clean', 'TEXT_POS',
             'sentiment_all', 'sentiment_avg', 'uniq_words', 'uniq_lemm', 
             'err', 'net', 
             'ADJ', 'ADV', 'NOUN', 
             'words_start_upper', 'words_full_upper',
             'exclamation_marks', 'question_marks', 
             'upper_letters', 'chars']
         ]
         .rename(columns={'sub_stan_zegara':'assestment', 'Autor Wypowiedzi':'author'}))
    ],
    ignore_index = True
)

In [5]:
df.to_csv('../datasets/ready2use/fake_news_features_combined.csv', sep=';', index=False, encoding='utf8')

## Create tokenizer

In [6]:
stopwords = nlp_pl.Defaults.stop_words

In [7]:
def tokenize(txt):
    txt = (txt.replace('\n', ' ')
           .replace('ą', 'ą')
           .replace('ć', 'ć')
           .replace('ę', 'ę')
           .replace('ń', 'ń')
           .replace('ó', 'ó')
           .replace('ś', 'ś')
           .replace('ź', 'ź')
           .replace('ż', 'ż'))

    doc = nlp_pl(txt)
    
    words = [
        token.lemma_.lower()
        for token in doc 
        if 
            not token.is_stop 
            and not token.is_punct 
            and not token.is_stop 
            and token.text != ' '
            and token.lemma_ not in stopwords
            and len(token.text) > 1 ]
    
    return words

In [11]:
tasks = df['text_clean'].values.tolist()
result = progress_map(tokenize, tasks, n_cpu=7, chunk_size=1, core_progress=True)

df['tokens'] = result

Core 1:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 2:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 3:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 4:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 5:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 6:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 7:   0%|          | 0/1113 [00:00<?, ?it/s]

In [12]:
# df['tokens_str'] = df['tokens'].progress_apply(
#     lambda x: ' '.join(x))

tasks = df['tokens'].values.tolist()
result = progress_map(' '.join, tasks, n_cpu=7, chunk_size=1, core_progress=True)

df['tokens_str'] = result

Core 1:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 2:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 3:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 4:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 5:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 6:   0%|          | 0/1113 [00:00<?, ?it/s]

Core 7:   0%|          | 0/1113 [00:00<?, ?it/s]

In [14]:
df.columns

Index(['assestment', 'author', 'text_clean', 'TEXT_POS', 'sentiment_all',
       'sentiment_avg', 'uniq_words', 'uniq_lemm', 'err', 'net', 'ADJ', 'ADV',
       'NOUN', 'words_start_upper', 'words_full_upper', 'exclamation_marks',
       'question_marks', 'upper_letters', 'chars', 'tokens', 'tokens_str'],
      dtype='object')

In [15]:
df[
    ['assestment', 'author', 'text_clean', 
     'sentiment_all', 'sentiment_avg', 'uniq_words', 'uniq_lemm', 'err', 'net', 
     'ADJ', 'ADV', 'NOUN', 'tokens_str']
].to_csv('../datasets/ready2use/fake_news_features_tokens.csv', sep=';', index=False, encoding='utf8')