In [5]:
from polyglot.text import Text

import pandas as pd
import numpy as np

from tqdm import tqdm
# Enable tqdm's progress-bar integration with pandas.
tqdm.pandas()
from pandarallel import pandarallel
# Start pandarallel with 7 workers and a visible progress bar.
pandarallel.initialize(nb_workers=7,progress_bar=True)

from parallelbar import progress_map

# NOTE(review): the star import hides where names come from. RDRPOSTagger,
# readDictionary and clean_przyp are presumably provided by utils — confirm
# and consider importing them explicitly.
from utils import *

# NOTE(review): the statements below load the *French* RDRPOSTagger model and
# tag a sample sentence. The cells shown in this part of the notebook tag Czech
# text with polyglot and never reference `r` or `DICT` — confirm whether this
# demo is still needed here.
r = RDRPOSTagger()

# Load the POS tagging model for French

r.constructSCRDRtreeFromRDRfile("../Models/POS/French.RDR")

# Load the lexicon for French

DICT = readDictionary("../Models/POS/French.DICT")

# Tag a tokenized/word-segmented sentence

r.tagRawSentence(DICT, "Cette annonce a fait l' effet d' une véritable bombe .") 

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Prepare POS CZ data

In [2]:
def extract_features_cz(txt):
    """Return the POS tags of a Czech text, skipping spaces and punctuation.

    Args:
        txt: Raw text to tag; it is parsed by polyglot's ``Text`` with the
            language hint ``'cs'``.

    Returns:
        list: The tag (second element) of every entry in ``pos_tags``, in
        document order, excluding entries tagged ``'SPACE'`` or ``'PUNCT'``.
    """
    ignored_tags = ('SPACE', 'PUNCT')
    parsed = Text(txt, hint_language_code='cs')
    return [entry[1] for entry in parsed.pos_tags if entry[1] not in ignored_tags]

In [6]:
# Load the Czech statements from the tab-separated export.
df_cz_cz = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-CZ.tsv', sep='\t')

# Clean every statement with the project helper `clean_przyp`.
df_cz_cz['text_clean'] = df_cz_cz['statementText'].apply(clean_przyp)

# POS-tag the cleaned statements in parallel (7 CPUs, one text per chunk,
# with a progress bar per core).
tasks = df_cz_cz['text_clean'].tolist()
result = progress_map(
    extract_features_cz,
    tasks,
    n_cpu=7,
    chunk_size=1,
    core_progress=True,
)

Core 1:   0%|          | 0/1298 [00:00<?, ?it/s]

Core 2:   0%|          | 0/1298 [00:00<?, ?it/s]

Core 3:   0%|          | 0/1298 [00:00<?, ?it/s]

Core 4:   0%|          | 0/1298 [00:00<?, ?it/s]

Core 5:   0%|          | 0/1298 [00:00<?, ?it/s]

Core 6:   0%|          | 0/1298 [00:00<?, ?it/s]

Core 7:   0%|          | 0/1298 [00:00<?, ?it/s]

In [7]:
# Collapse each statement's list of tags into one space-separated string and
# store it in the TEXT_POS column (one entry of `result` per row).
df_cz_cz['TEXT_POS'] = pd.Series(result, index=df_cz_cz.index, dtype=object).str.join(" ")

In [8]:
# Inspect the space-joined POS-tag strings (pandas truncates the display).
df_cz_cz['TEXT_POS']

0       PROPN PROPN PROPN PROPN NOUN PROPN NOUN ADP PR...
1       PROPN PROPN PROPN ADV PROPN PROPN CONJ PROPN P...
2       PROPN PRON NOUN PROPN PROPN PROPN PROPN PROPN ...
3       PROPN PROPN PROPN PROPN PRON PRON PROPN PROPN ...
4       PART ADV PROPN ADP PROPN PROPN PROPN PROPN PRO...
                              ...                        
9077                           PRON PROPN PROPN ADP PROPN
9078    PROPN PRON PROPN CONJ NOUN PROPN PROPN NOUN PR...
9079    PRON PROPN PROPN PRON PRON PROPN ADP NOUN NOUN...
9080    PROPN PRON PROPN ADJ NOUN PROPN ADJ NOUN ADP N...
9081    PROPN DET ADJ NOUN PROPN CONJ ADJ NOUN ADJ NUM...
Name: TEXT_POS, Length: 9082, dtype: object

In [9]:
# Persist the frame, including TEXT_POS, as a semicolon-separated UTF-8 CSV
# without the index column.
df_cz_cz.to_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';', index=False, encoding='utf8')

In [11]:
# Sanity check: (rows, columns) of the frame that was just written out.
df_cz_cz.shape

(9082, 10)

## Use CZ data as training data

In [18]:
# List the distinct verdict labels present in the Czech data before filtering.
# This reads df_cz_cz rather than `df`: `df` is only created in the following
# cell, so referencing it here raises NameError on a fresh kernel.
df_cz_cz['statementState'].unique()

array(['TRUE', 'UNVERIFIABLE', 'FALSE', 'MISLEADING', 'null'],
      dtype=object)

In [19]:
# Normalise the verdict labels so stray whitespace does not break matching.
df_cz_cz['statementState'] = df_cz_cz['statementState'].str.strip()

# Keep only statements with a binary verdict by dropping the three other labels
# in a single pass.
df_cz_cz = df_cz_cz[~df_cz_cz['statementState'].isin(['MISLEADING', 'UNVERIFIABLE', 'null'])]

df = df_cz_cz.reset_index(drop=True)

# Encode the target: FALSE -> 0, TRUE -> 1. `map` produces the integers
# directly, avoiding the object-downcasting path of `replace` that newer pandas
# versions deprecate. Any label outside the mapping becomes NaN, so
# `astype(int)` still fails loudly instead of passing bad labels through.
df['assestment'] = df['statementState'].map({'FALSE': 0, 'TRUE': 1}).astype(int)

# Copy only what is needed instead of duplicating the whole frame twice.
y_train = df['assestment'].copy()
# NOTE(review): X_train still contains 'statementState', the column the target
# is derived from — make sure it is not fed to a model as a feature.
X_train = df.drop(columns='assestment')

In [22]:
# Check the label distribution of the training target (1 = TRUE, 0 = FALSE).
y_train.value_counts()

1    5669
0    1222
Name: assestment, dtype: int64