In [47]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import gensim
import spacy
import nltk

from sentimentpl.models import SentimentPLModel

import re
import unicodedata

from tqdm import tqdm
tqdm.pandas()

import sys
sys.path.append('..')

from helpers.dataset import (
    deal_with_polish_sign,
    clean_text,
    get_stylometric_features
)

nlp_core = spacy.load("pl_core_news_lg")
model_sent = SentimentPLModel(from_pretrained='latest')
stopwords = nlp_core.Defaults.stop_words

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read data & clean text

In [48]:
df = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-PL.tsv', sep='\t')

df['text_clean'] = df['statementText'].apply(lambda x: clean_text(x))
df['text_clean'] = df['text_clean'].apply(lambda x: deal_with_polish_sign(x))

# Filter not TRUE/FALSE cases

In [49]:
df['statementState'].value_counts()

statementState
TRUE            1761
FALSE            648
MISLEADING       313
UNVERIFIABLE     113
Name: count, dtype: int64

In [50]:
df = df[ df['statementState'] != 'MISLEADING' ]
df = df[ df['statementState'] != 'UNVERIFIABLE' ]

df = df.reset_index(drop=True)

df['assestment'] = df['statementState'].replace({
    'FALSE' : 1,
    'TRUE' : 0
}).astype(int)

# Stylommetric features

In [51]:
df = get_stylometric_features(df,  nlp_core, model_sent, stopwords, 'text_clean', rerun_all=False)

## Add WORDS ##


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2409/2409 [00:17<00:00, 138.19it/s]


## Add POS ##


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2409/2409 [00:16<00:00, 142.52it/s]


In [52]:
df.columns

Index(['politicianID', 'name', 'party', 'statementID', 'statementText',
       'statementState', 'statementExplanClean', 'statementExplan',
       'text_clean', 'assestment', 'TEXT_WORD', 'TEXT_POS'],
      dtype='object')

## Save stylommetric features

In [55]:
target_column = ['assestment']

stylo_features = [
    'avg_word_len', 'n_words', 'n_char', 'n_special_char',
    'avg_n_vowels_per_word', 'hapax_legomena', 'hapax_dislegemena',
    'honore_r', 'sichel_s', 'brunet_w', 'yule_k', 'shannon_entropy',
    'simpson_idx_d', 'type_token_ratio', 'FR_score', 'FKG_level',
    'Gunning_Fog_index', 'sentiment_all', 'sentiment_avg', 'n_stop_words',
    'n_ent', 'p_adj', 'n_adj', 'p_adv', 'n_adv', 'p_noun', 'n_noun'
]

cols_for_other_f = ['text_clean', 'TEXT_WORD', 'TEXT_POS']

In [56]:
# df[target_column+stylo_features].to_parquet('../datasets/used_data/02_classical_ml/02_01_benchmark_styllometric_features.parquet')

# Ngrams POS

In [57]:
n_grams = 5
min_pos = 5 

In [59]:
df_pos = df[target_column].copy()

words =  sum(df['TEXT_POS'].str.split(' ').values.tolist(), [])

n_list = []
for n in range(n_grams):
    n_i = pd.Series(nltk.ngrams(words, n+1)).value_counts()
    n_i = n_i[n_i>min_pos]
    n_list.append(n_i)

n_iterator = []
for n_i in n_list:
    n_iterator += n_i.index.tolist()
    
col = {}
    
for n in tqdm(n_iterator):
    x = df['TEXT_POS'].str.count(' '.join(n)) / df['TEXT_POS'].str.split(' ').str.len()

    col[' '.join(n)] = x
    col[' '.join(n)].name = ' '.join(n)
            
df_pos = pd.concat( [df_pos] + list( col.values() ), axis=1 )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4649/4649 [00:24<00:00, 190.59it/s]


In [64]:
df_pos.shape

(2409, 4650)

In [66]:
df_pos.to_parquet('../datasets/used_data/02_classical_ml/02_02_benchmark_POS_ngrams.parquet')

# Ngram words

In [68]:
df_ngram = df[target_column + ['TEXT_WORD']].copy()

words =  sum(df['TEXT_WORD'].str.split(' ').values.tolist(), [])

n_list = []
for n in range(n_grams):
    n_i = pd.Series(nltk.ngrams(words, n+1)).value_counts()
    n_i = n_i[n_i>min_pos]
    n_list.append(n_i)

n_iterator = []
for n_i in n_list:
    n_iterator += n_i.index.tolist()

col = {}
    
for n in tqdm(n_iterator):
    x = df['TEXT_WORD'].str.count(' '.join(n)) / df['TEXT_WORD'].str.split(' ').str.len()

    col[' '.join(n)] = x
    col[' '.join(n)].name = ' '.join(n)
            
df_ngram = pd.concat( [df_ngram] + list( col.values() ), axis=1 )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1172/1172 [00:05<00:00, 206.61it/s]


In [69]:
df_ngram.shape

(2409, 1174)

In [70]:
df_ngram.to_parquet('../datasets/used_data/02_classical_ml/02_03_benchmark_words_ngrams.parquet')