In [2]:
from polyglot.text import Text

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7,progress_bar=True)

from parallelbar import progress_map

from utils import *

from nltk import ngrams
import nltk

import scipy
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)


INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import MultiIndex, Int64Index


## POS CZ data

In [3]:
def extract_features_cz(txt):
    
    doc = Text(txt, hint_language_code='cs')

    txt_pos = []
    
    for token in doc.pos_tags:
        if token[1] not in ['SPACE', 'PUNCT']:
            txt_pos.append(token[1])            
    
    return txt_pos

In [4]:
if False:
    df_cz_cz = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-CZ.tsv', sep='\t')

    df_cz_cz['text_clean'] = df_cz_cz['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_cz['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_cz['TEXT_POS'] = result

    df_cz_cz['TEXT_POS'] = df_cz_cz['TEXT_POS'].str.join(" ")
    
    df_cz_cz.to_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_cz = pd.read_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';')

### The same code for SK data

In [5]:
if True:
    df_cz_sk = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-SK.tsv', sep='\t')

    df_cz_sk['text_clean'] = df_cz_sk['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_sk['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_sk['TEXT_POS'] = result

    df_cz_sk['TEXT_POS'] = df_cz_sk['TEXT_POS'].str.join(" ")
    
    df_cz_sk.to_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_sk = pd.read_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';')

Core 1:   0%|          | 0/1794 [00:00<?, ?it/s]

Core 2:   0%|          | 0/1794 [00:00<?, ?it/s]

Core 3:   0%|          | 0/1794 [00:00<?, ?it/s]

Core 4:   0%|          | 0/1794 [00:00<?, ?it/s]

Core 5:   0%|          | 0/1794 [00:00<?, ?it/s]

Core 6:   0%|          | 0/1794 [00:00<?, ?it/s]

Core 7:   0%|          | 0/1794 [00:00<?, ?it/s]

In [6]:
df_cz_sk.shape

(12554, 10)

In [7]:
df_cz_cz.shape

(9082, 10)

In [8]:
df_cz = pd.concat([df_cz_sk, df_cz_sk])
df_cz.shape

(25108, 10)

In [9]:
df_cz['statementState'] = df_cz['statementState'].str.strip()

df_cz = df_cz[ df_cz['statementState'] != 'MISLEADING' ]
df_cz = df_cz[ df_cz['statementState'] != 'UNVERIFIABLE' ]
df_cz = df_cz[ df_cz['statementState'] != 'null' ]

df_cz = df_cz.reset_index(drop=True)

df_cz['assestment'] = df_cz['statementState'].replace({
    'FALSE' : 0,
#     'Manipulacja' : 1,
    'TRUE' : 1
}).astype(int)

In [10]:
df_cz['assestment'].value_counts()

1    15974
0     3340
Name: assestment, dtype: int64

## POS ENG data

In [11]:
nlp_core_en = spacy.load("en_core_web_lg")
def extract_features_en(txt, nlp_core=nlp_core_en):
    
    doc = nlp_core(txt)
    
    txt_pos = []
    for token in doc:
        if token.pos_ not in ['SPACE', 'PUNCT']:
            txt_pos.append(token.pos_)            
    
    return txt_pos

In [12]:
df_en = pd.read_csv('../datasets/politifact/politifact.csv', sep=',', index_col=0)

df_en.loc[:, 'fact'] = df_en['fact'].replace({
    'half-true' : 'true',
    'mostly-true' : 'true',
    'barely-true' : 'false',
    'pants-fire' : 'false',
})

df_en = df_en[df_en['fact'].isin(['true', 'false'])]

df_en = df_en[['sources_quote', 'fact']]
df_en.columns  = [
    'statement',
    'label'
]

df_en['text_clean'] = df_en['statement'].apply(lambda x: clean_przyp(x))

tasks = df_en['text_clean'].values.tolist()
result = progress_map(extract_features_en, tasks, n_cpu=7, chunk_size=1, core_progress=True)

df_en['TEXT_POS'] = result

df_en['TEXT_POS'] = df_en['TEXT_POS'].str.join(" ")

df_en['assestment'] = df_en['label'].replace({
    'false' : 0,
#     'Manipulacja' : 1,
    'true' : 1
}).astype(int)

Core 1:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 2:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 3:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 4:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 5:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 6:   0%|          | 0/2736 [00:00<?, ?it/s]

Core 7:   0%|          | 0/2736 [00:00<?, ?it/s]

In [13]:
# df_en = pd.concat(
# [
#     pd.read_csv('../datasets/liar/train.tsv', sep='\t', header=None),
#     pd.read_csv('../datasets/liar/valid.tsv', sep='\t', header=None),
#     pd.read_csv('../datasets/liar/test.tsv', sep='\t', header=None)
# ]
# ).iloc[:,:3]

# df_en.columns = [
#     'id',
#     'label',
#     'statement'
# ]

# df_en.loc[:, 'label'] = df_en['label'].replace({
#     'half-true' : 'true',
#     'mostly-true' : 'true',
#     'barely-true' : 'false',
#     'pants-fire' : 'false',
# })

## Use EN/CZ data as training

In [14]:
df_en.columns

Index(['statement', 'label', 'text_clean', 'TEXT_POS', 'assestment'], dtype='object')

In [15]:
df_cz.columns

Index(['politicianID', 'name', 'party', 'statementID', 'statementText',
       'statementState', 'statementExplanClean', 'statementExplan    ',
       'text_clean', 'TEXT_POS', 'assestment'],
      dtype='object')

In [16]:
df_all = pd.concat([df_en[['assestment', 'TEXT_POS']], df_cz[['assestment', 'TEXT_POS']]])

In [17]:
df_all['assestment'].value_counts()

1    25033
0    13432
Name: assestment, dtype: int64

In [18]:
y_train = df_all.copy()['assestment']
X_train = df_all.copy()[['TEXT_POS']]

In [19]:
# y_train = df_cz.copy()['assestment']
# X_train = df_cz.copy().loc[:, df_cz.columns != 'assestment']

In [20]:
n_grams = 5
min_pos = 5 

In [21]:
X_pos = X_train[['TEXT_POS']].copy()

words =  sum(X_pos['TEXT_POS'].str.split(' ').values.tolist(), [])

n_list = []
for n in range(n_grams):
    n_i = pd.Series(nltk.ngrams(words, n+1)).value_counts()
    n_i = n_i[n_i>min_pos]
    n_list.append(n_i)

n_iterator = []
for n_i in n_list:
    n_iterator += n_i.index.tolist()

In [22]:
def get_pos_features_para(n):
    x = X_pos['TEXT_POS'].str.count(' '.join(n)) / X_pos['TEXT_POS'].str.split(' ').str.len()
    min_v = x.min()
    max_v = x.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(x, y_train)[0]
        if ~np.isnan(r) and np.abs(r) > 0.01: 
            return x
    
    return None

result = progress_map(get_pos_features_para, n_iterator, n_cpu=7, chunk_size=1, core_progress=True)

Core 1:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 2:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 3:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 4:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 5:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 6:   0%|          | 0/3911 [00:00<?, ?it/s]

Core 7:   0%|          | 0/3911 [00:00<?, ?it/s]

In [23]:
col = {}
    
for x, n in tqdm(zip(result, n_iterator)):
    if x is not None:
        col[' '.join(n)] = x
        col[' '.join(n)].name = ' '.join(n)

27373it [00:00, 624639.88it/s]


In [24]:
X_pos = pd.concat( [X_pos] + list( col.values() ), axis=1 ).drop('TEXT_POS', axis=1)

In [25]:
out_pos = X_pos.join(y_train)

In [26]:
# out_pos.to_csv('../datasets/ready2use/pos_en_cz_train_dataset.csv',  
#                chunksize=100000,
#                compression='gzip',
#                sep=';', index=False, encoding='utf8')

In [31]:
# out_pos.to_hdf(r'../datasets/ready2use/pos_en_cz_train.h5', key='stage', mode='w')

In [29]:
# np.savetxt('../datasets/ready2use/pos_en_cz_train_header.csv', out_pos.columns.values, delimiter=";", fmt="%s")
# np.savetxt('../datasets/ready2use/pos_en_cz_train_data.csv', out_pos.values, delimiter=";")

### Read training data

In [None]:
# X_pos = pd.read_hdf('../datasets/ready2use/pos_en_cz_train.h5', key='stage', mode='r')
# y_train = X_pos['assestment']
# X_pos = X_pos.drop('assestment', axis=1)

In [95]:
col_keep = []
for c in tqdm(X_pos.columns):
    min_v = X_pos[c].values.min()
    max_v = X_pos[c].values.max()

    if min_v < max_v:
        r = scipy.stats.pearsonr(X_pos[c].values, y_train)[0]
        if ~np.isnan(r) and np.abs(r) > 0.1:
            col_keep.append(c)
            
len(col_keep)

100%|█████████████████████████████████████| 9219/9219 [00:07<00:00, 1212.12it/s]


87

### Load test data

In [85]:
y_train.reset_index(drop=True, inplace=True)
X_pos.reset_index(drop=True, inplace=True)

In [32]:
X_test = pd.read_csv('../datasets/ready2use/pos_pl_dataset.csv', sep=';')
y_test = X_test['assestment']
X_test = X_test.drop('assestment', axis=1)

### Keep cols

In [96]:
col_keep_test = np.array(col_keep)[np.isin(col_keep, X_test.columns.values)]
col_keep_test.shape

(69,)

In [97]:
n_lower = y_train.value_counts().min()
n_upper = y_train.value_counts().max()

np.random.seed(111)

# undersampling    
index_0 = np.random.choice(y_train[y_train==0].index, n_lower, replace=False)
index_1 = np.random.choice(y_train[y_train==1].index, n_lower, replace=False)

y_train_u = y_train.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

X_pos_u = X_pos.iloc[ index_0.tolist()+index_1.tolist() ].sort_index()

In [117]:
y_train.value_counts()

1    25033
0    13432
Name: assestment, dtype: int64

In [129]:
class_weight = {
    1 : 1,
    0 : 1 # n_upper/n_lower
}

In [130]:
import warnings
warnings.filterwarnings("ignore")

scoring = ['accuracy', 
           'precision', 'recall', 
           'f1']

clf_lr = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear', 
                            class_weight=class_weight)
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=111)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=111)
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', verbosity=0, silent=True)

for c, n in zip(
    [clf_lr, 
#      clf_gb, 
     clf_rf, 
#      clf_xgb, 
    ], 
    ['log_regC1', 
#      'grd_boost', 
     'rand_frst', 
#      'xgboost  ', 
    ]
):
    results = cross_validate(estimator=c,
                           X=X_pos[col_keep_test],
                           y=y_train,
                           cv=5,
                           scoring=scoring,
                           return_train_score=True)
    print(
        n,
        f'Accuracy {results["test_accuracy"].mean():.3f}',
        f'Precision {results["test_precision"].mean():.3f}',
        f'Recall {results["test_recall"].mean():.3f}',
        f'F1 Score {results["test_f1"].mean():.3f}',
    )

log_regC1 Accuracy 0.588 Precision 0.689 Recall 0.678 F1 Score 0.600
rand_frst Accuracy 0.557 Precision 0.664 Recall 0.660 F1 Score 0.568


In [131]:
# clf_v = VotingClassifier(estimators=[
#     ('lr', clf_lr), 
#     ('gb', clf_gb), 
#     ('rf', clf_rf), 
# #     ('xgb', clf_xgb)
# ], voting='hard')

# results = cross_validate(estimator=clf_v,
#                        X=X_pos[col_keep_test],
#                        y=y_train,
#                        cv=5,
#                        scoring=scoring,
#                        return_train_score=True)
# print(
#     'voting  ',
#     f'Accuracy {results["test_accuracy"].mean():.3f}',
#     f'Precision {results["test_precision"].mean():.3f}',
#     f'Recall {results["test_recall"].mean():.3f}',
#     f'F1 Score {results["test_f1"].mean():.3f}',
# )

In [132]:
clf_out = clf_lr

## Test on PL data

### train -> test

In [133]:
clf_out.fit(X_pos[col_keep_test], y_train)

In [134]:
# scores = clf_lr.predict_proba(X_pos[col_keep_test])[:,1]
# fpr, tpr, thresholds = roc_curve(y_train, scores)

# import matplotlib.pyplot as plt
# plt.title('Receiver Operating Characteristic')
# plt.plot(fpr, tpr, 'b')
# plt.legend(loc = 'lower right')
# plt.plot([0, 1], [0, 1],'r--')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()

In [135]:
# y_pred_lr = clf_lr.predict(X_test[col_keep_test])
# y_pred_gb = clf_gb.predict(X_test[col_keep_test])
# y_pred_rf = clf_rf.predict(X_test[col_keep_test])

# y_pred = []
# for y1, y2, y3 in zip(y_pred_lr, y_pred_gb, y_pred_rf):
#     if y1+y2+y3 > 1:
#         y_pred.append(1)
#     else:
#         y_pred.append(0)

y_pred = clf_out.predict(X_test[col_keep_test])

In [136]:
confusion_matrix(y_test, y_pred)

array([[ 326, 3112],
       [ 184, 2919]])

In [137]:
print(
    n, '\n\t',
    f'Accuracy  {accuracy_score(y_test, y_pred):.3f}\n\t',
    f'Precision {precision_score(y_test, y_pred):.3f}\n\t',
    f'Recall    {recall_score(y_test, y_pred):.3f}\n\t',
    f'F1 Score  {f1_score(y_test, y_pred):.3f}\n\t',
)

rand_frst 
	 Accuracy  0.496
	 Precision 0.484
	 Recall    0.941
	 F1 Score  0.639
	
