In [21]:
from polyglot.text import Text

import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7,progress_bar=True)

from parallelbar import progress_map

from utils import *

from nltk import ngrams
import nltk

import scipy
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)

from sentence_transformers import SentenceTransformer

import random

from datasets import Dataset, DatasetDict
import pandas as pd
import torch

from transformers import AutoTokenizer

from transformers import BertAdapterModel, BertConfig

from transformers import PfeifferConfig

from transformers import TrainingArguments, AdapterTrainer, EvalPrediction
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

torch.cuda.is_available()

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


True

## POS CZ data

In [2]:
if False:
    df_cz_cz = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-CZ.tsv', sep='\t')

    df_cz_cz['text_clean'] = df_cz_cz['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_cz['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_cz['TEXT_POS'] = result

    df_cz_cz['TEXT_POS'] = df_cz_cz['TEXT_POS'].str.join(" ")
    
    df_cz_cz.to_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_cz = pd.read_csv('../datasets/ready2use/fake_news_features_cz_CZ.csv', sep=';')

In [3]:
df_cz_cz.shape

(9082, 10)

### The same code for SK data

In [4]:
if False:
    df_cz_sk = pd.read_csv('../datasets/demagog_nlp_cz/converted-exp-SK.tsv', sep='\t')

    df_cz_sk['text_clean'] = df_cz_sk['statementText'].apply(lambda x: clean_przyp(x))

    tasks = df_cz_sk['text_clean'].values.tolist()
    result = progress_map(extract_features_cz, tasks, n_cpu=7, chunk_size=1, core_progress=True)
    
    df_cz_sk['TEXT_POS'] = result

    df_cz_sk['TEXT_POS'] = df_cz_sk['TEXT_POS'].str.join(" ")
    
    df_cz_sk.to_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';', index=False, encoding='utf8')
else:
    df_cz_sk = pd.read_csv('../datasets/ready2use/fake_news_features_cz_SK.csv', sep=';')

In [5]:
df_cz_sk.shape

(12554, 10)

In [6]:
df_cz = pd.concat([df_cz_sk, df_cz_sk])
df_cz.shape

(25108, 10)

In [7]:
df_cz['statementState'] = df_cz['statementState'].str.strip()

df_cz = df_cz[ df_cz['statementState'] != 'MISLEADING' ]
df_cz = df_cz[ df_cz['statementState'] != 'UNVERIFIABLE' ]
df_cz = df_cz[ df_cz['statementState'] != 'null' ]

df_cz = df_cz.reset_index(drop=True)

df_cz['assestment'] = df_cz['statementState'].replace({
    'FALSE' : 0,
#     'Manipulacja' : 1,
    'TRUE' : 1
}).astype(int)

In [8]:
df_cz['assestment'].value_counts()

1    15974
0     3340
Name: assestment, dtype: int64

## POS ENG data

In [10]:
df_en = pd.read_csv('../datasets/politifact/politifact.csv', sep=',', index_col=0)

df_en.loc[:, 'fact'] = df_en['fact'].replace({
    'half-true' : 'true',
    'mostly-true' : 'true',
    'barely-true' : 'false',
    'pants-fire' : 'false',
})

df_en = df_en[df_en['fact'].isin(['true', 'false'])]

In [11]:
df_en.shape

(19151, 11)

In [12]:
df_en = df_en[['sources_quote', 'fact']]
df_en.columns  = [
    'statement',
    'label'
]

df_en['text_clean'] = df_en['statement'].apply(lambda x: clean_przyp(x))

df_en['assestment'] = df_en['label'].replace({
    'false' : 0,
#     'Manipulacja' : 1,
    'true' : 1
}).astype(int)

## Use EN/CZ data as training

In [15]:
df_all = pd.concat([df_en[['assestment', 'text_clean']], 
                    df_cz[['assestment', 'text_clean']]])

In [16]:
df_all['assestment'].value_counts()

1    25033
0    13432
Name: assestment, dtype: int64

In [33]:
df_all['sentence'] = df_all['text_clean'].apply(lambda x: x.replace('\n',' ').strip())
df_all['labels'] = df_all['assestment'].astype(int)

df_all = df_all.sample(frac=1, random_state=111)

df_all.shape

(38465, 4)

### Load pl data

In [34]:
df = pd.read_csv('../datasets/ready2use/text_celan_pl_dataset.csv',
                 index_col=0,
                 sep=';',header=None,names='sentence labels'.split())

df['sentence'] = df['sentence'].apply(lambda x: x.replace('\n',' ').strip())
df['labels'] = df['labels'].astype(int)

df = df.sample(frac=1, random_state=111)

df.shape

(6542, 2)

In [35]:
tok = AutoTokenizer.from_pretrained('sentence-transformers/LaBSE')

In [36]:
labels = sorted(set(df_all['labels']))
num_labels = len(labels)
label2id = dict(zip(labels,range(num_labels)))
id2label = {v: k for k, v in label2id.items()}

def encode_batch(batch):
    """Encodes a batch of input data using the model tokenizer."""
    batch['labels'] = np.array(batch['labels'])
    return tok(text=batch['sentence'], max_length=64, truncation=True, padding="max_length")

In [37]:
config = BertConfig.from_pretrained(
    'sentence-transformers/LaBSE',
    num_labels=num_labels
)

adapter_config = PfeifferConfig()


training_args = TrainingArguments(
    remove_unused_columns=False, # ensure the dataset labels are properly passed to the model
    
    fp16=True,
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
    label_smoothing_factor=0.0,
    weight_decay=0.05,
    warmup_steps=600,
    output_dir="./training_output",
    overwrite_output_dir=True,
    
#     logging_strategy="no"
    
    logging_steps=200,
    logging_first_step=False,
    logging_dir='./tb_logs',
    evaluation_strategy='steps',
    report_to='tensorboard',
    save_steps=200
)

def compute_accuracy_f1score(p: EvalPrediction):
    preds = [id2label[pred_id] for pred_id in np.argmax(p.predictions, axis=1)]
    target_labels = [id2label[label_id] for label_id in p.label_ids]
    return {"accuracy": accuracy_score(target_labels,preds),
           "f1score": f1_score(target_labels,preds)}

In [None]:
ds = DatasetDict()
ds['train'] = Dataset.from_pandas(df_all)
ds['valid'] = Dataset.from_pandas(df)


ds = ds.map(encode_batch, batched=True, batch_size=len(ds['train']))
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


model = BertAdapterModel.from_pretrained('sentence-transformers/LaBSE', config=config).to('cuda')

adapter_name='fake_news_multilang_0'

model.add_adapter(adapter_name,config=adapter_config)
model.add_classification_head(
    adapter_name,
    id2label=id2label,
    num_labels=num_labels
  )

model.train_adapter(adapter_name)


trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["valid"],
    compute_metrics=compute_accuracy_f1score,
)

trainer.train()

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 38465
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 48090


Step,Training Loss,Validation Loss,Accuracy,F1score
200,0.6435,0.778136,0.469123,0.629784
400,0.5957,0.833887,0.45506,0.59484
600,0.6024,0.759692,0.489147,0.579199
800,0.6312,0.823785,0.461021,0.581732
1000,0.5852,0.954794,0.46255,0.620548
1200,0.5773,0.782983,0.479823,0.59704
1400,0.5785,0.908749,0.477988,0.642894
1600,0.5706,0.874307,0.481198,0.64221
1800,0.6047,0.821193,0.483491,0.640034
2000,0.5751,0.931993,0.474778,0.641785


***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-200
Configuration saved in ./training_output/checkpoint-200/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-200/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-200/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-200/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-200/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-200/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-400
Configuration saved in ./training_output/checkpoint-400/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpo

Module weights saved in ./training_output/checkpoint-2400/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-2400/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-2400/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-2400/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-2400/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-2600
Configuration saved in ./training_output/checkpoint-2600/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-2600/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-2600/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-2600/fake_news_multila

Module weights saved in ./training_output/checkpoint-4600/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-4600/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-4600/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-4800
Configuration saved in ./training_output/checkpoint-4800/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-4800/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-4800/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-4800/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-4800/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-4800/fake_news_mult

Module weights saved in ./training_output/checkpoint-6800/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-7000
Configuration saved in ./training_output/checkpoint-7000/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-7000/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-7000/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-7000/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-7000/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-7000/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-7200
Configuration saved in ./training_ou

Saving model checkpoint to ./training_output/checkpoint-9200
Configuration saved in ./training_output/checkpoint-9200/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-9200/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-9200/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-9200/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-9200/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-9200/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-9400
Configuration saved in ./training_output/checkpoint-9400/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-9400/fake_news_multilang_0/pytorch_adapter.bin
Configurati

Module weights saved in ./training_output/checkpoint-11400/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-11400/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-11400/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-11400/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-11400/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-11600
Configuration saved in ./training_output/checkpoint-11600/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-11600/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-11600/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-11600/fake_ne

Configuration saved in ./training_output/checkpoint-13600/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-13600/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-13600/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-13600/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-13800
Configuration saved in ./training_output/checkpoint-13800/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-13800/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-13800/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-13800/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-13800/fake_

Module weights saved in ./training_output/checkpoint-15800/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-15800/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-15800/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-16000
Configuration saved in ./training_output/checkpoint-16000/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-16000/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-16000/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-16000/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-16000/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-16000/fake

Configuration saved in ./training_output/checkpoint-18000/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-18000/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-18200
Configuration saved in ./training_output/checkpoint-18200/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-18200/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-18200/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-18200/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-18200/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-18200/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch si

Module weights saved in ./training_output/checkpoint-20200/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-20400
Configuration saved in ./training_output/checkpoint-20400/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-20400/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-20400/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-20400/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-20400/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-20400/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-20600
Configuration saved in ./tr

***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-22600
Configuration saved in ./training_output/checkpoint-22600/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-22600/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-22600/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-22600/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-22600/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-22600/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-22800
Configuration saved in ./training_output/checkpoint-22800/fake_news_multilang_0/adapter_config.json
Module weights saved in ./train

Configuration saved in ./training_output/checkpoint-24800/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-24800/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-24800/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-24800/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-24800/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-24800/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-25000
Configuration saved in ./training_output/checkpoint-25000/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-25000/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-25000/fake_

Module weights saved in ./training_output/checkpoint-27000/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-27000/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-27000/fake_news_multilang_0/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-27000/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-27000/fake_news_multilang_0/pytorch_model_head.bin
***** Running Evaluation *****
  Num examples = 6542
  Batch size = 32
Saving model checkpoint to ./training_output/checkpoint-27200
Configuration saved in ./training_output/checkpoint-27200/fake_news_multilang_0/adapter_config.json
Module weights saved in ./training_output/checkpoint-27200/fake_news_multilang_0/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-27200/fake_news_multilang_0/head_config.json
Module weights saved in ./training_output/checkpoint-27200/fake_ne

In [None]:
eval_out = trainer.evaluate()

print('Acc', eval_out['eval_accuracy'])
print('F1', eval_out['eval_f1score'])