In [1]:
# import time
# import torch
import random
import numpy as np
import pandas as pd
# import torch.nn as nn
# import torch.optim as optim
# from tqdm.notebook import tqdm
# import matplotlib.pyplot as plt

# from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.base import clone as sklearn_clone

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)

# from sentence_transformers import SentenceTransformer
# from transformers import AutoTokenizer, AutoModel, HerbertTokenizer, BatchEncoding

# import gc
# from sklearn.neighbors import KNeighborsClassifier

# import scipy
# from scipy import spatial

# import umap.umap_ as umap

import spacy
import stylo_metrix

# Build the text-processing pipeline used by get_vec_stylo(): load the spaCy
# model 'pl_nask_large' and append the "stylo_metrix" component to it.
# NOTE(review): the component presumably provides the `doc._.smv` extension
# read in get_vec_stylo() — confirm against the stylo_metrix documentation.
nlp = spacy.load('pl_nask_large')         # Polish-language model
# Kept as the last expression so the cell displays the added pipe object.
nlp.add_pipe("stylo_metrix")

  from .autonotebook import tqdm as notebook_tqdm


<stylo_metrix.pipeline.stylo_metrix_pipe.StyloMetrixPipe at 0x7fc53013b2e0>

In [2]:
# Seed both global random number generators with the same value so that the
# shuffled splits and the random fold pick further down are repeatable when
# the notebook is run top to bottom.
SEED = 111

random.seed(SEED)
np.random.seed(SEED)

## Get data

In [3]:
# Topic assignment table; the first CSV column becomes the index. Later cells
# read its 'topic' column and use its index to select rows of `df`.
df_topics = pd.read_csv('../datasets/ready2use/topics.csv', index_col=0)

In [4]:
# Load the combined feature table (semicolon-separated).
df = pd.read_csv('../datasets/ready2use/fake_news_features_combined.csv', sep=';')

# Drop rows whose verdict is 'brak'. The explicit .copy() makes the filtered
# frame independent, so the column assignment below is not a chained
# assignment on a slice (the source of SettingWithCopyWarning).
df = df[df['assestment'] != 'brak'].copy()

# Collapse the fine-grained verdicts into three groups:
# 'Fałsz', 'Prawda' and 'Manipulacja'.
df['assestment'] = df['assestment'].replace({
    'falsz' : 'Fałsz',
    'zbity_zegar' : 'Fałsz',
    'raczej_falsz' : 'Fałsz',
    'prawda' : 'Prawda',
    'blisko_prawdy' : 'Prawda',
    'polprawda' : 'Manipulacja',
    'Częściowy fałsz' : 'Manipulacja'
})

# Keep only the binary task: rows labelled 'Nieweryfikowalne' or
# 'Manipulacja' are removed. Copy again before assigning into the result.
df = df[~df['assestment'].isin(['Nieweryfikowalne', 'Manipulacja'])].copy()

# Encode the two remaining classes as integers: 'Fałsz' -> 0, 'Prawda' -> 1.
df['assestment'] = df['assestment'].replace({
    'Fałsz' : 0,
    'Prawda' : 1
}).astype(int)

# Keep the label and text columns for rows whose index also appears in
# df_topics, then renumber rows 0..n-1.
# NOTE(review): later cells index `df` positionally with positions computed
# from df_topics; this presumably requires both frames to hold the same rows
# in the same order — confirm.
df = (
    df.loc[df.index.isin(df_topics.index), ['assestment', 'text_clean']]
    .reset_index(drop=True)
)

## Kfold

In [5]:
# Topic-based folds. For each row `i` of the reshaped unique-topic array:
#   test  = rows whose topic is in `i`
#   val   = rows whose topic is in (i + 1) mod 10
#   train = all remaining rows
# cv_fold holds index labels of df_topics, cv_fold_i the matching 0-based
# row positions.
# NOTE(review): reshape(10, -1) and the mod-10 arithmetic presumably rely on
# the topics being the integers 0..9 — confirm.
cv_fold = []
cv_fold_i = []

row_labels = df_topics.index
row_positions = df_topics.reset_index().index

for i in df_topics['topic'].unique().reshape(10,-1):
    next_topic = np.mod(i+1,10)

    # Each boolean mask is computed once and reused for labels and positions.
    fold_masks = (
        ~np.isin(df_topics["topic"], [i, next_topic]),   # train
        np.isin(df_topics["topic"], next_topic),         # validation
        np.isin(df_topics["topic"], i),                  # test
    )

    train_cv, val_cv, test_cv = (row_labels[m].values for m in fold_masks)
    train_cv_i, val_cv_i, test_cv_i = (row_positions[m].values for m in fold_masks)

    cv_fold.append([train_cv, val_cv, test_cv])
    cv_fold_i.append([train_cv_i, val_cv_i, test_cv_i])

In [6]:
# Shuffled 10-fold split over the rows of df_topics. Every fold's training
# part is split once more so that 1/9 of it (about a tenth of all rows)
# becomes the validation set.
# No random_state is passed, so both shuffles draw from NumPy's global RNG
# (seeded with np.random.seed in an earlier cell); re-running this cell on its
# own therefore produces different folds.
kf = KFold(n_splits=10, shuffle=True)

cv_Kfold = []
cv_Kfold_i = []

row_labels = df_topics.index
row_positions = df_topics.reset_index().index

for train_index, test_index in kf.split(df_topics):
    train_index, val_index = train_test_split(train_index, test_size=1/9, shuffle=True)
    fold_parts = (train_index, val_index, test_index)

    # Index labels of df_topics for each part ...
    train_cv, val_cv, test_cv = (row_labels[part].values for part in fold_parts)
    # ... and the corresponding 0-based row positions.
    train_cv_i, val_cv_i, test_cv_i = (row_positions[part].values for part in fold_parts)

    cv_Kfold.append([train_cv, val_cv, test_cv])
    cv_Kfold_i.append([train_cv_i, val_cv_i, test_cv_i])

## Train

In [7]:
# First cleaned text of the dataset, kept as a sample input
# (e.g. for trying get_vec_stylo by hand).
txt = df['text_clean'].values[0]

In [8]:
def get_vec_stylo(txt):
    """Return the StyloMetrix values of `txt` as a plain list.

    The text is processed by the module-level `nlp` pipeline; the result
    contains the 'value' entry of every metric in `doc._.smv`, in the order
    the pipeline yields them.
    """
    return [metric['value'] for metric in nlp(txt)._.smv]

In [10]:
# Style embeddings: one row of StyloMetrix values per text in df['text_clean'].
# Computing them runs the whole spaCy pipeline over every text, so the result
# is cached on disk and normally just loaded.
STYLE_EMB_PATH = '../datasets/ready2use/style_emb_pl.npy'
RECOMPUTE_STYLE_EMB = False   # set to True to rebuild the cache file

if RECOMPUTE_STYLE_EMB:
    # Plain iteration: tqdm is not imported in this notebook, so wrapping the
    # loop in tqdm(...) would raise NameError as soon as this branch is enabled.
    # The result is converted to an ndarray so that `emb_style` has the same
    # type as in the load branch (the next cell reads `emb_style.shape`).
    emb_style = np.array([get_vec_stylo(t) for t in df['text_clean'].values])

    with open(STYLE_EMB_PATH, 'wb') as f:
        np.save(f, emb_style)
else:
    with open(STYLE_EMB_PATH, 'rb') as f:
        emb_style = np.load(f)

In [11]:
emb_style.shape

(6541, 89)

In [12]:
# Classifier for the single-fold check below: L2-penalised logistic
# regression with C=1, fitted with the liblinear solver.
clf_lr_1 = LogisticRegression(
    penalty='l2',
    C=1,
    solver='liblinear',
    max_iter=5000,
)

In [13]:
# Draw one of the 10 topic-based folds at random (global NumPy RNG) and unpack
# its positional train / validation / test indices.
fold_no = np.random.randint(10)
train_index, val_index, test_index = cv_fold_i[fold_no]

In [15]:
# Single-fold check: fit clf_lr_1 on the training rows of the fold picked
# above and score it on that fold's test rows (the validation rows are unused).
X_style = np.array(emb_style)
y_style = df['assestment'].values

X_train_t, y_train_t = X_style[train_index,:], y_style[train_index]
X_test_t, y_test_t = X_style[test_index,:], y_style[test_index]

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = clf_lr_1.fit(X_train_t, y_train_t).predict(X_test_t)

# Each metric is stored as a one-element list, mirroring the per-fold lists
# built by the cross-validation cells.
results = {
    'test_accuracy' : [accuracy_score(y_test_t, y_pred)],
    'test_precision' : [precision_score(y_test_t, y_pred)],
    'test_recall' : [recall_score(y_test_t, y_pred)],
    'test_f1' : [f1_score(y_test_t, y_pred)],
}

# Only accuracy and F1 are reported.
metrics = {
    "Accuracy": np.array(results['test_accuracy']),
    "F1 Score": np.array(results['test_f1']),
}

In [16]:
confusion_matrix(y_test_t, y_pred)

array([[278, 120],
       [140, 167]])

In [17]:
metrics

{'Accuracy': array([0.63120567]), 'F1 Score': array([0.56228956])}

## Test using CV (topic-based folds)

In [18]:
# Cross-validation over the topic-based folds (cv_fold_i): a fresh logistic
# regression is trained per fold and scored on that fold's test rows.
X_style = np.array(emb_style)
y_style = df['assestment'].values

# The precision/recall lists are kept for layout parity but stay empty here.
results = {
    key: []
    for key in ('test_accuracy', 'test_precision', 'test_recall', 'test_f1')
}

for j, (train_index, val_index, test_index) in enumerate(cv_fold_i):
    X_train_t, y_train_t = X_style[train_index,:], y_style[train_index]
    X_test_t, y_test_t = X_style[test_index,:], y_style[test_index]

    clf_lr_1 = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')
    y_pred = clf_lr_1.fit(X_train_t, y_train_t).predict(X_test_t)

    results['test_accuracy'].append(accuracy_score(y_test_t, y_pred))
    results['test_f1'].append(f1_score(y_test_t, y_pred))

# Per-fold scores as arrays so mean/std can be taken directly.
out = {
    "Accuracy": np.array(results['test_accuracy']),
    "F1 Score": np.array(results['test_f1']),
}

# One summary line: label, mean+-std of accuracy and F1, then the same numbers
# again in "acc | f1" form.
# NOTE(review): the 'triplet loss' tag looks like a leftover from another
# experiment (the features here come from emb_style) — confirm before renaming.
print(
    'triplet loss lr C1',
    f'Accuracy {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f}',
    f'F1 Score {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}',
    f' {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f} | {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}'
)

triplet loss lr C1 Accuracy 0.635+-0.020 F1 Score 0.597+-0.042  0.635+-0.020 | 0.597+-0.042


### Use best models: evaluation on shuffled K-fold splits

In [19]:
# Cross-validation over the shuffled K-fold splits (cv_Kfold_i): a fresh
# logistic regression is trained per fold and scored on that fold's test rows.
X_style = np.array(emb_style)
y_style = df['assestment'].values

# The precision/recall lists are kept for layout parity but stay empty here.
results = {
    key: []
    for key in ('test_accuracy', 'test_precision', 'test_recall', 'test_f1')
}

for j, (train_index, val_index, test_index) in enumerate(cv_Kfold_i):
    X_train_t, y_train_t = X_style[train_index,:], y_style[train_index]
    X_test_t, y_test_t = X_style[test_index,:], y_style[test_index]

    clf_lr_1 = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')
    y_pred = clf_lr_1.fit(X_train_t, y_train_t).predict(X_test_t)

    results['test_accuracy'].append(accuracy_score(y_test_t, y_pred))
    results['test_f1'].append(f1_score(y_test_t, y_pred))

# Per-fold scores as arrays so mean/std can be taken directly.
out = {
    "Accuracy": np.array(results['test_accuracy']),
    "F1 Score": np.array(results['test_f1']),
}

# One summary line: label, mean+-std of accuracy and F1, then the same numbers
# again in "acc | f1" form.
# NOTE(review): the 'triplet loss' tag looks like a leftover from another
# experiment (the features here come from emb_style) — confirm before renaming.
print(
    'triplet loss lr C1',
    f'Accuracy {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f}',
    f'F1 Score {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}',
    f' {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f} | {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}'
)

triplet loss lr C1 Accuracy 0.636+-0.018 F1 Score 0.606+-0.019  0.636+-0.018 | 0.606+-0.019
