In [14]:
# import time
# import torch
import random
import numpy as np
import pandas as pd
# import torch.nn as nn
# import torch.optim as optim
# from tqdm.notebook import tqdm
# import matplotlib.pyplot as plt

# from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.base import clone as sklearn_clone

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score,
    precision_score,
    roc_auc_score, confusion_matrix, roc_curve, classification_report
)

# from sentence_transformers import SentenceTransformer
# from transformers import AutoTokenizer, AutoModel, HerbertTokenizer, BatchEncoding

# import gc
# from sklearn.neighbors import KNeighborsClassifier

# import scipy
# from scipy import spatial

# import umap.umap_ as umap

import spacy
import stylo_metrix

nlp = spacy.load('pl_nask_large')         # for Polish
nlp.add_pipe("stylo_metrix")

<stylo_metrix.pipeline.stylo_metrix_pipe.StyloMetrixPipe at 0x7f846d33cbe0>

In [15]:
np.random.seed(111)
random.seed(111)

## Get data

In [16]:
df_topics = pd.read_csv('../datasets/ready2use/topics.csv', index_col=0)

In [17]:
df = pd.read_csv('../datasets/ready2use/fake_news_features_combined.csv', sep=';')

df = df[ df['assestment'] != 'brak' ]

df.loc[:, 'assestment'] = df['assestment'].replace({
    'falsz' : 'Fałsz',
    'zbity_zegar' : 'Fałsz',
    'raczej_falsz' : 'Fałsz',
    'prawda' : 'Prawda',
    'blisko_prawdy' : 'Prawda',
    'polprawda' : 'Manipulacja',
    'Częściowy fałsz' : 'Manipulacja'
})

df = df[ df['assestment'] != 'Nieweryfikowalne' ]
df = df[ df['assestment'] != 'Manipulacja' ]

df['assestment'] = df['assestment'].replace({
    'Fałsz' : 0,
#     'Manipulacja' : 1,
    'Prawda' : 1
}).astype(int)

df = df.copy()[['assestment', 'text_clean']][df.index.isin(df_topics.index)].reset_index(drop=True)

## Kfold

In [19]:
cv_fold = []
cv_fold_i = []

for i in df_topics['topic'].unique().reshape(10,-1):
    train_cv = df_topics.index[ ~np.isin(df_topics["topic"], [i, np.mod(i+1,10)]) ].values
    val_cv = df_topics.index[ ~np.isin(df_topics["topic"], np.mod(i+1,10)) ].values
    test_cv = df_topics.index[ np.isin(df_topics["topic"], i) ].values
    
    train_cv_i = df_topics.reset_index().index[ ~np.isin(df_topics["topic"], [i, np.mod(i+1,10)]) ].values
    val_cv_i = df_topics.reset_index().index[ ~np.isin(df_topics["topic"], np.mod(i+1,10)) ].values
    test_cv_i = df_topics.reset_index().index[ np.isin(df_topics["topic"], i) ].values
    
    cv_fold.append( [train_cv, val_cv, test_cv])
    cv_fold_i.append( [train_cv_i, val_cv_i, test_cv_i])

In [20]:
kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(df_topics)

cv_Kfold = []
cv_Kfold_i = []

for train_index, test_index in kf.split(df_topics):
    train_index, val_index = train_test_split(train_index, test_size=1/9, shuffle=True)
    train_cv = df_topics.iloc[ train_index, : ].index.values
    val_cv = df_topics.iloc[ val_index, : ].index.values
    test_cv = df_topics.iloc[ test_index, : ].index.values

    train_cv_i= df_topics.reset_index().iloc[ train_index, : ].index.values
    val_cv_i = df_topics.reset_index().iloc[ val_index, : ].index.values
    test_cv_i = df_topics.reset_index().iloc[ test_index, : ].index.values
    
    cv_Kfold.append( [train_cv, val_cv, test_cv])
    cv_Kfold_i.append( [train_cv_i, val_cv_i, test_cv_i])

## Train

In [58]:
txt = df['text_clean'].values[0]

In [59]:
def get_vec_stylo(txt):
    doc = nlp(txt)
    vec = []
    for metric in doc._.smv:
        vec.append(metric['value'])
    return vec

In [69]:
emb_style = [get_vec_stylo(t) for t in tqdm(df['text_clean'].values, position=0, leave=True)]

100%|███████████████████████████████████████| 6541/6541 [29:08<00:00,  3.74it/s]


In [83]:
np.array(emb_style).shape

(6541, 89)

In [73]:
with open('../datasets/ready2use/style_emb_pl.npy', 'wb') as f:
    np.save(f, np.array(emb_style) )

In [74]:
clf_lr_1 = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')

In [75]:
train_index, val_index, test_index = cv_fold_i[np.random.randint(10)]

In [80]:
results = {
    'test_accuracy' : [],
    'test_precision' : [],
    'test_recall' : [],
    'test_f1' : []
}


y_train_t = df['assestment'].values[train_index]
X_train_t = np.array(emb_style)[train_index,:]
y_test_t = df['assestment'].values[train_index]
X_test_t = np.array(emb_style)[train_index,:]

clf_lr_1.fit(X_train_t, y_train_t)

y_pred = clf_lr_1.predict(X_test_t)

results['test_accuracy'].append( accuracy_score(y_test_t, y_pred) ) 
results['test_precision'].append( precision_score(y_test_t, y_pred) ) 
results['test_recall'].append( recall_score(y_test_t, y_pred) ) 
results['test_f1'].append( f1_score(y_test_t, y_pred) ) 

metrics = {
    "Accuracy": np.array(results['test_accuracy']),
#     "Precision": np.array(results['test_precision']).mean(),
#     "Recall": np.array(results['test_recall']).mean(),
    "F1 Score":  np.array(results['test_f1']),
    }

In [81]:
confusion_matrix(y_test_t, y_pred)

array([[1825,  909],
       [ 952, 1595]])

In [82]:
metrics

{'Accuracy': array([0.64760462]), 'F1 Score': array([0.63155811])}

# Test using CV

In [None]:
results = {
    'test_accuracy' : [],
    'test_precision' : [],
    'test_recall' : [],
    'test_f1' : []
}

embedding_dims = 100
batch_size = 128
epochs = 1000
lr = 0.001

for j, (train_index, val_index, test_index) in enumerate(cv_fold_i):
    train_ds = FakeNews(embeddings_table, df['assestment'].values, train_index)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    
    val_ds = FakeNews(embeddings_table, df['assestment'].values, val_index)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True, num_workers=2)

    test_ds = FakeNews(embeddings_table, df['assestment'].values, test_index)
    test_loader = DataLoader(test_ds, batch_size=batch_size//2, shuffle=False, num_workers=2)

    
    model = Network(embedding_dims)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = TripletLoss()

    val_prev = np.inf
    
    model.train()
    for epoch in tqdm(range(epochs), desc=f"Epochs {j}"):
        running_loss = []
        for step, (anchor_claim, positive_claim, negative_claim, anchor_label) in enumerate(train_loader):
            anchor_claim = anchor_claim.to(device)
            positive_claim = positive_claim.to(device)
            negative_claim = negative_claim.to(device)

            optimizer.zero_grad()
            anchor_out = model(anchor_claim)
            positive_out = model(positive_claim)
            negative_out = model(negative_claim)

            loss = criterion(anchor_out, positive_out, negative_out)
            loss.backward()
            optimizer.step()

            running_loss.append(loss.cpu().detach().numpy())

        model.eval()
    
        val_loss = []
        for anchor_claim, positive_claim, negative_claim, _ in val_loader:
            anchor_claim = anchor_claim.to(device)
            positive_claim = positive_claim.to(device)
            negative_claim = negative_claim.to(device)

            anchor_out = model(anchor_claim)
            positive_out = model(positive_claim)
            negative_out = model(negative_claim)

            loss = criterion(anchor_out, positive_out, negative_out)
            val_loss.append(loss.cpu().detach().numpy())

        model.train()
        
        if np.mean(val_loss) < val_prev:
            val_prev = np.mean(val_loss)
            torch.save(model, f'model_cv{j}.pt')
        
        if epoch%100 == 0:
            print(f"{j} Epoch: {epoch+1}/{epochs} - Train Loss: {np.mean(running_loss):.4f};",
                  f" Val Loss: {np.mean(val_loss):.4f} Val loss best {val_prev:.4f}"
             )
            
    
    train_results = []
    labels = []

    model.eval()
    with torch.no_grad():
        for claim, _, _, label in train_loader:
            anchor_claim = claim.to(device)

            train_results.append(model(anchor_claim).cpu().numpy())
            labels.append(label)


    train_results = np.concatenate(train_results) 
    labels = np.concatenate(labels)

    
    test_results = []
    test_labels = []

    model.eval()
    with torch.no_grad():
        for claim, _, _, label in test_loader:
            anchor_claim = claim.to(device)

            test_results.append(model(anchor_claim).cpu().numpy())
            test_labels.append(label)


    test_results = np.concatenate(test_results)
    test_labels = np.concatenate(test_labels)

    
    clf_lr_1 = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')

    y_train_t = labels
    X_train_t = train_results
    y_test_t = test_labels
    X_test_t = test_results

    clf_lr_1.fit(X_train_t, y_train_t)

    y_pred = clf_lr_1.predict(X_test_t)

    results['test_accuracy'].append( accuracy_score(y_test_t, y_pred) ) 
    results['test_precision'].append( precision_score(y_test_t, y_pred) ) 
    results['test_recall'].append( recall_score(y_test_t, y_pred) ) 
    results['test_f1'].append( f1_score(y_test_t, y_pred) ) 


out = {
    "Accuracy": np.array(results['test_accuracy']),
#     "Precision": np.array(results['test_precision']).mean(),
#     "Recall": np.array(results['test_recall']).mean(),
    "F1 Score":  np.array(results['test_f1']),
    }

print(
    'triplet loss lr C1',
    f'Accuracy {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f}',
    f'F1 Score {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}',
    f' {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f} | {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}'
)    

### Get use best models

In [None]:
results = {
    'test_accuracy' : [],
    'test_precision' : [],
    'test_recall' : [],
    'test_f1' : []
}


for j, (train_index, val_index, test_index) in enumerate(cv_fold_i):
    train_ds = FakeNews(embeddings_table, df['assestment'].values, train_index)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    
    val_ds = FakeNews(embeddings_table, df['assestment'].values, val_index)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True, num_workers=2)

    test_ds = FakeNews(embeddings_table, df['assestment'].values, test_index)
    test_loader = DataLoader(test_ds, batch_size=batch_size//2, shuffle=False, num_workers=2)

    
    model = Network(embedding_dims)
    model = torch.load(f'model_cv{j}.pt')
    model = model.to(device)

    model.eval()
   
    train_results = []
    labels = []

    model.eval()
    with torch.no_grad():
        for claim, _, _, label in train_loader:
            anchor_claim = claim.to(device)

            train_results.append(model(anchor_claim).cpu().numpy())
            labels.append(label)


    train_results = np.concatenate(train_results) 
    labels = np.concatenate(labels)

    
    test_results = []
    test_labels = []

    model.eval()
    with torch.no_grad():
        for claim, _, _, label in test_loader:
            anchor_claim = claim.to(device)

            test_results.append(model(anchor_claim).cpu().numpy())
            test_labels.append(label)


    test_results = np.concatenate(test_results)
    test_labels = np.concatenate(test_labels)

    
    clf_lr_1 = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')

    y_train_t = labels
    X_train_t = train_results
    y_test_t = test_labels
    X_test_t = test_results

    clf_lr_1.fit(X_train_t, y_train_t)

    y_pred = clf_lr_1.predict(X_test_t)

    results['test_accuracy'].append( accuracy_score(y_test_t, y_pred) ) 
    results['test_precision'].append( precision_score(y_test_t, y_pred) ) 
    results['test_recall'].append( recall_score(y_test_t, y_pred) ) 
    results['test_f1'].append( f1_score(y_test_t, y_pred) ) 


out = {
    "Accuracy": np.array(results['test_accuracy']),
#     "Precision": np.array(results['test_precision']).mean(),
#     "Recall": np.array(results['test_recall']).mean(),
    "F1 Score":  np.array(results['test_f1']),
    }

print(
    'triplet loss lr C1',
    f'Accuracy {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f}',
    f'F1 Score {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}',
    f' {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f} | {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}'
)    

In [None]:
results = {
    'test_accuracy' : [],
    'test_precision' : [],
    'test_recall' : [],
    'test_f1' : []
}

embedding_dims = 100
batch_size = 128
epochs = 1000
lr = 0.001

for j, (train_index, val_index, test_index) in enumerate(cv_Kfold_i):
    train_ds = FakeNews(embeddings_table, df['assestment'].values, train_index)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
    
    val_ds = FakeNews(embeddings_table, df['assestment'].values, val_index)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True, num_workers=2)

    test_ds = FakeNews(embeddings_table, df['assestment'].values, test_index)
    test_loader = DataLoader(test_ds, batch_size=batch_size//2, shuffle=False, num_workers=2)

    
    model = Network(embedding_dims)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = TripletLoss()

    val_prev = np.inf
    
    model.train()
    for epoch in tqdm(range(epochs), desc=f"Epochs {j}"):
        running_loss = []
        for step, (anchor_claim, positive_claim, negative_claim, anchor_label) in enumerate(train_loader):
            anchor_claim = anchor_claim.to(device)
            positive_claim = positive_claim.to(device)
            negative_claim = negative_claim.to(device)

            optimizer.zero_grad()
            anchor_out = model(anchor_claim)
            positive_out = model(positive_claim)
            negative_out = model(negative_claim)

            loss = criterion(anchor_out, positive_out, negative_out)
            loss.backward()
            optimizer.step()

            running_loss.append(loss.cpu().detach().numpy())

        model.eval()
    
        val_loss = []
        for anchor_claim, positive_claim, negative_claim, _ in val_loader:
            anchor_claim = anchor_claim.to(device)
            positive_claim = positive_claim.to(device)
            negative_claim = negative_claim.to(device)

            anchor_out = model(anchor_claim)
            positive_out = model(positive_claim)
            negative_out = model(negative_claim)

            loss = criterion(anchor_out, positive_out, negative_out)
            val_loss.append(loss.cpu().detach().numpy())

        model.train()
        
        if np.mean(val_loss) < val_prev:
            val_prev = np.mean(val_loss)
            torch.save(model, f'model_cv{j}.pt')
        
        if epoch%100 == 0:
            print(f"{j} Epoch: {epoch+1}/{epochs} - Train Loss: {np.mean(running_loss):.4f};",
                  f" Val Loss: {np.mean(val_loss):.4f} Val loss best {val_prev:.4f}"
             )
            
    
    train_results = []
    labels = []

    model.eval()
    with torch.no_grad():
        for claim, _, _, label in train_loader:
            anchor_claim = claim.to(device)

            train_results.append(model(anchor_claim).cpu().numpy())
            labels.append(label)


    train_results = np.concatenate(train_results) 
    labels = np.concatenate(labels)

    
    test_results = []
    test_labels = []

    model.eval()
    with torch.no_grad():
        for claim, _, _, label in test_loader:
            anchor_claim = claim.to(device)

            test_results.append(model(anchor_claim).cpu().numpy())
            test_labels.append(label)


    test_results = np.concatenate(test_results)
    test_labels = np.concatenate(test_labels)

    
    clf_lr_1 = LogisticRegression(max_iter=5000, C=1, penalty='l2', solver='liblinear')

    y_train_t = labels
    X_train_t = train_results
    y_test_t = test_labels
    X_test_t = test_results

    clf_lr_1.fit(X_train_t, y_train_t)

    y_pred = clf_lr_1.predict(X_test_t)

    results['test_accuracy'].append( accuracy_score(y_test_t, y_pred) ) 
    results['test_precision'].append( precision_score(y_test_t, y_pred) ) 
    results['test_recall'].append( recall_score(y_test_t, y_pred) ) 
    results['test_f1'].append( f1_score(y_test_t, y_pred) ) 


out = {
    "Accuracy": np.array(results['test_accuracy']),
#     "Precision": np.array(results['test_precision']).mean(),
#     "Recall": np.array(results['test_recall']).mean(),
    "F1 Score":  np.array(results['test_f1']),
    }

print(
    'triplet loss lr C1',
    f'Accuracy {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f}',
    f'F1 Score {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}',
    f' {out["Accuracy"].mean():.3f}+-{out["Accuracy"].std():.3f} | {out["F1 Score"].mean():.3f}+-{out["F1 Score"].std():.3f}'
)    