In [1]:
# based on this post
# https://medium.com/swlh/few-shot-learning-in-nlp-use-siamese-networks-189de22459d0

In [2]:
import pandas as pd
import numpy as np

In [8]:
import tensorflow as tf
import tensorflow_hub as hub

# Save options that pin saved-model I/O to the local host device.
# NOTE(review): not referenced by any other cell visible in this notebook.
localhost_save_option = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")

# TF Hub handle of the sentence encoder; kept in one place so the handle that is
# documented and the handle that is actually loaded cannot drift apart.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'

# Import the Universal Sentence Encoder's TF Hub module.
# NOTE(review): the "Op type not registered 'SentencepieceOp'" failure recorded
# under this cell is the usual symptom of the `tensorflow_text` package not having
# been imported before loading this model (it registers the SentencePiece ops) --
# confirm against the model's TF Hub page and the environment before re-running.
embed = hub.load(module_url)

FileNotFoundError: Op type not registered 'SentencepieceOp' in binary running on marek-ROG-Zephyrus-G14. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.
 You may be trying to load on a different device from the computational device. Consider setting the `experimental_io_device` option in `tf.saved_model.LoadOptions` to the io_device such as '/job:localhost'.

## Load train data

In [8]:
# Load the combined feature table (semicolon-separated CSV).
df = pd.read_csv('../datasets/ready2use/fake_news_features_combined.csv', sep=';')

# Drop rows labelled 'brak'. The explicit .copy() makes the filtered result an
# independent frame, so the column assignments below never write into a slice
# of another frame (the pattern that triggers pandas' SettingWithCopyWarning).
df = df[df['assestment'] != 'brak'].copy()

# Collapse the fine-grained verdicts into the coarse classes
# 'Fałsz', 'Prawda' and 'Manipulacja'; labels not listed here are left as-is.
df['assestment'] = df['assestment'].replace({
    'falsz' : 'Fałsz',
    'zbity_zegar' : 'Fałsz',
    'raczej_falsz' : 'Fałsz',
    'prawda' : 'Prawda',
    'blisko_prawdy' : 'Prawda',
    'polprawda' : 'Manipulacja',
    'Częściowy fałsz' : 'Manipulacja'
})

# Keep only the binary task: rows labelled 'Nieweryfikowalne' or 'Manipulacja'
# are removed, leaving 'Fałsz' vs. 'Prawda'.
df = df[~df['assestment'].isin(['Nieweryfikowalne', 'Manipulacja'])].copy()

# Encode the two remaining classes as integers: 'Fałsz' -> 0, 'Prawda' -> 1.
# astype(int) fails loudly if any unexpected label survived the filters above.
df['assestment'] = df['assestment'].replace({
    'Fałsz' : 0,
    'Prawda' : 1
}).astype(int)

# Split into target and features; both are independent copies of `df`'s data.
y_train = df['assestment'].copy()
X_train = df.drop(columns='assestment')

In [5]:
# Re-attach the label to the features (index-aligned join), keep only the cleaned
# text and the label, and expose them under the short names 'text' and 'class'
# that the remaining cells use.
train = (
    X_train
    .join(y_train)[['text_clean', 'assestment']]
    .rename(columns={'text_clean': 'text', 'assestment': 'class'})
)

### Random hold-out train/test split

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as `test`; the fixed seed makes the split repeatable.
# Note that `train` is rebound to the remaining 90%, so re-running this cell
# without re-running the cell that builds `train` splits the data again.
train, test = train_test_split(
    train,
    random_state=111,
    test_size=0.1,
)

In [None]:
import keras
import keras.backend as K
from keras.layers import *
from keras.callbacks import *
from keras.optimizers import *
from keras import Model
from keras.layers.core import Lambda, Flatten, Dense
from keras.layers import Bidirectional, LSTM
import pickle    
import os

# Shared encoder of the Siamese network: takes a 512-dimensional input vector and
# produces a 128-dimensional vector that is L2-normalised along axis 1.
input_text1 = Input(shape=(512,))

# Hidden stack, listed in the order in which it is applied to the input.
trunk_layers = [
    Dense(256, activation='relu'),
    Dropout(0.4),
    BatchNormalization(),
    Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
    Dropout(0.4),
]
x = input_text1
for trunk_layer in trunk_layers:
    x = trunk_layer(x)

# Linear projection to the embedding size, followed by L2 normalisation so the
# squared distances used by the triplet loss are computed on normalised vectors.
dense_layer = Dense(128, name='dense_layer')(x)
norm_layer = Lambda(lambda t: K.l2_normalize(t, axis=1), name='norm_layer')(dense_layer)

model = Model(inputs=[input_text1], outputs=norm_layer)

model.summary()

In [None]:
# Three 512-dimensional inputs: anchor, positive (same class as the anchor) and
# negative (different class) sentence vectors.
in_a, in_p, in_n = (Input(shape=(512,)) for _ in range(3))

# Run every input through the *same* `model` instance, so the three branches
# share their weights (Siamese network) and yield comparable embeddings.
emb_a, emb_p, emb_n = (model(branch_input) for branch_input in (in_a, in_p, in_n))

class TripletLossLayer(Layer):
    """Layer that computes a triplet loss and registers it on the model.

    The layer is called with a list ``[anchor, positive, negative]`` of
    embedding tensors. It computes, per sample, the squared distance between
    anchor and positive and between anchor and negative, applies the hinge
    ``max(p_dist - n_dist + alpha, 0)`` and sums the result over axis 0.
    The summed value is added to the model through ``add_loss`` and is also
    returned as the layer output.

    Args:
        alpha: Margin added to ``p_dist - n_dist`` before the hinge.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor
            (e.g. ``name``).
    """

    def __init__(self, alpha, **kwargs):
        # Initialise the base layer first so it is fully set up before any
        # attribute is attached to the instance.
        super(TripletLossLayer, self).__init__(**kwargs)
        self.alpha = alpha

    def triplet_loss(self, inputs):
        """Return the hinge triplet loss summed over axis 0 for ``inputs``.

        Args:
            inputs: Sequence ``(anchor, positive, negative)`` of tensors.

        Returns:
            The sum over axis 0 of ``max(p_dist - n_dist + alpha, 0)``.
        """
        a, p, n = inputs
        # Squared Euclidean distances, reduced over the last (feature) axis.
        p_dist = K.sum(K.square(a-p), axis=-1)
        n_dist = K.sum(K.square(a-n), axis=-1)
        return K.sum(K.maximum(p_dist - n_dist + self.alpha, 0), axis=0)

    def call(self, inputs):
        """Compute the loss, register it with ``add_loss`` and return it."""
        loss = self.triplet_loss(inputs)
        self.add_loss(loss)
        return loss

    def get_config(self):
        """Return the layer config including ``alpha``.

        Without this, a layer re-created from its config would be missing the
        required ``alpha`` constructor argument.
        """
        config = super(TripletLossLayer, self).get_config()
        config.update({'alpha': self.alpha})
        return config

# Attach the triplet loss (margin alpha=0.4) to the three embeddings; the layer's
# output is the loss value itself.
triplet_layer = TripletLossLayer(alpha=0.4, name='triplet_loss_layer')
triplet_loss_layer = triplet_layer([emb_a, emb_p, emb_n])

# Trainable model: consumes (anchor, positive, negative) inputs and outputs the loss.
nn4_small2_train = Model(inputs=[in_a, in_p, in_n], outputs=triplet_loss_layer)

In [None]:
# Lookup tables used when selecting triplets:
#   labels_train            - the class of every row of `train`, by position
#   unique_train_label      - the distinct classes present in `train`
#   map_train_label_indices - class -> positions of the rows carrying that class
labels_train = np.array(train['class'].tolist())
unique_train_label = np.array(train['class'].unique().tolist())

map_train_label_indices = {}
for label in unique_train_label:
    map_train_label_indices[label] = np.flatnonzero(labels_train == label)

def get_triplets(unique_train_label,map_train_label_indices):
    """Draw one random (anchor, positive, negative) triple of row positions.

    Two distinct labels are drawn from ``unique_train_label``. Anchor and
    positive are two different positions taken from the first label's entry in
    ``map_train_label_indices``; the negative is one position taken from the
    second label's entry. Sampling uses the global ``np.random`` state.

    Returns:
        Tuple ``(a, p, n)`` of positions.
    """
    same_label, other_label = np.random.choice(unique_train_label, 2, replace=False)

    same_label_rows = map_train_label_indices[same_label]
    other_label_rows = map_train_label_indices[other_label]

    anchor, positive = np.random.choice(same_label_rows, 2, replace=False)
    negative = np.random.choice(other_label_rows)
    return anchor, positive, negative

def get_triplets_batch(k,train_set,unique_train_label,map_train_label_indices,embed):
    """Endlessly yield batches of ``k`` embedded (anchor, positive, negative) triplets.

    For every batch, ``k`` triplets of row positions are drawn with
    ``get_triplets``, the matching entries of ``train_set`` are looked up by
    position (``.iloc``) and converted to lists, and each of the three lists is
    passed through ``embed``.

    Yields:
        ``([anchors, positives, negatives], [])`` -- the three embedded batches
        and an empty list in the target position.
    """
    while True:
        anchor_rows, positive_rows, negative_rows = [], [], []
        for _ in range(k):
            anchor_row, positive_row, negative_row = get_triplets(
                unique_train_label, map_train_label_indices
            )
            anchor_rows.append(anchor_row)
            positive_rows.append(positive_row)
            negative_rows.append(negative_row)

        # Look up all three groups first, then embed them in anchor/positive/negative order.
        grouped_items = [
            train_set.iloc[rows].values.tolist()
            for rows in (anchor_rows, positive_rows, negative_rows)
        ]
        embedded = [embed(items) for items in grouped_items]

        yield embedded, []

In [None]:
# No loss function is passed to compile(): the loss being minimised is the one
# the model's TripletLossLayer registers through add_loss.
nn4_small2_train.compile(optimizer='adam', loss=None)

# Endless stream of 128-triplet batches built from the training texts.
triplet_batches = get_triplets_batch(
    128,
    train['text'],
    unique_train_label,
    map_train_label_indices,
    embed,
)

# 100 epochs of 10 generator batches each.
nn4_small2_train.fit(triplet_batches, steps_per_epoch=10, epochs=100)

In [None]:
# accuracy_score is used below; without this import the cell raises NameError
# on a fresh kernel.
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Get the embeddings from the trained encoder: texts go through `embed` first
# and the result is projected by `model`.
# Note: this rebinds X_train / y_train to NumPy arrays.
X_train = model.predict(embed(np.array(train['text'].values.tolist())))
X_test = model.predict(embed(np.array(test['text'].values.tolist())))

y_train = np.array(train['class'].values.tolist())
y_test = np.array(test['class'].values.tolist())

# Two simple classifiers fitted on the learned embeddings.
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
svc = LinearSVC()

knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

# Accuracy of each classifier on the held-out test embeddings.
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
y_pred_svc = svc.predict(X_test)
acc_svc = accuracy_score(y_test, y_pred_svc)

print(f'KNN accuracy = {acc_knn}, SVM accuracy = {acc_svc}')

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the KNN predictions on the test split.
knn_report = metrics.classification_report(list(y_test), list(y_pred_knn))
print(knn_report)

In [None]:
# `plt` is used below; without this import the cell raises NameError on a
# fresh kernel.
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Project the test embeddings to two dimensions for plotting.
X_embedded = TSNE(n_components=2).fit_transform(X_test)

plt.figure(figsize=(10,10))

# One scatter series per class; np.unique returns the labels sorted, so the
# legend order is the same on every run.
for t in np.unique(y_test):
    idx = y_test == t
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)

plt.legend(bbox_to_anchor=(1, 1));