In [1]:
import tensorflow as tf
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

import pandas as pd

In [2]:
#encoder = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
#encoder(['Hello World'])

In [3]:
model = tf.keras.models.Sequential()
model.add(hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4', 
                        input_shape=[], 
                        dtype=tf.string, 
                        trainable=True))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [4]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 512)               256797824 
_________________________________________________________________
dense (Dense)                (None, 1)                 513       
Total params: 256,798,337
Trainable params: 256,798,337
Non-trainable params: 0
_________________________________________________________________


In [5]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [6]:
df = pd.read_csv('./data/sentencedata.csv')

df.head()

Unnamed: 0,aid,sentence,sid,flag
0,111111112,US bloggers banned from entering UK,1,non-propaganda
1,111111112,Two prominent US bloggers have been banned fro...,3,non-propaganda
2,111111112,Pamela Geller and Robert Spencer co-founded an...,5,propaganda
3,111111112,They were due to speak at an English Defence L...,7,non-propaganda
4,111111112,A government spokesman said individuals whose ...,9,non-propaganda


In [7]:
df['flag'].value_counts()

non-propaganda    10320
propaganda         3938
Name: flag, dtype: int64

In [8]:
value_mapping = {'propaganda': 1, 'non-propaganda': 0}
df['label'] = df['flag'].map(value_mapping)
df = df.drop("aid", axis=1).drop("sid", axis=1).drop("flag", axis=1)
df.head()

Unnamed: 0,sentence,label
0,US bloggers banned from entering UK,0
1,Two prominent US bloggers have been banned fro...,0
2,Pamela Geller and Robert Spencer co-founded an...,1
3,They were due to speak at an English Defence L...,0
4,A government spokesman said individuals whose ...,0


In [9]:
x_train, x_test, y_train, y_test = train_test_split(df['sentence'], 
                                                    df['label'], 
                                                    test_size=0.25, 
                                                    stratify=df['label'])

In [10]:
model.fit(x_train, 
          y_train, 
          epochs=2, 
          validation_data=(x_test, y_test))

Train on 10693 samples, validate on 3565 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1fe2c38e1c8>

In [None]:
model.save("./model/google-universal-sentence-encoder-finetuned/")

In [12]:
from sklearn.metrics import f1_score, confusion_matrix

predictions = model.predict(x_test).round()
print('F1-score: {0}'.format(f1_score(predictions, y_test)))
confusion_matrix(predictions, y_test)

F1-score: 0.5880039331366765


array([[2129,  387],
       [ 451,  598]], dtype=int64)

In [34]:
sentences = [
    "In some cases the support is built based on preconceived judgements.",
    "We must stop those refugees as they are terrorists!",
    "It can refer to any person or concept with a negative connotation.",
    "Do you know who else was doing that ? Hitler!",
    "Only one kind of person can think in that way: a communist."
]
model.predict(sentences)

array([[0.04541249],
       [0.57772183],
       [0.29103968],
       [0.8125916 ],
       [0.11487419]], dtype=float32)

In [1]:
from tensorflow.keras.models import load_model

In [4]:
model = load_model("../NotInfo.API/models/google-universal-sentence-encoder-finetuned")