# Load Models

In [4]:
# Download the demo bundle (DEMO.zip) from Google Drive, addressed by its file id.
!gdown --id 1Nrj3gi-ENBwsEqjsGB68pWGrQI8fFWC6

Downloading...
From: https://drive.google.com/uc?id=1Nrj3gi-ENBwsEqjsGB68pWGrQI8fFWC6
To: /content/DEMO.zip
100% 213M/213M [00:02<00:00, 86.0MB/s]


In [5]:
!unzip DEMO.zip

Archive:  DEMO.zip
   creating: DEMO/
  inflating: DEMO/fake_news_of_train.pkl  
  inflating: DEMO/kmeans_15.pkl      
  inflating: DEMO/kmeans_14.pkl      
  inflating: DEMO/kmeans_10.pkl      
  inflating: DEMO/kmeans_11.pkl      
  inflating: DEMO/kmeans_12.pkl      
  inflating: DEMO/kmeans_13.pkl      
  inflating: DEMO/Encoder_Bi-Vae-Density.zip  


In [6]:
!unzip DEMO/Encoder_Bi-Vae-Density.zip

Archive:  DEMO/Encoder_Bi-Vae-Density.zip
   creating: Encoder_Bi-Vae-Density/
  inflating: Encoder_Bi-Vae-Density/saved_model.pb  
  inflating: Encoder_Bi-Vae-Density/keras_metadata.pb  
   creating: Encoder_Bi-Vae-Density/assets/
   creating: Encoder_Bi-Vae-Density/variables/
  inflating: Encoder_Bi-Vae-Density/variables/variables.index  
  inflating: Encoder_Bi-Vae-Density/variables/variables.data-00000-of-00001  


## Load Bi-VAE-Density

In [7]:
import tensorflow as tf

# Load the Keras model saved in the extracted ./Encoder_Bi-Vae-Density directory.
# Later cells call .predict on it with [embeddings, densities] and unpack three outputs.
bi_vae_density = tf.keras.models.load_model('Encoder_Bi-Vae-Density')



## Load Kmeans

In [8]:
import pickle

# Load the six pickled k-means models shipped in DEMO/ (kmeans_10.pkl .. kmeans_15.pkl),
# in ascending file order. The ids are spelled out explicitly instead of being
# assembled from the prefix 'kmeans_1' plus a single digit.
# NOTE: pickle.load can execute arbitrary code; only use it on a trusted archive.
l_kmeans = []

for model_id in range(10, 16):
  pkl_path = 'DEMO/kmeans_{}.pkl'.format(model_id)
  with open(pkl_path, 'rb') as pkl_file:
    l_kmeans.append(pickle.load(pkl_file))

## Load fake news from the training set

In [9]:
import pandas as pd
import numpy as np
# Load the pickled DataFrame shipped in the demo bundle.
# NOTE: read_pickle unpickles the file, so the archive must come from a trusted source.
df_fk_train = pd.read_pickle('DEMO/fake_news_of_train.pkl')
# Collect the values of the 'DistilBERT Multilingua' column into a single NumPy array.
# presumably each cell holds one fixed-length embedding vector, giving a 2-D array — TODO confirm
df_train = np.array(df_fk_train['DistilBERT Multilingua'].to_list())

# DBERTML Embeddings (USE GPU)

In [10]:
# Install the pinned sentence-transformers release used by the embedding helper below.
!pip install sentence-transformers==1.0.4 #version used in the fake news collections

Collecting sentence-transformers==1.0.4
  Downloading sentence-transformers-1.0.4.tar.gz (74 kB)
[K     |████████████████████████████████| 74 kB 2.7 MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 30.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 440 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K  

In [11]:
from sentence_transformers import SentenceTransformer

In [12]:
def sentence_embedding(txts, model=None):
  """Encode texts into sentence embeddings.

  Before encoding, every tab, carriage-return and newline in each text is
  replaced by a single space. This covers the real control characters as well
  as their escaped spellings (a backslash, or two backslashes, followed by
  t, r or n).

  Args:
    txts: iterable of strings to embed.
    model: optional object exposing ``encode(list_of_str)``. When omitted, a
      ``SentenceTransformer('distiluse-base-multilingual-cased')`` is created
      for this call. Pass an already-loaded model to avoid reloading it on
      every call.

  Returns:
    Whatever ``model.encode`` returns for the cleaned sentences.
  """
  if model is None:
    model = SentenceTransformer('distiluse-base-multilingual-cased')

  # Order matters: the double-backslash forms must be handled before the
  # single-backslash ones, otherwise a stray backslash would be left behind.
  whitespace_tokens = (
      '\\\\t', '\\\\r', '\\\\n',
      '\\t', '\\r', '\\n',
      '\t', '\r', '\n',
  )

  sentences = []

  for txt in txts:
    for token in whitespace_tokens:
      # str.replace returns a new string; the result has to be re-bound,
      # otherwise the cleanup is silently a no-op.
      txt = txt.replace(token, ' ')
    sentences.append(txt)

  # NOTE(review): if the stored training embeddings were produced without this
  # cleanup taking effect, inputs containing these sequences may embed slightly
  # differently from the training data — confirm.
  return model.encode(sentences)

# Density Information

In [13]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np 

def return_density_inf(l_kmeans, df_train, df_new):
    """Compute per-sample silhouette features for new points, one per k-means model.

    For each model in ``l_kmeans`` the new points are assigned to clusters with
    ``kmeans.predict``. The silhouette coefficient of every sample in the stacked
    set (``df_train`` followed by ``df_new``) is computed, and only the values
    belonging to ``df_new`` are kept.

    Args:
        l_kmeans: iterable of fitted k-means models exposing ``labels_`` and
            ``predict``.
            (assumes ``labels_`` are the labels of ``df_train`` rows, in the
            same order — TODO confirm)
        df_train: array of reference samples.
        df_new: NumPy array of samples to score; it must support ``astype``.

    Returns:
        Array of shape ``(len(df_new), len(l_kmeans))``; column ``j`` holds the
        silhouette values obtained with the ``j``-th model.
    """
    len_train = len(df_train)

    # Neither the stacked data nor the float64 view depends on the k-means
    # model, so build both once instead of once per model.
    dfs = np.concatenate([df_train, df_new])
    new_as_float = df_new.astype('float64')

    l_x_new = []
    for kmeans in l_kmeans:
        labels = np.concatenate([kmeans.labels_, kmeans.predict(new_as_float)])

        # One silhouette value per stacked sample, as a column vector.
        silho = silhouette_samples(dfs, labels).reshape(-1, 1)
        # Keep only the rows that correspond to df_new.
        l_x_new.append(silho[len_train:])

    return np.concatenate(l_x_new, axis=1)

# DEMO

In [73]:
# Query texts for the demo; the cells below embed them and look up similar training texts.
inputs = [
          'The GOLD price was related to Brexit'
]

In [74]:
# Embed the query texts with the sentence-transformer helper defined above.
embeddings_DBERTML = sentence_embedding(inputs)

In [75]:
# Silhouette-based density features for the query embeddings: one column per k-means model.
densities = return_density_inf(l_kmeans, df_train, np.array(embeddings_DBERTML))

In [76]:
# Run the loaded model on [sentence embeddings, density features]; it returns three
# outputs and only the first is kept as the query representation.
embeddings_BiVae, _, _ = bi_vae_density.predict([embeddings_DBERTML,densities]) 

# Top 5 most similar texts

In [21]:
# Compute density features and model embeddings for the training set itself.
# NOTE(review): df_train is passed as both the reference data and the "new" data, so
# return_density_inf stacks it on top of itself before scoring — confirm this is intended.
densities_train = return_density_inf(l_kmeans, df_train, df_train)
embeddings_train, _, _ = bi_vae_density.predict([df_train,densities_train]) 

In [32]:
# Work on a copy: a plain assignment would only alias df_fk_train, so the columns
# added here and in later cells would silently mutate the loaded training frame.
df_bests = df_fk_train.copy()
# Store one embedding vector per row (list() splits the array along its first axis).
df_bests['Embeddings_BiVAE_D'] = list(embeddings_train)

In [77]:
from scipy.spatial import distance

In [78]:
# Embedding of the first query text; it is the reference vector for the similarity search.
# NOTE(review): the name says "top10", but the final cell displays only the 5 nearest rows.
embedding_top10 = embeddings_BiVae[0]

In [79]:
# Cosine distance between the query embedding and every stored training embedding.
# Iterating the column directly avoids the per-row Series that iterrows() builds
# just to read a single field.
l_dist = [
    distance.cosine(embedding_top10, train_embedding)
    for train_embedding in df_bests['Embeddings_BiVAE_D']
]

df_bests['cosine_distance'] = l_dist

In [80]:
# Show the 5 training rows closest to the query (smallest cosine distance first).
df_bests.sort_values(by='cosine_distance')[['text','class','cosine_distance']].head(5)

Unnamed: 0,text,class,cosine_distance
177,GOLD PRICES hit new 17-month highs against the...,1,0.333811
2041,GOLD PRICES edged higher against a falling US ...,1,0.401644
1774,While key Western banks are artificially restr...,1,0.414491
570,"GOLD PRICES for Euro, UK and Japanese investor...",1,0.421811
658,Cameron Previous Labour Administration To Bla...,1,0.429223
