In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gensim
import nltk
from nltk.tokenize import word_tokenize
from modules.preprocess import *
from modules.utils import build_dataset, text_to_word2vec, evaluate
from modules.rnn_model import TextRNN
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import tensorflow as tf

In [None]:
from keras.layers import Flatten, Dense, Dropout, Flatten, Lambda, Input, Conv1D, AveragePooling1D, MaxPooling1D
from keras.optimizers import RMSprop
from keras.models import Model

In [None]:
# Build the working dataset through the project helper; the source path,
# sample count and random state are passed through unchanged.
dataset = build_dataset(
    path="lapresse_crawler",
    num_samples=100,
    rnd_state=10,
)

In [None]:
# Apply the project's text clean-up helper with one flag per line so the
# active options are easy to scan.
# NOTE: `dataset` is rebound to the helper's result, so re-running this cell
# feeds the already-processed data back into `text_edit`.
dataset = text_edit(
    dataset,
    grp_num=False,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=False,
    lowercase=True,
    lemmatize=False,
    html_=True,
    convert_entities=False,
    expand=True,
)

In [None]:
# Keep only entries whose top-level section is one of the targeted sections,
# then split the kept entries into texts (X) and section labels (Y).
kept_sections = ('actualites', 'sports', 'affaires', 'arts', 'international')
kept_articles = [article for article in dataset.values() if article['section_1'] in kept_sections]

X = [article['text'] for article in kept_articles]
Y = [article['section_label'] for article in kept_articles]

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Pretrained embedding model fetched through gensim's downloader API.
# NOTE(review): this model name appears to refer to vectors trained on English
# corpora, while the sample sentence below is French — confirm the embedding
# language matches the dataset.
model_name = 'fasttext-wiki-news-subwords-300'
word2vec_model = api.load(model_name)

# Quick check: embed one short sentence with the project helper.
text = "Ceci est un texte exemple"
vector = text_to_word2vec(text, word2vec_model)

In [None]:
def _texts_to_tensor(texts):
    """Embed every text and stack the results into one float32 tensor.

    Each embedding is reshaped to a single column with ``view(-1, 1)`` and the
    columns are stacked along a new leading dimension.
    """
    columns = []
    for raw_text in texts:
        embedding = torch.tensor(text_to_word2vec(raw_text, word2vec_model), dtype=torch.float32)
        columns.append(embedding.view(-1, 1))
    return torch.stack(columns, dim=0)


# NOTE: the text lists are replaced by tensors under the same names, so this
# cell cannot simply be re-run without re-creating the train/test split first.
X_train = _texts_to_tensor(X_train)
X_test = _texts_to_tensor(X_test)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_test = torch.tensor(Y_test, dtype=torch.long)

In [None]:
# Build random training pairs for the siamese network.
# Each pair holds two embedded samples; its label is 1 when both samples carry
# the same section label and 0 otherwise.
PAIR_SEED = 42   # fixed seed so the sampled pairs are the same on every run
num_pairs = 10

# Dedicated generator: reproducible sampling without touching the global
# `random` state used elsewhere.
pair_rng = random.Random(PAIR_SEED)
zipped_list = list(zip(X_train, Y_train))

pair_tensors = []
pair_labels = []
for _ in range(num_pairs):
    (vec_a, label_a), (vec_b, label_b) = pair_rng.sample(zipped_list, 2)
    # Stack the two samples explicitly instead of relying on NumPy's implicit
    # conversion of nested torch tensors.
    pair_tensors.append(torch.stack([vec_a, vec_b], dim=0))
    pair_labels.append(int(label_a == label_b))

# pairs: (num_pairs, 2, *sample_shape); labels: (num_pairs,)
pairs = torch.stack(pair_tensors, dim=0).numpy()
labels = np.array(pair_labels)

In [None]:
def create_base_net_1D(input_shape):
    """Build the shared 1-D convolutional sub-network.

    The layers are applied in this order: Conv1D(32, 3, relu),
    AveragePooling1D(2), Conv1D(64, 3, tanh), MaxPooling1D(2), Dropout(0.25),
    Flatten, Dense(128, tanh), Dropout(0.5), Dense(64, tanh), Dropout(0.5),
    Dense(10, tanh).

    Args:
        input_shape: Shape handed to the Keras ``Input`` layer.

    Returns:
        A Keras ``Model`` mapping the input to the final 10-unit output.
        The model summary is printed as a side effect.
    """
    net_input = Input(shape=input_shape)

    layer_stack = [
        Conv1D(32, 3, activation='relu'),
        AveragePooling1D(pool_size=2),
        Conv1D(64, 3, activation='tanh'),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='tanh'),
        Dropout(0.5),
        Dense(64, activation='tanh'),
        Dropout(0.5),
        Dense(10, activation='tanh'),
    ]

    # Thread the tensor through the layers in declaration order.
    features = net_input
    for layer in layer_stack:
        features = layer(features)

    model = Model(inputs=net_input, outputs=features)
    model.summary()
    return model

In [None]:
# Single shared sub-network; the input shape matches the (300, 1) inputs
# declared for the siamese model.
base_network = create_base_net_1D(input_shape=(300, 1))

In [None]:
def euclid_dis(vects):
    """Return the Euclidean distance between two batches of vectors.

    Args:
        vects: A pair ``(x, y)`` of tensors that can be subtracted from each
            other.

    Returns:
        A tensor of distances reduced over axis 1 (the reduced axis is kept
        with size 1).
    """
    left, right = vects
    squared_distance = tf.reduce_sum(tf.square(left - right), axis=1, keepdims=True)
    # Clamp to epsilon before the square root so the result is never sqrt(0).
    floor = tf.keras.backend.epsilon()
    return tf.sqrt(tf.maximum(squared_distance, floor))

In [None]:
def eucl_dist_output_shape(shapes):
    """Return the output shape of the distance layer.

    Args:
        shapes: A pair of input shapes; only the first entry of the first
            shape is used.

    Returns:
        The tuple ``(first_dim, 1)`` where ``first_dim`` is the leading entry
        of the first input shape.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

In [None]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Contrastive loss over predicted pair distances.

    Pairs labelled 1 are penalised by the squared distance; pairs labelled 0
    are penalised by the squared amount by which the distance falls short of
    ``margin``.

    Args:
        y_true: Pair labels (1 = similar, 0 = dissimilar). May be integer
            typed; it is cast to the dtype of ``y_pred``.
        y_pred: Predicted distances.
        margin: Distance beyond which dissimilar pairs incur no penalty.

    Returns:
        Scalar tensor: the mean loss over all elements.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Integer labels cannot be multiplied with float distances; cast them
    # explicitly instead of relying on the framework to do it.
    y_true = tf.cast(y_true, y_pred.dtype)

    similar_term = y_true * tf.square(y_pred)
    dissimilar_term = (1 - y_true) * tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(similar_term + dissimilar_term)

In [None]:
def compute_accuracy(y_true, y_pred):
    """Classification accuracy with a fixed 0.5 threshold on distances.

    A pair is predicted "similar" (True) when its distance is below 0.5.

    Args:
        y_true: Array-like of pair labels (1 = similar, 0 = dissimilar), any
            shape; it is flattened before the comparison.
        y_pred: Array-like of predicted distances, any shape; it is flattened
            before the comparison.

    Returns:
        Fraction of predictions that equal the labels.
    """
    predicted_similar = np.asarray(y_pred).ravel() < 0.5
    # Flatten the labels too: comparing (n,) predictions with (n, 1) labels
    # would broadcast to an (n, n) matrix and give a meaningless accuracy.
    expected = np.asarray(y_true).ravel()
    return np.mean(predicted_similar == expected)

In [None]:
def accuracy(y_true, y_pred):
    """Tensor-based accuracy metric with a fixed 0.5 distance threshold.

    Distances below 0.5 are turned into 1 (in the dtype of ``y_true``), all
    others into 0, and the result is compared element-wise with ``y_true``.

    Returns:
        The mean of the element-wise matches as a float32 tensor.
    """
    below_threshold = y_pred < 0.5
    predicted = tf.cast(below_threshold, y_true.dtype)
    hits = tf.equal(y_true, predicted)
    return tf.reduce_mean(tf.cast(hits, tf.float32))

In [None]:
# Siamese wiring: both inputs go through the same `base_network` instance
# (shared weights), and the model outputs the distance between the two results.
input_a, input_b = Input(shape=(300, 1)), Input(shape=(300, 1))
processed_a, processed_b = base_network(input_a), base_network(input_b)

distance = Lambda(
    euclid_dis,
    output_shape=eucl_dist_output_shape,
)([processed_a, processed_b])

model = Model(inputs=[input_a, input_b], outputs=distance)

In [None]:
# Number of passes over the training pairs. `epochs` was used below without
# being defined in any visible cell, which raises NameError on a fresh kernel.
# The value here is a starting point — tune as needed.
epochs = 10

rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])

# Split the pair array into the two model inputs; cast the integer labels to
# float so they can be combined with the float distances inside the loss.
pair_inputs = [pairs[:, 0], pairs[:, 1]]
pair_targets = labels.astype("float32")

# NOTE(review): validation_data reuses the training pairs, so the reported
# validation metrics do not measure generalisation — consider building pairs
# from the held-out split instead.
model.fit(pair_inputs, pair_targets,
          batch_size=2,
          epochs=epochs,
          validation_data=(pair_inputs, pair_targets))