In [1]:
import torch.optim as optim
import gensim
import nltk
from nltk.tokenize import word_tokenize
from modules.preprocess import *
from modules.utils import build_dataset, text_to_word2vec, evaluate
from modules.rnn_model import TextRNN
import gensim.downloader as api
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import tensorflow as tf
import os
from config import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-03-30 16:53:36.654302: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from keras.layers import Flatten, Dense, Dropout, Flatten, Lambda, Input, Conv1D, AveragePooling1D, MaxPooling1D
from keras.optimizers import RMSprop
from keras.models import Model

In [3]:
# Load the raw article collection through the project's build_dataset helper,
# reading from the "lapresse_crawler" path.
# rnd_state=10 is forwarded as-is (presumably a sampling/shuffling seed — confirm in modules.utils).
dataset = build_dataset(path="lapresse_crawler", rnd_state=10)

In [4]:
# Run the project's text_edit preprocessing over the dataset with the flags below.
# NOTE(review): this rebinds `dataset`, so re-running the cell feeds already-processed
# text back into text_edit; a separate name would keep the raw data recoverable.
dataset = text_edit(dataset, grp_num=False, rm_newline=True, rm_punctuation=True,
              rm_stop_words=False, lowercase=True, lemmatize=False, html_=True, expand=True)

100%|██████████| 100/100 [00:00<00:00, 1518.55it/s]


In [5]:
# Sections whose articles are kept; defined once so the text and label
# selections cannot drift apart.
SELECTED_SECTIONS = ('actualites', 'sports', 'affaires', 'arts', 'international')

# Filter the dataset a single time, then project texts and labels from the same
# list so X[i] and Y[i] always refer to the same article.
selected_articles = [x for x in dataset.values() if x['section_1'] in SELECTED_SECTIONS]
X = [article['text'] for article in selected_articles]
Y = [article['section_label'] for article in selected_articles]

In [6]:
# Hold out 20% of the samples as the test set; random_state=42 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

In [7]:
# Local cache file for the embedding vectors, relative to the working directory.
W2V_MODEL_FILE_NAME = "word2vec.model"


def download_w2v_model():
    """Fetch the pretrained vectors through gensim's downloader and cache them.

    The vectors are saved to ``W2V_MODEL_FILE_NAME`` so later runs can load
    them from disk instead of downloading again.

    Returns:
        The object returned by ``gensim.downloader.load``.
    """
    # NOTE(review): this looks like an English-trained fastText model, while the
    # section names used in this notebook look French — confirm vocabulary coverage.
    vectors = api.load('fasttext-wiki-news-subwords-300')
    vectors.save(W2V_MODEL_FILE_NAME)
    return vectors


# Prefer the cached copy on disk; download (and cache) only when it is missing.
if os.path.exists(W2V_MODEL_FILE_NAME):
    word2vec_model = gensim.models.KeyedVectors.load(W2V_MODEL_FILE_NAME)
else:
    word2vec_model = download_w2v_model()

In [8]:
def _embed_texts(texts):
    """Embed every text with text_to_word2vec and stack the results into one float32 tensor."""
    embedded = []
    for text in texts:
        embedded.append(torch.tensor(text_to_word2vec(text, word2vec_model), dtype=torch.float32))
    return torch.stack(embedded, dim=0)


# Replace the raw text lists with stacked embedding tensors.
X_train = _embed_texts(X_train)
X_test = _embed_texts(X_test)

# Labels become integer (long) tensors.
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_test = torch.tensor(Y_test, dtype=torch.long)

In [9]:
def zip_set(X, Y):
    """Randomly pair up samples and label each pair as same-class or not.

    The (sample, label) tuples are shuffled with the ``random`` module and then
    consumed two at a time. When the number of samples is odd, the last
    unpaired sample is dropped instead of raising ``IndexError``.

    Args:
        X: Sequence of samples.
        Y: Sequence of labels, aligned with ``X``.

    Returns:
        A tuple ``(pairs, labels)`` of NumPy arrays. ``pairs[k]`` holds the two
        samples of pair ``k``; ``labels[k]`` is 1 when both samples share the
        same label and 0 otherwise.
    """
    shuffled = list(zip(X, Y))
    random.shuffle(shuffled)

    pairs = []
    labels = []

    # Stop at len - 1 so that index i + 1 is always valid (odd-length input).
    for i in range(0, len(shuffled) - 1, 2):
        (sample_a, label_a), (sample_b, label_b) = shuffled[i], shuffled[i + 1]
        pairs.append([sample_a, sample_b])
        labels.append(1 if label_a == label_b else 0)

    return np.array(pairs), np.array(labels)

In [10]:
# Build random training pairs with a 1/0 "same label" target for each pair.
# NOTE(review): zip_set shuffles with the `random` module and no random.seed call is
# visible in this notebook, so the pairs are likely different on every run.
train_pairs, train_labels = zip_set(X_train, Y_train)

In [11]:
# Build random evaluation pairs from the held-out split, labelled like the training pairs.
# NOTE(review): pairing depends on the `random` module's state; no seed is set in the visible cells.
test_pairs, test_labels = zip_set(X_test, Y_test)

In [12]:
def create_base_net_1D(input_shape):
    """Build the shared 1D-convolutional encoder used by both siamese branches.

    The network applies two Conv1D/pooling stages, flattens the result and
    passes it through three tanh Dense layers (128, 64, 10 units) with dropout
    in between. The model summary is printed as a side effect.

    Args:
        input_shape: Shape of one sample, without the batch dimension.

    Returns:
        A Keras ``Model`` mapping one sample to a 10-dimensional embedding.
    """
    net_input = Input(shape=input_shape)

    # Layers are created in the order they are applied.
    layer_stack = [
        Conv1D(32, 3, activation='relu'),
        AveragePooling1D(pool_size=2),
        Conv1D(64, 3, activation='tanh'),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='tanh'),
        Dropout(0.5),
        Dense(64, activation='tanh'),
        Dropout(0.5),
        Dense(10, activation='tanh'),
    ]

    features = net_input
    for layer in layer_stack:
        features = layer(features)

    base_model = Model(inputs=net_input, outputs=features)

    base_model.summary()

    return base_model

In [13]:
# Instantiate the shared encoder once; both siamese branches reuse this model.
# NOTE(review): Keras Conv1D slides along the first non-batch axis and treats the last
# axis as channels, so with shape (vector_size, MAX_SAMPLE_LENGTH) the kernels move along
# the embedding dimension — confirm this matches the layout produced by text_to_word2vec.
base_network  = create_base_net_1D((word2vec_model.vector_size,MAX_SAMPLE_LENGTH))

In [14]:
def euclid_dis(vects):
    """Return the row-wise Euclidean distance between two batches of vectors.

    Args:
        vects: A pair ``(x, y)`` of tensors to compare.

    Returns:
        A tensor of distances, reduced over axis 1 with that axis kept as size 1.
    """
    left, right = vects
    squared_diff = tf.square(left - right)
    squared_norm = tf.reduce_sum(squared_diff, axis=1, keepdims=True)
    # Clamp to epsilon before the square root so the result never comes from sqrt(0).
    floor = tf.keras.backend.epsilon()
    return tf.sqrt(tf.maximum(squared_norm, floor))

In [15]:
def eucl_dist_output_shape(shapes):
    """Output shape of the distance layer: the first input's leading dimension by 1.

    Args:
        shapes: A pair of input shapes; only the first one is consulted.

    Returns:
        A tuple ``(leading_dim, 1)``.
    """
    first_shape, _second_shape = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

In [16]:
def contrastive_loss(y_true, y_pred, margin=1):
    """Contrastive loss over predicted pair distances.

    Pairs labelled 1 are penalised by their squared distance; pairs labelled 0
    are penalised by the squared shortfall of their distance below ``margin``.

    Labels are cast to the prediction dtype and both inputs are flattened, so
    the function also works when called directly with integer labels or with a
    rank-1 label vector against a ``(batch, 1)`` distance tensor (which would
    otherwise broadcast to ``(batch, batch)``).

    Args:
        y_true: Pair labels, 1 for "same" and 0 for "different".
        y_pred: Predicted distances, one per pair.
        margin: Distance beyond which "different" pairs incur no loss.

    Returns:
        Scalar tensor with the mean loss over all pairs.
    """
    y_pred = tf.reshape(y_pred, [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])

    square_pred = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(y_true * square_pred + (1 - y_true) * margin_square)

In [17]:
def compute_accuracy(y_true, y_pred, threshold=0.5):
    """Fraction of pairs whose thresholded distance matches the label.

    A pair is predicted "same" (1) when its distance is below ``threshold``.
    Both inputs are flattened before comparison, so a column-vector label
    array of shape ``(N, 1)`` no longer broadcasts against the flattened
    predictions into an ``(N, N)`` matrix.

    Args:
        y_true: Array-like of 0/1 pair labels.
        y_pred: Array-like of predicted distances, one per pair.
        threshold: Distance below which a pair counts as "same".

    Returns:
        The mean agreement between predictions and labels as a float.
    """
    y_true = np.asarray(y_true).ravel()
    pred = np.asarray(y_pred).ravel() < threshold
    return np.mean(pred == y_true)

In [18]:
def accuracy(y_true, y_pred):
    """Keras-compatible metric: share of pairs classified correctly at distance 0.5.

    A pair is predicted "same" (1) when its distance is below 0.5. Both tensors
    are flattened first so a rank-1 label vector compared with a ``(batch, 1)``
    distance tensor is matched element by element instead of being broadcast
    to ``(batch, batch)``.

    Args:
        y_true: Pair labels, 1 for "same" and 0 for "different".
        y_pred: Predicted distances, one per pair.

    Returns:
        Scalar float32 tensor with the fraction of matching predictions.
    """
    y_true = tf.reshape(y_true, [-1])
    y_pred = tf.reshape(y_pred, [-1])

    pred = tf.cast(y_pred < 0.5, y_true.dtype)
    return tf.reduce_mean(tf.cast(tf.equal(y_true, pred), tf.float32))

In [19]:
# Both branches take samples of the same shape.
siamese_input_shape = (word2vec_model.vector_size, MAX_SAMPLE_LENGTH)
input_a = Input(shape=siamese_input_shape)
input_b = Input(shape=siamese_input_shape)

# The same base_network instance encodes both inputs, so the weights are shared.
processed_a, processed_b = base_network(input_a), base_network(input_b)

# The model's output is the distance between the two encodings.
distance = Lambda(euclid_dis, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(inputs=[input_a, input_b], outputs=distance)

In [20]:
# Train the siamese model with RMSprop (default settings) on the contrastive loss,
# tracking the thresholded-distance accuracy metric.
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
# Each pair array is split into its two members (index 0 and 1 on axis 1) to feed the two inputs.
# EPOCHS is not defined in the visible cells (presumably from `from config import *` — confirm).
# The test pairs double as validation data, so val_* metrics are measured on the held-out split.
model.fit([train_pairs[:, 0], train_pairs[:, 1]], train_labels,
          batch_size=8,
          epochs=EPOCHS,
          validation_data=([test_pairs[:, 0], test_pairs[:, 1]], test_labels))

Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 127ms/step - accuracy: 0.6736 - loss: 0.5966 - val_accuracy: 0.5000 - val_loss: 0.4079
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.8566 - loss: 0.5932 - val_accuracy: 0.4375 - val_loss: 0.3610
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.8201 - loss: 0.4571 - val_accuracy: 0.4375 - val_loss: 0.4084
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8010 - loss: 0.3541 - val_accuracy: 0.5000 - val_loss: 0.4203
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.8288 - loss: 0.3447 - val_accuracy: 0.5000 - val_loss: 0.4472
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.7993 - loss: 0.3072 - val_accuracy: 0.5625 - val_loss: 0.4162
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fdeddfb92a0>