In [1]:
import torch.optim as optim
import gensim
import nltk
from nltk.tokenize import word_tokenize
from modules.preprocess import *
from modules.utils import build_dataset, text_to_word2vec, evaluate
from modules.rnn_model import TextRNN
import gensim.downloader as api
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import tensorflow as tf

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-03-30 14:08:51.583048: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from keras.layers import Flatten, Dense, Dropout, Flatten, Lambda, Input, Conv1D, AveragePooling1D, MaxPooling1D
from keras.optimizers import RMSprop
from keras.models import Model

In [3]:
# Build the article dataset from the "lapresse_crawler" path.
# rnd_state=10 is presumably a seed used inside build_dataset — TODO confirm.
dataset = build_dataset(path="lapresse_crawler", rnd_state=10)

  return df.T.to_dict()


In [4]:
# Sanity check: number of entries currently held in `dataset`.
len(dataset)

4088

In [4]:
# Run the text_edit preprocessing pass over every entry (text_edit presumably
# comes from the wildcard import of modules.preprocess — confirm).
# NOTE: this rebinds `dataset`, so re-running the cell feeds the already
# processed entries back through text_edit.
dataset = text_edit(
    dataset,
    grp_num=False,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=False,
    lowercase=True,
    lemmatize=False,
    html_=True,
    expand=True,
)

100%|██████████| 4054/4054 [00:02<00:00, 1776.24it/s]


In [5]:
# Keep only the articles whose top-level section is one of the five target
# classes. The filter is applied once so the texts (X) and the labels (Y) are
# always derived from the same articles, in the same order.
_KEPT_SECTIONS = ('actualites', 'sports', 'affaires', 'arts', 'international')
_kept_articles = [x for x in dataset.values() if x['section_1'] in _KEPT_SECTIONS]

X = [x['text'] for x in _kept_articles]
Y = [x['section_label'] for x in _kept_articles]

In [6]:
# Hold out 20% of the selected articles for evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

In [7]:
# Fetch the NLTK "punkt" tokenizer data (presumably needed by
# text_to_word2vec — TODO confirm).
nltk.download('punkt')
# Pretrained vectors fetched through gensim's downloader API.
# NOTE(review): this gensim-data model appears to be trained on English
# corpora, while the sample sentence below is French — confirm the vocabulary
# coverage is acceptable for this dataset.
model_name = 'fasttext-wiki-news-subwords-300'  
word2vec_model = api.load(model_name)
# Smoke test: embed one sample sentence to check the helper runs end to end.
text = "Ceci est un texte exemple"
vector = text_to_word2vec(text, word2vec_model)

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def _embed_texts(texts):
    """Embed each text with text_to_word2vec and stack the results.

    Every embedding is converted to a float32 tensor and reshaped to a single
    column (``view(-1, 1)``) before stacking along a new leading dimension.
    """
    columns = []
    for doc in texts:
        embedding = torch.tensor(text_to_word2vec(doc, word2vec_model), dtype=torch.float32)
        columns.append(embedding.view(-1, 1))
    return torch.stack(columns, dim=0)


# NOTE: these assignments rebind the split variables from Python lists to
# tensors, so this cell is meant to run exactly once after the split.
X_train = _embed_texts(X_train)
X_test = _embed_texts(X_test)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_test = torch.tensor(Y_test, dtype=torch.long)

In [9]:
def zip_set(X, Y, num_pairs):
    """Randomly pair up samples and label each pair as same-class or not.

    The ``(sample, label)`` tuples are shuffled with the global ``random``
    module and then consumed two at a time, so every sample appears in at
    most one pair.

    Args:
        X: Sequence of samples (anything ``np.array`` can stack).
        Y: Sequence of class labels aligned with ``X``.
        num_pairs: Upper bound on the number of pairs to return. ``None``
            disables the bound. Fewer pairs are returned when the input
            holds fewer than ``2 * num_pairs`` samples.

    Returns:
        A ``(pairs, labels)`` tuple of NumPy arrays. ``pairs[k]`` holds the
        two samples of pair ``k``; ``labels[k]`` is 1 when both samples share
        the same label and 0 otherwise.
    """
    zipped_list = list(zip(X, Y))
    random.shuffle(zipped_list)  # Shuffle so the pairing is random

    # With an odd number of samples the last one has no partner; integer
    # division drops it instead of indexing past the end of the list.
    pair_count = len(zipped_list) // 2
    if num_pairs is not None:
        pair_count = min(pair_count, max(int(num_pairs), 0))

    pairs = []
    labels = []
    for k in range(pair_count):
        sample1, label1 = zipped_list[2 * k]
        sample2, label2 = zipped_list[2 * k + 1]
        pairs.append([sample1, sample2])
        labels.append(1 if label1 == label2 else 0)

    return np.array(pairs), np.array(labels)

In [10]:
# Inspect how many training labels there are.
Y_train.shape

torch.Size([3243])

In [11]:
# Build randomly matched training pairs and their same-class (1) / different-class (0) labels.
train_pairs, train_labels = zip_set(X_train, Y_train, num_pairs=10000)

In [12]:
# Build randomly matched evaluation pairs and their same-class (1) / different-class (0) labels.
test_pairs, test_labels = zip_set(X_test, Y_test, num_pairs=1000)

In [13]:
def create_base_net_1D(input_shape):
    """Build the shared sub-network applied to both members of a pair.

    Two Conv1D + pooling stages are followed by a flatten and three dense
    layers; the last dense layer emits a 10-unit tanh output. The model
    summary is printed as a side effect before the model is returned.

    Args:
        input_shape: Shape of one input sample, without the batch dimension.

    Returns:
        The Keras ``Model`` mapping an input of ``input_shape`` to the
        10-unit output.
    """
    net_input = Input(shape=input_shape)

    # Layers are listed in the exact order they are applied.
    layer_stack = [
        Conv1D(32, 3, activation='relu'),
        AveragePooling1D(pool_size=2),
        Conv1D(64, 3, activation='tanh'),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='tanh'),
        Dropout(0.5),
        Dense(64, activation='tanh'),
        Dropout(0.5),
        Dense(10, activation='tanh'),
    ]

    features = net_input
    for layer in layer_stack:
        features = layer(features)

    model = Model(inputs=net_input, outputs=features)
    model.summary()
    return model

In [14]:
# Instantiate the shared sub-network for inputs of shape (300, 1); the same
# instance is reused for both branches so their weights are shared.
base_network  = create_base_net_1D((300,1))

In [15]:
def euclid_dis(vects):
    """Euclidean distance between two batches of vectors.

    ``vects`` is a pair of tensors; squared differences are summed along
    axis 1 (keeping that axis), floored at the Keras backend epsilon, and
    square-rooted.
    """
    left, right = vects
    difference = left - right
    squared_distance = tf.reduce_sum(tf.square(difference), axis=1, keepdims=True)
    # Floor at epsilon before the square root, as the original expression does.
    floor = tf.keras.backend.epsilon()
    return tf.sqrt(tf.maximum(squared_distance, floor))

In [16]:
def eucl_dist_output_shape(shapes):
    """Return the output shape of the distance layer.

    ``shapes`` must hold exactly two input shapes; the result keeps the
    first entry of the first shape and has a single trailing column.
    """
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

In [17]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Contrastive loss over predicted pair distances.

    Pairs labelled 1 are penalised by their squared distance; pairs labelled
    0 are penalised by the squared amount by which their distance falls
    short of ``margin``.

    Args:
        y_true: Pair labels (1 for same-class, 0 otherwise), any numeric dtype.
        y_pred: Predicted distances.
        margin: Distance beyond which label-0 pairs contribute no loss.
            Defaults to 1, the value previously hard-coded.

    Returns:
        Scalar tensor: the mean loss over all elements.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Integer labels cannot be multiplied with float distances in TensorFlow,
    # so align the label dtype with the predictions first.
    y_true = tf.cast(y_true, y_pred.dtype)

    square_pred = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(y_true * square_pred + (1 - y_true) * margin_square)

In [18]:
def compute_accuracy(y_true, y_pred, threshold=0.5):
    """Accuracy of distance predictions against pair labels (NumPy version).

    A pair is predicted as "same class" when its distance is below
    ``threshold``. Both inputs are flattened before comparison, so a column
    vector of labels with shape (N, 1) no longer broadcasts against the
    flattened predictions into an (N, N) matrix.

    Args:
        y_true: Array-like of pair labels (1 for same-class, 0 otherwise).
        y_pred: Array-like of predicted distances, same number of elements.
        threshold: Distance below which a pair counts as same-class.
            Defaults to 0.5, the value previously hard-coded.

    Returns:
        Fraction of pairs whose thresholded prediction equals the label.
    """
    predicted_same = np.asarray(y_pred).ravel() < threshold
    labels = np.asarray(y_true).ravel()
    return np.mean(predicted_same == labels)

In [19]:
def accuracy(y_true, y_pred):
    """Tensor version of the pair accuracy, used as a Keras metric.

    Distances below 0.5 count as a "same class" prediction; the result is
    the mean agreement with ``y_true`` as a float32 tensor.
    """
    below_threshold = y_pred < 0.5
    # Match the label dtype so the equality check compares like with like.
    predicted = tf.cast(below_threshold, y_true.dtype)
    matches = tf.equal(y_true, predicted)
    return tf.reduce_mean(tf.cast(matches, tf.float32))

In [20]:
# Two inputs of shape (300, 1), one per member of a pair.
input_a, input_b = Input(shape=(300, 1)), Input(shape=(300, 1))

# Both inputs go through the same base_network instance, so the two branches
# share one set of weights.
processed_a, processed_b = base_network(input_a), base_network(input_b)

# The model output is the Euclidean distance between the two branch outputs.
distance = Lambda(
    euclid_dis,
    output_shape=eucl_dist_output_shape,
)([processed_a, processed_b])

model = Model(inputs=[input_a, input_b], outputs=distance)

In [21]:
# Number of passes over the training pairs.
epochs=10

In [22]:
# Compile with RMSprop (default settings), the contrastive loss and the
# thresholded-distance accuracy metric.
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
# Each pair is split into its two members, which feed the two model inputs;
# the test pairs serve as validation data.
# NOTE(review): in the recorded run val_accuracy stays at 0.2230 on every
# epoch while training accuracy is around 0.74 — the model does not appear
# to generalise, so investigate before trusting it. The recorded 1250
# steps/epoch at batch_size=8 (about 10000 pairs) also suggests the output
# came from a different pair-building routine than the one now in this
# notebook — confirm by re-running from a fresh kernel.
model.fit([train_pairs[:, 0], train_pairs[:, 1]], train_labels,
          batch_size=8,
          epochs=epochs,
          validation_data=([test_pairs[:, 0], test_pairs[:, 1]], test_labels))

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.7131 - loss: 0.2304 - val_accuracy: 0.2230 - val_loss: 0.7746
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.7439 - loss: 0.2145 - val_accuracy: 0.2230 - val_loss: 0.7765
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.7385 - loss: 0.2178 - val_accuracy: 0.2230 - val_loss: 0.7765
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.7437 - loss: 0.2082 - val_accuracy: 0.2230 - val_loss: 0.7763
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.7518 - loss: 0.2059 - val_accuracy: 0.2230 - val_loss: 0.7765
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.7378 - loss: 0.2156 - val_accuracy: 0.2230 - val_loss: 0.7765
Epoch 7/10
[

<keras.src.callbacks.history.History at 0x7f3ef8d5fe20>