In [1]:
import os
import math

import config

# Select the visible GPU(s) BEFORE TensorFlow is imported: CUDA_VISIBLE_DEVICES
# is only honoured if it is already in the environment when the CUDA runtime
# initialises. The project modules imported below presumably import TensorFlow
# themselves (confirm), so the variable is set ahead of them as well.
os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES    # specify GPU usage

import tensorflow as tf
import numpy as np
import pandas as pd

from dataset import CustomDataset, CustomDatasetSiamese, train_test_split, undersampling
from model import create_model, create_siamese_model



## Loading data
# Produces `train_df` and `val_df` (and `test_df` when a fresh split is made).
# Either reuses the CSV files written by an earlier run or splits the raw
# input file and persists the three resulting partitions.
print('Loading dataset...')
if config.ALREADY_SPLIT:
    # Reuse the split saved by a previous run; the test file is not read here.
    train_df = pd.read_csv(config.TRAIN_FILE)
    val_df = pd.read_csv(config.VALIDATION_FILE)
    print('Training set shape: ' + str(train_df.shape))
    print('Validation set shape: ' + str(val_df.shape))
    print('Loading finished.')
else:
    data_df = pd.read_csv(config.INPUT_FILE)
    # Hold out the test set first; the validation set is then carved out of the
    # remaining training rows, so VALIDATION_SIZE applies to that remainder,
    # not to the full input file.
    train_df, test_df = train_test_split(
        data_df, test_size=config.TEST_SIZE, shuffle=True, random_state=config.RANDOM_STATE
    )
    train_df, val_df = train_test_split(
        train_df, test_size=config.VALIDATION_SIZE, shuffle=True, random_state=config.RANDOM_STATE
    )
    print('Training set shape: ' + str(train_df.shape))
    print('Validation set shape: ' + str(val_df.shape))
    print('Test set shape: ' + str(test_df.shape))
    print('Loading finished.')

    # Persist the split so later runs can set ALREADY_SPLIT and skip re-splitting.
    # The files are written before any undersampling, so they hold the raw split.
    print('Saving training set & validation set & test set to local...')
    train_df.to_csv(config.TRAIN_FILE, index=False)
    val_df.to_csv(config.VALIDATION_FILE, index=False)
    test_df.to_csv(config.TEST_FILE, index=False)
    print('Saving finished.')
if config.UNDER_SAMPLING:
    train_df = undersampling(train_df)
    # NOTE(review): the validation set is undersampled too, so validation
    # metrics are measured on the resampled data rather than the split as
    # saved -- confirm this is intended.
    val_df = undersampling(val_df)

Loading dataset...
Training set shape: (69685, 3)
Validaiton set shape: (17421, 3)
Test set shape: (15371, 3)
Saving training set & validation set & test set to local...
Loading finished.
Saving finished.


In [4]:
## Processing data
# Wrap the training and validation frames in the batch dataset class that
# matches the configured architecture (sentence-pair input vs. Siamese input).
def _build_dataset(frame):
    """Return the dataset object for `frame`, chosen by `config.USE_SIAMESE`."""
    targets = frame[config.LABEL_FIELD].values.astype("float32")
    if config.USE_SIAMESE:
        # Siamese variant: the two sentence columns are passed separately.
        return CustomDatasetSiamese(
            sent=frame[config.SENTENCE_FIELD].values.astype("str"),
            sent2=frame[config.SENTENCE_FIELD2].values.astype("str"),
            labels=targets,
            batch_size=config.BATCH_SIZE,
        )
    # Pair variant: both sentence columns are passed together as one 2-column array.
    pair_columns = [config.SENTENCE_FIELD, config.SENTENCE_FIELD2]
    return CustomDataset(
        sentence_pairs=frame[pair_columns].values.astype("str"),
        labels=targets,
        batch_size=config.BATCH_SIZE,
    )


print('Processing dataset...')
train_set = _build_dataset(train_df)
val_set = _build_dataset(val_df)
print('Processing finished.')

Processing dataset...
Processing finished.


In [8]:
# Inspect the token ids of a single validation example; the output is one
# zero-padded row of length 64.
# The indexing presumably reads: batch 0 -> model inputs -> token-id tensor ->
# first example (confirm against the dataset class's __getitem__).
val_set[0][0][0][0]   # token id

<tf.Tensor: shape=(64,), dtype=int32, numpy=
array([ 101, 6874, 3309, 6820, 3621, 1400,  711,  784,  720, 5709, 1446,
       1108, 5310,  102, 2769, 4638, 6010, 6009, 5709, 1446, 2347, 6820,
       3621,  711,  784,  720, 6820, 6206, 1108, 5310,  102,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)>

In [9]:
# Inspect the attention mask of the same example: in the output, the 1s line up
# with the non-zero token ids and the 0s with the zero padding.
# Index 1 of the inputs is presumably the attention-mask tensor (confirm
# against the dataset class).
val_set[0][0][1][0]  # attention mask

<tf.Tensor: shape=(64,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)>

In [10]:
# Inspect the token type ids of the same example: in the output, the leading
# span is 0, the following span is 1, and the padding is 0 again.
# Index 2 of the inputs is presumably the token-type tensor (confirm against
# the dataset class).
val_set[0][0][2][0]  # token type id (segment id)

<tf.Tensor: shape=(64,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)>

In [1]:
import tensorflow as tf
from tensorflow.keras import backend as K

In [8]:
# Toy inputs for walking through a Euclidean distance step by step:
# two batches of two 3-d vectors. The second row is identical in both, so its
# squared differences are all zero.
x = tf.constant([[1, 2, 3], [1, 2, 3]], dtype=tf.float32)
y = tf.constant([[4, 5, 6], [1, 2, 3]], dtype=tf.float32)

# Element-wise squared difference; same shape as the inputs.
square = tf.math.square(x - y)
print(square)


tf.Tensor(
[[9. 9. 9.]
 [0. 0. 0.]], shape=(2, 3), dtype=float32)


In [9]:
# Sum the squared differences across each row; keepdims=True retains the
# reduced axis, so the result is a column with one value per row.
sum_square = tf.reduce_sum(square, axis=1, keepdims=True)
print(sum_square)


tf.Tensor(
[[27.]
 [ 0.]], shape=(2, 1), dtype=float32)


In [10]:
# Clamp the summed squares to at least K.epsilon() before taking the square
# root, so a row of identical vectors yields sqrt(epsilon) instead of exactly 0
# (presumably to keep the sqrt gradient finite at zero distance).
clamped_sum = tf.math.maximum(sum_square, K.epsilon())
result = tf.math.sqrt(clamped_sum)
print(result)

tf.Tensor(
[[5.1961522e+00]
 [3.1622776e-04]], shape=(2, 1), dtype=float32)
