In [1]:
!pip install transformers 

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers



In [2]:
max_length = 128  # Maximum token length passed to the tokenizer for each encoded sentence pair.
batch_size = 32
epochs = 2

# Labels in our dataset, ordered by class index. The targets are built with
# `to_categorical(Quality)`, so index 0 must name Quality == 0 ("not_similar")
# and index 1 must name Quality == 1 ("similar"); otherwise `labels[class_idx]`
# reports the opposite class.
labels = ["not_similar", "similar"]

In [6]:
# Load the train / test / dev splits from tab-separated files.
# Lines pandas cannot parse are skipped rather than raising an error.
# NOTE(review): skipping is silent, so rows may be lost without warning —
# compare the row counts below against the line counts of the files.
train_df = pd.read_table("train.tsv", on_bad_lines="skip")
test_df = pd.read_table("test.tsv", on_bad_lines="skip")
dev_df = pd.read_table("dev.tsv", on_bad_lines="skip")

In [7]:
# Report the number of rows in each split.
print(f"Total train samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total dev samples: {len(dev_df)}")

Total train samples: 3458
Total test samples: 1639
Total dev samples: 480


In [8]:
# Preview the first five rows of the training split.
train_df.head(5)

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


In [9]:
# Summarise column dtypes and non-null counts of the training split.
train_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3458 entries, 0 to 3457
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Quality    3458 non-null   int64 
 1   #1 ID      3458 non-null   int64 
 2   #2 ID      3458 non-null   int64 
 3   #1 String  3458 non-null   object
 4   #2 String  3441 non-null   object
dtypes: int64(3), object(2)
memory usage: 135.2+ KB


In [10]:
# Remove every row that has a missing value in any column.
# Each split is modified in place, so its original index values are kept
# (the index is not renumbered after rows are removed).
train_df.dropna(axis=0, how="any", inplace=True)
test_df.dropna(axis=0, how="any", inplace=True)
dev_df.dropna(axis=0, how="any", inplace=True)

In [11]:
# Re-check the training split's non-null counts after rows with missing values were dropped.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3441 entries, 0 to 3457
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Quality    3441 non-null   int64 
 1   #1 ID      3441 non-null   int64 
 2   #2 ID      3441 non-null   int64 
 3   #1 String  3441 non-null   object
 4   #2 String  3441 non-null   object
dtypes: int64(3), object(2)
memory usage: 161.3+ KB


In [13]:
# Class balance of the training split: number of rows per Quality value.
print("Train target distribution", train_df["Quality"].value_counts(), sep="\n")

Train target distribution
Quality
1    2316
0    1125
Name: count, dtype: int64


In [14]:
# Class balance of the dev (validation) split: number of rows per Quality value.
print("Valid target distribution", dev_df["Quality"].value_counts(), sep="\n")

Valid target distribution
Quality
1    330
0    146
Name: count, dtype: int64


In [16]:
def _similarity_label(quality):
    """Return the text label for a Quality value: 0 -> "not_similar", anything else -> "similar"."""
    return "not_similar" if quality == 0 else "similar"


# Every split derives its text label from its OWN `Quality` column.
# Column assignment aligns on the index, so building the dev/test column from
# `train_df["Quality"]` would copy training labels onto unrelated rows (and
# leave NaN wherever the index has no match in the training split).
train_df["similarity"] = train_df["Quality"].apply(_similarity_label)
# One-hot targets: column 0 is Quality == 0, column 1 is Quality == 1.
y_train = tf.keras.utils.to_categorical(train_df.Quality, num_classes=2)

dev_df["similarity"] = dev_df["Quality"].apply(_similarity_label)
y_dev = tf.keras.utils.to_categorical(dev_df.Quality, num_classes=2)

test_df["similarity"] = test_df["Quality"].apply(_similarity_label)
y_test = tf.keras.utils.to_categorical(test_df.Quality, num_classes=2)

In [17]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of BERT-encoded sentence pairs.

    Args:
        sentence_pairs: Array of sentence pairs. It is indexed with an integer
            index array and the selection is converted with `.tolist()`
            (presumably a NumPy array of shape `(n, 2)` — confirm against the
            caller).
        labels: Array of labels, indexed the same way as `sentence_pairs`.
            Only read when `include_targets` is True.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data (on construction and
            after every epoch).
        include_targets: boolean, whether to include the labels.

    Returns:
        Tuples `([input_ids, attention_mask, token_type_ids], labels)`
        (or just `[input_ids, attention_mask, token_type_ids]`
        if `include_targets=False`). Every array is `int32`.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use the bert-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        # Apply the initial shuffle (a no-op when shuffle=False).
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        # Floor division: a trailing partial batch is never yielded, so the
        # last `len(sentence_pairs) % batch_size` samples are skipped.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True`. Truncation must be requested
            # explicitly: without it, pairs longer than `max_length` keep
            # their full length and the batch cannot be stacked into one
            # rectangular tensor.
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        # A fresh RandomState(42) is created on every call, so the same
        # permutation is applied to the current (already shuffled) order each
        # time; the resulting sequence of orders is fully deterministic.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)
