In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
import random

# Load dataset
def load_dataset(file_path):
    """Build a tf.data pipeline of comparison examples from a text file.

    Every line of the file is split on tab characters and the first four
    fields are mapped to named entries.

    Args:
        file_path: Path passed to ``tf.data.TextLineDataset``.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts holding string tensors
        under 'question', 'answer1' and 'answer2', and an int32 tensor parsed
        from the fourth field under 'label'.
    """
    def parse_line(line):
        # Field order in the file: question, answer1, answer2, label.
        fields = tf.strings.split(line, '\t')
        return {
            'question': fields[0],
            'answer1': fields[1],
            'answer2': fields[2],
            'label': tf.strings.to_number(fields[3], tf.int32),
        }

    return tf.data.TextLineDataset(file_path).map(parse_line)

# Tokenize dataset
# Module-level tokenizer for the 'google/gemma-2b' checkpoint. It is created
# once when this statement runs and is reused by tokenize_function for the
# question and both answers of every example.
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b')

def tokenize_function(example):
    """Tokenize the question and both answers of one example.

    Must run eagerly: it calls ``.numpy()`` on the text tensors to obtain
    Python strings for the module-level ``tokenizer``.

    Args:
        example: Mapping with eager string tensors under 'question',
            'answer1' and 'answer2', plus a 'label' entry that is passed
            through unchanged.

    Returns:
        Dict with the tokenizer's 'input_ids' and 'attention_mask' for the
        question (padded/truncated to 128 tokens) and for each answer
        (padded/truncated to 256 tokens), plus the original 'label'.
    """
    def encode(field, limit):
        # Decode the tensor's bytes to text, then pad/truncate to `limit`.
        text = example[field].numpy().decode('utf-8')
        return tokenizer(text, truncation=True, padding='max_length', max_length=limit)

    question_tokens = encode('question', 128)
    first_answer_tokens = encode('answer1', 256)
    second_answer_tokens = encode('answer2', 256)

    features = {}
    features['input_ids_question'] = question_tokens['input_ids']
    features['attention_mask_question'] = question_tokens['attention_mask']
    features['input_ids_answer1'] = first_answer_tokens['input_ids']
    features['attention_mask_answer1'] = first_answer_tokens['attention_mask']
    features['input_ids_answer2'] = second_answer_tokens['input_ids']
    features['attention_mask_answer2'] = second_answer_tokens['attention_mask']
    features['label'] = example['label']
    return features

def tf_tokenize_function(example):
    """Graph-compatible wrapper around ``tokenize_function`` for ``Dataset.map``.

    ``tf.py_function`` only accepts tensors (or a flat list of them) as
    inputs and dtypes (or a flat list of them) as ``Tout``; it cannot take or
    return dicts. The example dict is therefore unpacked into a flat list of
    tensors, rebuilt as a dict inside the eager wrapper, and the flat list of
    outputs is reassembled into a dict afterwards.

    Args:
        example: Dict with scalar string tensors under 'question', 'answer1'
            and 'answer2', and an int32 tensor under 'label'.

    Returns:
        Dict of int32 tensors with static shapes: question ids/mask of shape
        [128], answer ids/masks of shape [256], and a scalar 'label'.
    """
    # Output name -> static shape. py_function drops shape information, so it
    # is restored below; the lengths mirror the max_length values used by
    # tokenize_function.
    output_shapes = {
        'input_ids_question': [128],
        'attention_mask_question': [128],
        'input_ids_answer1': [256],
        'attention_mask_answer1': [256],
        'input_ids_answer2': [256],
        'attention_mask_answer2': [256],
        'label': [],
    }
    output_names = list(output_shapes)

    def _tokenize_eagerly(question, answer1, answer2, label):
        # Runs eagerly: rebuild the dict that tokenize_function expects and
        # flatten its result into the fixed order given by output_names.
        tokenized = tokenize_function({
            'question': question,
            'answer1': answer1,
            'answer2': answer2,
            'label': label,
        })
        return [tokenized[name] for name in output_names]

    outputs = tf.py_function(
        _tokenize_eagerly,
        inp=[example['question'], example['answer1'], example['answer2'], example['label']],
        Tout=[tf.int32] * len(output_names),
    )

    result = dict(zip(output_names, outputs))
    for name, shape in output_shapes.items():
        result[name].set_shape(shape)
    return result

# Create model
class ComparisonModel(tf.keras.Model):
    """Scores a (question, answer1, answer2) triple with a shared text encoder.

    The same base model encodes the question and both answers. Each sequence
    is reduced to one vector by attention-mask-weighted mean pooling, the
    three vectors are concatenated, and a single sigmoid unit produces the
    output.

    NOTE(review): ``TFAutoModel`` needs a TensorFlow implementation of the
    checkpoint's architecture; confirm one exists for the checkpoint passed
    in.
    """

    def __init__(self, base_model_name):
        """Load the shared encoder and create the classification head.

        Args:
            base_model_name: Checkpoint identifier passed to
                ``TFAutoModel.from_pretrained``.
        """
        super().__init__()
        self.base_model = TFAutoModel.from_pretrained(base_model_name)
        self.dense = tf.keras.layers.Dense(1, activation='sigmoid')

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average hidden states over real tokens only, ignoring padding."""
        mask = tf.cast(tf.expand_dims(attention_mask, axis=-1), hidden_states.dtype)
        summed = tf.reduce_sum(hidden_states * mask, axis=1)
        token_counts = tf.reduce_sum(mask, axis=1)
        # divide_no_nan yields zeros for a sequence whose mask is all zero
        # instead of propagating NaN into the loss.
        return tf.math.divide_no_nan(summed, token_counts)

    def _encode(self, input_ids, attention_mask, training):
        """Run the shared encoder on one sequence batch and pool the result."""
        # Index 0 is taken as the per-token hidden states, assumed to be
        # (batch, seq_len, hidden) -- the pooling over axis 1 relies on this.
        hidden_states = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            training=training,
        )[0]
        return self._masked_mean(hidden_states, attention_mask)

    def call(self, inputs, training=False):
        """Compute the comparison score for a batch.

        Args:
            inputs: Dict with 'input_ids_<part>' and 'attention_mask_<part>'
                tensors for each part in question, answer1, answer2.
            training: Forwarded to the base model so that train-only
                behaviour (e.g. dropout) follows the Keras training phase.

        Returns:
            Tensor of sigmoid outputs with a trailing dimension of 1.
        """
        question_embedding = self._encode(
            inputs['input_ids_question'], inputs['attention_mask_question'], training)
        answer1_embedding = self._encode(
            inputs['input_ids_answer1'], inputs['attention_mask_answer1'], training)
        answer2_embedding = self._encode(
            inputs['input_ids_answer2'], inputs['attention_mask_answer2'], training)

        # Concatenate embeddings
        combined = tf.concat(
            [question_embedding, answer1_embedding, answer2_embedding], axis=-1)
        return self.dense(combined)

# Build the comparison model, then attach optimizer, loss and metric.
# NOTE(review): confirm that a TensorFlow implementation exists for this
# checkpoint; TFAutoModel cannot load architectures that lack one.
model = ComparisonModel('google/gemma-2b')

# Binary setup: the model emits one sigmoid value per example.
metrics = [tf.keras.metrics.BinaryAccuracy()]
loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

# Load and tokenize dataset for training
file_path = 'edukasi_it_qa_comparison.txt'


def _split_features_and_label(example):
    """Turn one tokenized example dict into the (inputs, target) pair Keras expects.

    Keras treats a dataset element that is a single dict as inputs only, so
    the label has to be moved out of the feature dict for the loss to
    receive targets.
    """
    features = {name: value for name, value in example.items() if name != 'label'}
    # Give the label a static shape of [1] so that, once batched, it lines up
    # with the model's (batch, 1) sigmoid output.
    label = tf.reshape(example['label'], [1])
    return features, label


dataset = load_dataset(file_path)
dataset = dataset.map(tf_tokenize_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.map(_split_features_and_label)
dataset = dataset.batch(32)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Train model
# validation_steps was dropped: it only takes effect together with
# validation_data, and no validation dataset is supplied here.
model.fit(dataset, epochs=10)