In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datasets import Dataset

In [None]:
# Read the training questions and the test questions of the competition.
data, test = (
    pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv'),
    pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv'),
)

In [None]:
# Preview the first rows of the training DataFrame.
data.head()

In [None]:
# Path of the pretrained checkpoint used for both the tokenizer and the model.
checkpoint = '/kaggle/input/huggingface-bert/bert-large-cased'
batch_size = 2

# Answer letters in column order. Both lookup tables are derived from this
# list so the three definitions cannot drift out of sync.
options = ['A', 'B', 'C', 'D', 'E']
mapping = {letter: index for index, letter in enumerate(options)}
reverse_mapping = {index: letter for letter, index in mapping.items()}

# Encode the answer letters as integer class ids. The guard makes this cell
# safe to re-run: pushing already-encoded integers through `mapping` again
# would replace every answer with NaN.
if not pd.api.types.is_numeric_dtype(data['answer']):
    data['answer'] = data['answer'].map(mapping)

# Dataset

In [None]:
# Wrap the DataFrame in a Dataset and expose the target column as "label".
datasets = Dataset.from_pandas(data).rename_column("answer", "label")

In [None]:
# Helpers for displaying a random sample of rows from a Dataset object
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=1):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in one call, replacing the manual
    # draw-and-retry loop. max(0, ...) keeps a negative request an empty pick.
    picks = random.sample(range(len(dataset)), max(0, num_examples))

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show class names instead of integer ids; `names` is bound as a
            # default argument so the lambda does not depend on the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))
    
# to show a question, choices and correct answer
def show_one(example):
    """Print one question: its prompt, the five answer options and its label.

    ``example`` must provide the keys ``prompt``, ``A`` through ``E`` and
    ``label``.
    """
    print(f"prompt: {example['prompt']}")
    for letter in ("A", "B", "C", "D", "E"):
        print(f"  {letter} - {example[letter]}")
    print(f"Ground truth: option {example['label']}")

In [None]:
#show_random_elements(datasets,num_examples=3)

# Tokenizer

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer stored with the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Transformation

In [None]:
def preprocess_function(examples, option_names=None, tokenize_fn=None):
    """Tokenize a batch of questions as (prompt, option) pairs grouped per question.

    Args:
        examples: Batch in column format: ``examples["prompt"]`` is a list of
            prompts and ``examples[name]`` is the parallel list of option
            texts for every ``name`` in ``option_names``.
        option_names: Columns holding the answer options. Defaults to the
            notebook-level ``options`` list.
        tokenize_fn: Callable invoked as
            ``tokenize_fn(first, second, padding=False, truncation=True)`` that
            returns a mapping of feature name to a flat list with one entry
            per pair. Defaults to the notebook-level ``tokenizer``.

    Returns:
        dict mapping each feature name to a list with one entry per question;
        each entry is the list of per-option values for that question.
    """
    if option_names is None:
        option_names = options
    if tokenize_fn is None:
        tokenize_fn = tokenizer
    num_choices = len(option_names)
    num_questions = len(examples["prompt"])

    # Both flat lists are ordered question-major (q0/A, q0/B, ..., q1/A, ...).
    # Flattening the option columns one after another (all A's, then all B's,
    # ...) would pair each prompt with options that belong to other questions.
    first_sentences = [
        prompt for prompt in examples["prompt"] for _ in range(num_choices)
    ]
    second_sentences = [
        examples[name][row]
        for row in range(num_questions)
        for name in option_names
    ]

    tokenized_examples = tokenize_fn(
        first_sentences, second_sentences, padding=False, truncation=True
    )
    # Regroup the flat per-pair outputs into chunks of num_choices per question.
    return {
        key: [values[i : i + num_choices] for i in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }

In [None]:
# Apply preprocess_function to the whole dataset; batched=True passes several rows per call.
encoded_datasets=datasets.map(preprocess_function,batched=True)

In [None]:
# Fine Tuning Model

In [None]:
from transformers import TrainingArguments, Trainer, TFAutoModelForMultipleChoice
# Load the pretrained checkpoint as a TensorFlow multiple-choice model.
model= TFAutoModelForMultipleChoice.from_pretrained(checkpoint)

In [None]:
# Last path component of the checkpoint, i.e. the model name without its directory.
modelName = checkpoint.rsplit("/", 1)[-1]

# Training hyperparameters and reporting/publishing switches.
learning_rate = 1e-5
batch_size = batch_size
num_train_epochs = 10
weight_decay = 1e-3
report_to = None
token = False
push_to_hub = False

In [None]:
# Data collator that pads all examples of a batch to a common length

from dataclasses import dataclass
from transformers.tokenization_utils_base import (
    PreTrainedTokenizerBase,
    PaddingStrategy,
)
from typing import Optional, Union
import tensorflow as tf


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each feature dict holds, per key, one sequence per answer choice. The
    choices of all examples are flattened, padded together with
    ``tokenizer.pad`` and reshaped to ``(batch_size, num_choices, -1)``.

    Attributes:
        tokenizer: Tokenizer whose ``pad`` method pads the flattened choices.
        padding: Forwarded to ``tokenizer.pad``.
        max_length: Forwarded to ``tokenizer.pad``.
        pad_to_multiple_of: Forwarded to ``tokenizer.pad``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a dict of tensors plus a ``labels`` tensor.

        The label is read from the ``label`` key when the first feature has
        one, otherwise from ``labels``. The input dicts are left unmodified.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of popping them, so the caller's dicts stay
        # intact and the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One dict per (example, choice) pair in example-major order, built in
        # a single pass rather than by repeated list concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="np",
        )

        # Un-flatten
        batch = {
            k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()
        }
        # Add back labels
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch

In [None]:
# Sanity check: collate a sample of encoded rows into one padded batch.
accepted_keys = ["input_ids", "attention_mask", "label"]
# Cap the sample at the dataset size so that a dataset with fewer than 200
# rows does not raise IndexError.
sample_size = min(200, len(encoded_datasets))
features = [
    {k: v for k, v in encoded_datasets[i].items() if k in accepted_keys}
    for i in range(sample_size)
]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

In [None]:
# Decode the five encoded choices of the first example in the batch for visual inspection.
[tokenizer.decode(batch["input_ids"][0][i].numpy().tolist()) for i in range(5)]

In [None]:
data_collator = DataCollatorForMultipleChoice(tokenizer)

# Hold out 10% of the labelled rows; the fixed seed keeps the split reproducible.
encoded_datasets_2 = encoded_datasets.train_test_split(test_size=0.1, seed=43)

train_set = model.prepare_tf_dataset(
    encoded_datasets_2['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Validation data is not shuffled: evaluation does not benefit from random
# order, and per the prepare_tf_dataset documentation drop_remainder defaults
# to the shuffle setting, so shuffling could also drop a final partial batch.
val_set = model.prepare_tf_dataset(
    encoded_datasets_2['test'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Every labelled row, shuffled, for training on the full dataset.
all_set = model.prepare_tf_dataset(
    encoded_datasets,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
# Fine-tuning: optimizer and learning-rate schedule

In [None]:

from transformers import create_optimizer

# model.fit runs on `all_set`, which is already batched: len(all_set) is the
# number of optimizer steps per epoch, so it must not be divided by
# batch_size again. Undercounting the steps makes the schedule returned by
# create_optimizer finish well before training does.
total_train_steps = len(all_set) * num_train_epochs

# NOTE(review): `weight_decay` is defined with the other hyperparameters but is
# not passed to create_optimizer — confirm whether that is intended.
optimizer, schedule = create_optimizer(
    init_lr=learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps
)

In [None]:
import tensorflow as tf
# Configure the optimizer and an accuracy metric; no loss argument is passed here.
# NOTE(review): presumably the model falls back to its built-in loss — confirm.
model.compile(optimizer=optimizer,metrics=["accuracy"],)

In [None]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

# Train on all_set (built from the full encoded dataset) for num_train_epochs epochs.
model.fit(all_set,epochs=num_train_epochs,)

In [None]:
# Predict on the training data.
# NOTE(review): all_set was built with shuffle=True, so the row order of these
# predictions may not match the original dataset order — confirm before
# comparing them against labels.
predictions = model.predict(all_set)

In [None]:
import numpy as np
def predictions_to_map_output(predictions):
    """Turn per-option scores into space-separated top-3 answer letters.

    ``predictions`` is indexed as (row, option). For every row the indices of
    the three highest-scoring options are translated through
    ``reverse_mapping`` and joined with single spaces, best first.
    """
    def join_letters(letters):
        return ' '.join(letters)

    # Negating the scores makes the ascending argsort rank the best option first.
    ranked = np.argsort(-predictions)
    best_three = ranked[:, :3]
    to_letter = np.vectorize(reverse_mapping.get)
    return np.apply_along_axis(join_letters, 1, to_letter(best_three))

In [None]:
# The collator reads a label from every feature, so the unlabelled test rows
# get a constant placeholder label.
test['label'] = 0
raw_text_columns = ['prompt', 'A', 'B', 'C', 'D', 'E']
test_dataset = Dataset.from_pandas(test)
tokenized_test = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_text_columns,
)

In [None]:
accepted_keys = ["input_ids", "attention_mask", "label"]
# Collate every test row. A hard-coded row count raises IndexError on a smaller
# test set and leaves rows without predictions on a larger one, so the
# predictions would no longer line up with the ids written to the submission.
features = [
    {k: v for k, v in tokenized_test[i].items() if k in accepted_keys}
    for i in range(len(tokenized_test))
]

batch = DataCollatorForMultipleChoice(tokenizer)(features)

In [None]:
# Run inference on the collated test batch.
test_predictions = model.predict(batch)

In [None]:
# Build the submission from an explicit copy of the id column, so adding the
# prediction column does not write into a slice of `test` (this avoids pandas'
# SettingWithCopyWarning).
df_submission = test[['id']].copy()
rst = predictions_to_map_output(test_predictions.logits)
df_submission['prediction'] = rst

In [None]:
# Preview the first rows of the submission table.
df_submission.head()

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
df_submission.to_csv('submission.csv', index=False)