In [ ]:
# Install a fixed conda release (24.1.2) into the kernel's environment before adding packages.
%conda install conda=24.1.2

In [None]:
# Install transformers (imported below) and accelerate.
# NOTE(review): versions are not pinned here — consider pinning for reproducible runs.
%conda install transformers accelerate

In [None]:
import json
import os
import random
import tarfile

import boto3
import numpy as np
import sagemaker
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    MistralForCausalLM,
    Trainer,
    TrainingArguments,
)

In [ ]:
# Directory later passed as `cache_dir` when the models are downloaded.
model_path = "/home/ec2-user/SageMaker/transformers_cache/"

# Try to create the directory, choose the message for the outcome, then report once.
# Any OSError (including "already exists") is reported rather than raised.
try:
    os.makedirs(model_path)
except OSError:
    makedirs_report = "Creation of the directory %s failed or it already exists"
else:
    makedirs_report = "Successfully created the directory %s"
print(makedirs_report % model_path)

In [None]:
# Start from a default SageMaker session; in this cell it is only used to look up the
# default bucket and is replaced by a bucket-bound session further down.
sess = sagemaker.Session()
# Set this to a bucket name to override the session's default bucket.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # No explicit bucket configured: fall back to the session's default bucket.
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN. If get_execution_role() raises ValueError, look the role
# up through IAM by the fixed name 'sagemaker_execution_role' instead.
# NOTE(review): presumably the ValueError path is for runs outside SageMaker — confirm.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Recreate the session, this time bound explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved role, bucket and region for a quick visual check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [3]:
# Identifiers passed to `from_pretrained` / `load_dataset` in later cells.
student_id = "mistralai/Mistral-7B-v0.1"  # loaded as `student_model`
teacher_id = "mistralai/Mixtral-8x7B-v0.1"  # loaded as `teacher_model`; also the source of the working tokenizer
dataset_id = "cais/mmlu"
dataset_config = "all"  # configuration name passed as the second argument to load_dataset

In [None]:
# Load both tokenizers so their outputs can be compared in the next cell;
# they are deleted again once the check has run.
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_id)
student_tokenizer = AutoTokenizer.from_pretrained(student_id)

In [5]:
# Probe sentence fed to both tokenizers.
sample = "Here's our sanity check."

# Fail fast if teacher and student tokenize the probe differently; the message shows both
# encodings. Only this single sentence is compared.
assert teacher_tokenizer(sample) == student_tokenizer(sample), (
    "Tokenizers need to have the same output! "
    f"{teacher_tokenizer(sample)} != {student_tokenizer(sample)}"
)

In [6]:
# The comparison tokenizers are no longer needed; the working `tokenizer` is created
# separately from `teacher_id` in a later cell.
del teacher_tokenizer
del student_tokenizer

In [8]:
# 1-based choice position -> answer letter (1 -> "A", ..., 4 -> "D"); the keys line up with
# the `enumerate(..., start=1)` counter used when the choices are rendered.
idx_mapping = dict(enumerate("ABCD", start=1))

In [9]:
def hf_dataset_to_dict(dataset):
    """Return a plain dict mapping every name in ``dataset.column_names`` to ``dataset[name]``.

    Columns are inserted in the order ``column_names`` lists them.
    """
    columns = {}
    for name in dataset.column_names:
        columns[name] = dataset[name]
    return columns

In [10]:
def random_few_shot():
    """Return the few-shot prompt of one randomly chosen key of the global ``few_shot_dict``.

    The key is drawn with ``random.choice`` from the dict's keys in insertion order.
    """
    return few_shot_dict[random.choice(list(few_shot_dict))]

In [11]:
def append_question_to_prompt(data, show_answer=False):
    """Render one multiple-choice record as prompt text.

    Args:
        data: Mapping with a 'question' string, a 'choices' sequence and, when
            ``show_answer`` is true, an 'answer' entry.
        show_answer: If true, append the solved form ("Answer: <letter>" followed by a
            blank line) for use as a few-shot example; otherwise end with an open
            "Answer:" for the model to complete.

    Returns:
        The formatted prompt fragment as a string.

    Raises:
        KeyError: If there are more choices than ``idx_mapping`` has letters for, or a
            numeric answer falls outside the mapped range.
    """
    prompt = f"{data['question']}\n\n"

    # idx_mapping is keyed from 1, hence the enumerate start.
    for idx, choice in enumerate(data['choices'], start=1):
        prompt += f"{idx_mapping[idx]}. {choice}\n"

    if show_answer:
        answer = data['answer']
        # cais/mmlu encodes the answer as a 0-based class index (see the dataset card), so
        # printing it raw would show "Answer: 2" next to choices labelled A-D. Translate it
        # to the matching letter; an answer that is already a string is kept as given.
        if not isinstance(answer, str):
            answer = idx_mapping[int(answer) + 1]
        prompt += f"\nAnswer: {answer}\n\n"
    else:
        prompt += "\nAnswer:"

    return prompt

In [12]:
def get_subject_few_shot(subject_filter):
    """Concatenate the solved dev-split questions of one subject into a few-shot prompt.

    Reads the global ``dev_dataset`` and renders every row whose 'subject' equals
    ``subject_filter`` with ``append_question_to_prompt(..., show_answer=True)``, in row order.

    Args:
        subject_filter: Subject name to select.

    Returns:
        The concatenated prompt text, or an empty string when no row matches.
    """
    subjects = dev_dataset['subject']
    index_list = [i for i, subject in enumerate(subjects) if subject == subject_filter]
    if not index_list:
        return ''

    # Read each column once instead of once per selected row and field; indexing the
    # dataset by column name inside the loop would repeat that lookup for every example.
    questions = dev_dataset['question']
    choices = dev_dataset['choices']
    answers = dev_dataset['answer']

    rendered = [
        append_question_to_prompt(
            {
                'question': questions[i],
                'subject': subjects[i],
                'choices': choices[i],
                'answer': answers[i],
            },
            show_answer=True,
        )
        for i in index_list
    ]

    return ''.join(rendered)

In [13]:
def format_subject(subject):
    """Turn an underscore-separated subject id into words, each preceded by one space.

    The result therefore starts with a space, e.g. "abstract_algebra" -> " abstract algebra".
    """
    return "".join(" " + word for word in subject.split("_"))

In [14]:
def test_process(data):
    subject = data['subject']

    prompt = f"The following are multiple choice questions (with answers) about {format_subject(subject)}.\n\n"
    prompt += few_shot_dict[subject]
    prompt += append_question_to_prompt(data)
    
    tokenized_prompt = tokenizer(prompt, truncation=True, max_length=4096, padding=True)

    return tokenized_prompt

In [15]:
def train_process(data):
    """Build and tokenize the training prompt for one record.

    The prompt is a fixed generic header, the few-shot examples of a randomly drawn subject
    (via ``random_few_shot``), and the record's own question left open at "Answer:".
    Returns whatever the global ``tokenizer`` produces for that prompt.
    """
    header = "The following are multiple choice questions (with answers) about a random subject.\n\n"
    # Few-shot block first, then the question — evaluated in that order.
    prompt = "".join((header, random_few_shot(), append_question_to_prompt(data)))
    return tokenizer(prompt, truncation=True, max_length=4096, padding=True)

In [None]:
# Load the dataset/configuration named in the config cell.
dataset = load_dataset(dataset_id, dataset_config)

# Splits used below: 'auxiliary_train' is tokenized with train_process, 'test' with
# test_process, and 'dev' supplies the few-shot examples.
train_dataset = dataset['auxiliary_train']
test_dataset = dataset['test']
dev_dataset = dataset['dev']

In [17]:
# Build one few-shot prompt per subject present in the dev split.
few_shot_dict = {}
dev_dict = hf_dataset_to_dict(dev_dataset)
# Sort the unique subjects: the iteration order of a set of strings can change between
# interpreter runs, and random_few_shot() draws by position among few_shot_dict's keys, so
# an unsorted key order would make seeded sampling irreproducible.
subjects = sorted(set(dev_dict['subject']))
for subject in subjects:
    few_shot_dict[subject] = get_subject_few_shot(subject)

In [18]:
# Working tokenizer used by train_process / test_process, loaded from the teacher checkpoint.
tokenizer = AutoTokenizer.from_pretrained(teacher_id)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Apply train_process to the training split. The name is rebound, so re-running this
# cell maps the already-processed dataset again.
train_dataset = train_dataset.map(train_process)

In [None]:
# Apply test_process to the test split. The name is rebound, so re-running this
# cell maps the already-processed dataset again.
test_dataset = test_dataset.map(test_process)

In [None]:
# The teacher checkpoint (Mixtral-8x7B) is a mixture-of-experts model, not a dense Mistral
# model, so it must not be instantiated through MistralForCausalLM. AutoModelForCausalLM
# selects the architecture class from the checkpoint's own config.
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_id, cache_dir=model_path)
print(teacher_model)

In [None]:
# Load the student checkpoint, caching the download under model_path, and print the
# module tree for inspection.
student_model = MistralForCausalLM.from_pretrained(student_id, cache_dir=model_path)
print(student_model)

In [7]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    The predictions are reduced with ``np.argmax`` along axis 1 and scored against the
    labels with the global ``accuracy_metric``; only its "accuracy" entry is returned.

    NOTE(review): ``accuracy_metric`` is not defined in the visible part of the notebook —
    confirm it is created before this function is called.
    NOTE(review): argmax over axis 1 looks like it expects 2-D (batch, classes) scores —
    confirm against what the trainer actually passes.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    result = accuracy_metric.compute(predictions=predicted, references=references)
    return {"accuracy": result["accuracy"]}

In [None]:
# Spot check: subject of the first processed test row.
print(test_dataset[:1]['subject'])

In [None]:
# Spot check: token ids of the first processed test row.
print(test_dataset[:1]['input_ids'])

In [None]:
# Spot check: subject of the processed test row at index 200.
print(test_dataset[200:201]['subject'])

In [None]:
# Spot check: token ids of the processed test row at index 200.
print(test_dataset[200:201]['input_ids'])

In [None]:
# Spot check: every column of the processed training row at index 1000.
print(train_dataset[1000:1001])

In [None]:
# Spot check: the assembled few-shot prompt for one subject.
print(few_shot_dict['professional_accounting'])