# Installing Packages

In [1]:
%%capture
!pip install git+https://github.com/huggingface/transformers.git  -U 
!pip install git+https://github.com/huggingface/accelerate.git  -U 
!pip install bitsandbytes 
!pip install git+https://github.com/huggingface/peft.git  -U 

# Import Libraries

In [2]:
import numpy as np
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from peft import PeftModel, PeftConfig

import os, re, string, ast
import random
from collections import Counter
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

# Set Up the Model

In [3]:
# Quantization settings passed to `from_pretrained` as `quantization_config`:
# 4-bit NF4 weights with double quantization and float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    # NOTE(review): presumably needed so modules that `device_map="auto"`
    # places on the CPU are allowed to stay in fp32 — confirm.
    llm_int8_enable_fp32_cpu_offload=True,
)

In [4]:
# Release unused memory cached by PyTorch's CUDA allocator before the model is loaded.
torch.cuda.empty_cache()

In [5]:
# Load the Mixtral 8x7B Instruct checkpoint from the Kaggle input directory,
# quantized according to `bnb_config` and placed on devices automatically.
# NOTE(review): `trust_remote_code=True` permits executing code shipped with
# the checkpoint; it may be unnecessary for this local checkpoint — confirm.
model = AutoModelForCausalLM.from_pretrained(
    "/kaggle/input/mixtral/pytorch/8x7b-instruct-v0.1-hf/1",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
from transformers import pipeline, AutoTokenizer
# Tokenizer loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(
    "/kaggle/input/mixtral/pytorch/8x7b-instruct-v0.1-hf/1",
    trust_remote_code=True,
)

In [7]:
def generate(prompt, max_new_tokens=1):
    """Sample a completion for ``prompt`` and return only the generated text.

    Args:
        prompt: The full prompt string sent to the text-generation pipeline.
        max_new_tokens: Maximum number of tokens to generate. Defaults to 1.

    Returns:
        The generated continuation with single and double quotes removed and
        surrounding whitespace stripped.
    """
    # Building the pipeline is loop-invariant work: build it once and reuse it
    # for as long as the global `model` / `tokenizer` objects are unchanged.
    cached = getattr(generate, "_pipe_cache", None)
    if cached is None or cached[0] is not model or cached[1] is not tokenizer:
        cached = (
            model,
            tokenizer,
            pipeline(task="text-generation", model=model, tokenizer=tokenizer),
        )
        generate._pipe_cache = cached
    pipe = cached[2]

    sequences = pipe(
        prompt,
        do_sample=True,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.95,
        # Ask for the continuation only, so the answer never has to be cut out
        # of the echoed prompt (which breaks if the question text itself
        # contains "[/INST]").
        return_full_text=False,
    )
    response = re.sub(r'[\'"]', '', sequences[0]['generated_text'])
    return response.strip()

# Load and Process Data

In [8]:
# Read the labelled training split from the Kaggle competition input directory
# and preview its first three rows.
df_train = pd.read_csv("/kaggle/input/fpt-ai-residency-batch-6-entry-test/b6_train_data.csv")
df_train.head(3)

Unnamed: 0,task_id,question,choices,answer
0,k10168,Question: What will be output of the following...,"['8 4 2', '8 4 2', '8 4 4', '8 4 3']",C
1,k10173,Question: What will be output of the following...,"['-4', '-5', '10', '11']",A
2,k10174,Question: Match the following.\n Group 1 ...,"['P-4. Q-1, R-2, S-3', 'P-3, Q-1, R-4, S-2', '...",B


In [9]:
# Read the unlabelled test split (no `answer` column) and preview its first
# three rows.
df_test = pd.read_csv("/kaggle/input/fpt-ai-residency-batch-6-entry-test/b6_test_data.csv")
df_test.head(3)

Unnamed: 0,task_id,question,choices
0,k10171,Question: What will be output of the following...,"['10', '9', '8', 'Error']"
1,k10182,Question: Consider line 3. Identify the compil...,"['No compilation error', 'Only a lexical error..."
2,k10184,Question: Assume the conflicts part (a) of thi...,['Equal precedence and left associativity; exp...


In [10]:
# List the distinct raw answer labels (the printed output shows inconsistent
# formats such as 'ANSWER: C' as well as NaN), then summarise every column.
print(df_train['answer'].unique())
df_train.describe()

['C' 'A' 'B' 'D' 'ANSWER: C' 'ANSWER: B' 'ANSWER: D' 'ANSWER: A' 'E' 'G'
 nan 'ANSWER:  D']


Unnamed: 0,task_id,question,choices,answer
count,3963,3963,3963,3949
unique,3963,3826,3451,11
top,k00701,Question: What is Conditional Rendering?,"['False', 'True']",A
freq,1,7,22,988


In [11]:
# Remove rows with missing values and exact duplicate rows, then reduce every
# answer to the last character of its stripped text (e.g. 'ANSWER: C' -> 'C').
df_train = (
    df_train
    .dropna()
    .drop_duplicates()
    .assign(answer=lambda frame: frame['answer'].astype(str).str.strip().str[-1])
)
print(df_train['answer'].unique())
df_train.describe()

['C' 'A' 'B' 'D' 'E' 'G']


Unnamed: 0,task_id,question,choices,answer
count,3949,3949,3949,3949
unique,3949,3814,3450,6
top,k00701,Question: What is Conditional Rendering?,"['False', 'True']",A
freq,1,7,22,1066


In [12]:
# Summary statistics (count / unique / top / freq) for the test split.
df_test.describe()

Unnamed: 0,task_id,question,choices
count,1253,1253,1253
unique,1253,1242,1163
top,k00700,Question: Match the following:,"['(1)', '(2)', '(3)', '(4)']"
freq,1,2,7


# Inference

In [13]:
# Lookup tables between 0-based option positions and their letter labels.
index2choice = dict(enumerate(string.ascii_uppercase))
choice2index = {letter: position for position, letter in index2choice.items()}

def choices2str(choices):
    """Render a stringified list of options as lettered lines.

    Args:
        choices: String holding a Python list literal, e.g. "['10', '9']".

    Returns:
        The options formatted as "A. <option>", "B. <option>", ... separated
        by blank lines, with surrounding whitespace stripped.
    """
    options = ast.literal_eval(choices)
    lettered = [
        index2choice[position] + ". " + str(option)
        for position, option in enumerate(options)
    ]
    return "\n\n".join(lettered).strip()

In [14]:
def response_parser(response):
    """Extract an answer letter from the model's raw response.

    Args:
        response: Text returned by the model.

    Returns:
        An uppercase letter. A single-letter response is returned uppercased;
        otherwise the first whitespace-separated token (after punctuation is
        removed) that is a single uppercase letter is returned. If no such
        token exists, a random choice among "A", "B", "C" and "D" is returned.
    """
    # Same letters as the keys of `choice2index`.
    valid_letters = set(string.ascii_uppercase)

    # Normalise the case of a one-character reply so 'a' is reported as 'A',
    # consistent with every other return path of this function.
    if len(response) == 1 and response.upper() in valid_letters:
        return response.upper()

    without_punctuation = response.translate(str.maketrans('', '', string.punctuation))
    # Only uppercase tokens are accepted here; lowercase 'a' is usually the
    # English article, not an answer.
    for token in without_punctuation.split():
        if token in valid_letters:
            return token
    return random.choice(["A", "B", "C", "D"])

In [15]:
def formatting_func(question, choices):
    """Build the instruction prompt for one multiple-choice question.

    Args:
        question: The question text.
        choices: String holding a Python list literal of the answer options.

    Returns:
        The prompt wrapped in ``[INST] ... [/INST]`` markers. The list of
        allowed letters in the response format is derived from the number of
        options, so it matches the options actually shown (for four options
        it reads "A, B, C, or D").
    """
    # Name exactly the letters that exist for this question instead of always
    # claiming A-D; questions may have fewer or more than four options.
    letters = list(string.ascii_uppercase[:len(ast.literal_eval(choices))])
    if len(letters) > 2:
        allowed = ", ".join(letters[:-1]) + ", or " + letters[-1]
    else:
        allowed = " or ".join(letters)

    # NOTE(review): the literal "<s>" may duplicate the BOS token if the
    # tokenizer also adds one on its own — confirm.
    text = f"""<s>[INST]You are a programming expert and will answer multiple-choice questions about code.  
Read the following question and options carefully and select the **best** answer.  

### Response Format:
- Reply with **only** the letter of the correct choice ({allowed}).  
- Do **not** provide explanations. 

### Question:
{question}

### Options:
{choices2str(choices)}

### Response:
[/INST]"""
    return text

In [16]:
# Column-oriented accumulator for predictions: parallel lists of task ids and
# predicted answers, filled by the inference loop.
data_dict = {
    "task_id": [],
    "answer": []
}

In [17]:
# Start from empty result lists so that re-running this cell never appends a
# second copy of the predictions to an already filled `data_dict`.
data_dict = {"task_id": [], "answer": []}

# Walk the three needed columns in lockstep instead of indexing with `.iloc`
# several times per row.
test_rows = zip(df_test["task_id"], df_test["question"], df_test["choices"])
for task_id, question, choices in tqdm(test_rows, total=df_test.shape[0], desc="Inference on Test set"):
    prompt = formatting_func(question, choices)
    response = generate(prompt)
    answer = response_parser(response)
    data_dict['task_id'].append(task_id)
    data_dict['answer'].append(answer)
    clear_output(wait=True)  # transformers logs a lot; keep the cell output short

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


# Submission

In [18]:
# Assemble the collected task ids and answers into a DataFrame and preview it.
submission = pd.DataFrame(data_dict)
submission.head()

Unnamed: 0,task_id,answer
0,k10171,D
1,k10182,D
2,k10184,A
3,k10206,D
4,k10215,D


In [19]:
# Sanity check: row count and the number of distinct predicted answers.
submission.describe()

Unnamed: 0,task_id,answer
count,1253,1253
unique,1253,5
top,k00700,A
freq,1,390


In [20]:
# Write the predictions to submission.csv without the DataFrame index column.
submission.to_csv("submission.csv", index=False)