In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-20-questions/llm_20_questions/llm_20_questions.json
/kaggle/input/llm-20-questions/llm_20_questions/llm_20_questions.py
/kaggle/input/llm-20-questions/llm_20_questions/keywords.py
/kaggle/input/llm-20-questions/llm_20_questions/llm_20_questions.js


In [None]:
# 기존 코드에서는 gemma 모델을 사용했지만, 이번엔 llama3-8b 모델을 사용해보려 한다.

# from gemma.config import get_config_for_9b
# from gemma.model import GemmaForCasualLM

# 1. 환경 설정 및 데이터 불러오기


In [2]:
%%bash
mkdir ./submission

In [10]:
%%writefile -a submission/main.py

# import Tokenizer and Model
from transformers import AutoTokenizer, AutoModelForCasualLM
import torch

import os
import sys
import shutil

# Set Agent Path
KAGGLE_AGENT_PATH = './kaggle_simulations/agent/'

if os.path.exists(KAGGLE_AGENT_PATH):
    model_id = os.path.join(KAGGLE_AGENT_PATH, '1')
else:
    model_id = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
    
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCasualLM.from_pretrained(model_id, torch_dtype = torch.bfloat16, device_map="auto")
id_eot = tokenizer.convert_tokens_to_ids(["<|eot_id|>"])[0]

def generate_answer(template):
    inp_ids = tokenizer(template, return_tensors="pt").to("cuda")
    out_ids = model.generate(**inp_ids,max_new_tokens=15).squeeze()
    start_gen = inp_ids.input_ids.shape[1]
    out_ids = out_ids[start_gen:]
    if id_eot in out_ids:
        stop = out_ids.tolist().index(id_eot)
        out = tokenizer.decode(out_ids[:stop])
    else:
        out = tokenizer.decode(out_ids)
    return out


class Robot:
    def __init__(self):
        pass
    
    def on(self, mode, obs):
        assert mode in ["asking", "guessing", "answering"], "mode can only take one of these values: asking, answering, guessing"
        if mode == "asking":
            #launch the asker role
            output = self.asker(obs)
        if mode == "answering":
            #launch the answerer role
            output = self.answerer(obs)
            if "yes" in output.lower():
                output = "yes"
            elif "no" in output.lower():
                output = "no"   
            if ("yes" not in output.lower() and "no" not in output.lower()):
                output = "yes"
        if mode == "guessing":
            #launch the guesser role
            output = self.asker(obs)
        return output
    
    
    def asker(self, obs):
        sys_prompt = """You are a helpful AI assistant, and your are very smart in playing 20 questions game,
        the user is going to think of a word, it can be only one of the following 3 categories:
        1. a place
        2. a person
        3. a thing
        So focus your area of search on these options. and give smart questions that narrows down the search space\n"""
    
        if obs.turnType =="ask":
            ask_prompt = sys_prompt + """your role is to find the word by asking him up to 20 questions, your questions to be valid must have only a 'yes' or 'no' answer.
            to help you, here's an example of how it should work assuming that the keyword is Morocco:
            examle:
            <you: is it a place?
            user: yes
            you: is it in europe?
            user: no
            you: is it in africa?
            user: yes
            you: do most people living there have dark skin?
            user: no
            user: is it a country name starting by m ?
            you: yes
            you: is it Morocco?
            user: yes.>

            the user has chosen the word, ask your first question!
            please be short and not verbose, give only one question, no extra word!"""
            chat_template = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{ask_prompt}<|eot_id|>"""
            chat_template += "<|start_header_id|>assistant<|end_header_id|>\n\n"
            if len(obs.questions)>=1:
                for q, a in zip(obs.questions, obs.answers):
                    chat_template += f"{q}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
                    chat_template += f"{a}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
                    
        elif obs.turnType == "guess":
            conv = ""
            for q, a in zip(obs.questions, obs.answers):
                conv += f"""Question: {q}\nAnswer: {a}\n"""
            guess_prompt =  sys_prompt + f"""so far, the current state of the game is as following:\n{conv}
            based on the conversation, can you guess the word, please give only the word, no verbosity around"""
            chat_template = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{guess_prompt}<|eot_id|>"""
            chat_template += "<|start_header_id|>assistant<|end_header_id|>\n\n"
                
        output = generate_answer(chat_template)        
        return output
        
        
        
    def answerer(self, obs):
        sys_prompt = f"""you are a helpful AI assistant, and your are very smart in playing 20 questions game,
        the role of the user is to guess the word by asking you up to 20 questions, your answers to be valid must be a 'yes' or 'no', any other answer is invalid and you lose the game.
        Know that the user will always guess a word belonging to one of the following 3 categories:
        1. a place
        2. a person
        3. a thing
        so make sure you understand the user's question and you understand the keyword you're playig on.
        for now the word that the user should guess is: "{obs.keyword}", it is of category "{obs.category}",
        to help you, here's an example of how it should work assuming that the keyword is Morocco in the category "place":
        examle:
        <user: is it a place?
        you: yes
        user: is it in europe?
        you: no
        user: is it in africa?
        you: yes
        user: do most people living there have dark skin?
        you: no
        user: is it a country name starting by m ?
        you: yes
        user: is it Morocco?
        you: yes.>"""
        
        chat_template = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{sys_prompt}<|eot_id|>"""
        chat_template += "<|start_header_id|>user<|end_header_id|>\n\n"
        chat_template += f"{obs.questions[0]}<|eot_id|>"
        chat_template += "<|start_header_id|>assistant<|end_header_id|>\n\n"
        if len(obs.answers)>=1:
            for q, a in zip(obs.questions[1:], obs.answers):
                chat_template += f"{a}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
                chat_template += f"{q}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        output = generate_answer(chat_template)
        return output
    
    
robot = Robot()


def agent(obs, cfg):
    
    if obs.turnType =="ask":
        response = robot.on(mode = "asking", obs = obs)
        
    elif obs.turnType =="guess":
        response = robot.on(mode = "guessing", obs = obs)
        
    elif obs.turnType =="answer":
        response = robot.on(mode = "answering", obs = obs)
        
    if response == None or len(response)<=1:
        response = "yes"
        
    return response

Appending to submission/main.py


In [11]:
!apt install pigz pv > /dev/null





In [12]:
!tar --use-compress-program='pigz --fast --recursive | pv' -cf submission.tar.gz -C /kaggle/input/llama-3/transformers/8b-chat-hf . -C /kaggle/working/submission .

tar: /kaggle/input/llama-3/transformers/8b-chat-hf: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


# 파일 살펴보기

데이터셋이 업데이트되었다.
kaggle-environments/kaggle_environments/envs/llm_20_questions
/keywords.py  
(https://github.com/Kaggle/kaggle-environments/blob/master/kaggle_environments/envs/llm_20_questions/keywords.py)
- 확인해보면 country, city, landmark 가 전부 place로 바뀌었고,
- things가 대거 추가되었다고 한다.

In [7]:
# 기존 로컬 패스
# file_path = "kaggle_environments/envs/llm_20_questions/keywords.py" 

# data file in github
# 데이터 업데이트로 키워드에 있는 내용이 조금 수정됨.
# Hidden keywords가 존재하므로, decision tree base로 진행하기에는 조금 어려울 수 있을듯.
!wget https://github.com/Kaggle/kaggle-environments/blob/master/kaggle_environments/envs/llm_20_questions/keywords.py -O keywords.py

--2024-07-05 15:31:39--  https://github.com/Kaggle/kaggle-environments/blob/master/kaggle_environments/envs/llm_20_questions/keywords.py
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: 'keywords.py'

keywords.py             [  <=>               ] 392.88K  1.54MB/s    in 0.2s    

2024-07-05 15:31:40 (1.54 MB/s) - 'keywords.py' saved [402310]



아이디어  
1) vector db를 만드는 건 어떨까? (어떻게 만드는건진 아직 모름)

In [16]:
# 키워드 파일 경로 추가
import sys
import json
from pprint import pprint as pp

keywords_path = "/kaggle/input/llm-20-questions/llm_20_questions/"
sys.path.insert(0, keywords_path)
json_file_path = '/kaggle/input/llm-20-questions/llm_20_questions/llm_20_questions.json'


# 키워드 파일 임포트 시도
try:
    import keywords
    print("keywords 모듈 임포트 성공!")
except ModuleNotFoundError:
    print("keywords 모듈을 찾을 수 없습니다. 경로를 확인하세요.")


keywords 모듈 임포트 성공!


In [9]:
with open(file_path) as json_file:
    json_data = json.load(json_file)

keywords 모듈 임포트 성공!


In [19]:
file_path = "./keywords.py"

In [25]:
with open('./keywords.py', 'r') as file:
    content = file.read()
    print(content)







<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"
  data-a11y-animated-images="system" data-a11y-link-underlines="true"
  >



  <head>
    <meta charset="utf-8">
  <link rel="dns-prefetch" href="https://github.githubassets.com">
  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">
  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">
  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">
  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>
  <link rel="preconnect" href="https://avatars.githubusercontent.com">

  


  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-efd2f2257c96.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-6b1e37da2254.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" medi

In [22]:
# Keywords가 어떻게 생긴 파일인지 조금 더 살펴보자 (EDA)

import json

# json file의 형태로 되어있어서, 이것만 골라내려 함.
with open(file_path, 'r') as f:
    file_contents = f.read()
    
    # JSON 데이터만 추출
    start_idx = file_contents.find('KEYWORDS_JSON = """') + len('KEYWORDS_JSON = """')
    end_idx = file_contents.find('"""', start_idx)
    json_data_str = file_contents[start_idx:end_idx].strip()
    
    # JSON parsing
#     json_data = json.loads(json_data_str)

In [27]:
import requests

# GitHub 파일 URL
url = 'https://github.com/Kaggle/kaggle-environments/blob/master/kaggle_environments/envs/llm_20_questions/keywords.py'

# 파일 다운로드
response = requests.get(url)
response.raise_for_status()  # 요청에 실패하면 예외를 발생시킵니다.

# 파일 저장
with open('downloaded_data.py', 'w', encoding='utf-8') as file:
    file.write(response.text)

print("파일이 성공적으로 다운로드되었습니다.")

<_io.TextIOWrapper name='downloaded_data.py' mode='w' encoding='utf-8'>
파일이 성공적으로 다운로드되었습니다.


In [30]:
# 파일을 열고 데이터 부분을 추출합니다.
with open('downloaded_data.py', 'r', encoding='utf-8') as file:
    content = file.read()
    print(content)
# # "KEYWORDS_JSON = """ 부분을 찾아서 JSON 데이터를 추출합니다.
# start = content.find('KEYWORDS_JSON = """') + len('KEYWORDS_JSON = """')
# end = content.find('"""', start)
# json_data = content[start:end].strip()

# print(json_data)  # JSON 데이터 확인







<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"
  data-a11y-animated-images="system" data-a11y-link-underlines="true"
  >



  <head>
    <meta charset="utf-8">
  <link rel="dns-prefetch" href="https://github.githubassets.com">
  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">
  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">
  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">
  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>
  <link rel="preconnect" href="https://avatars.githubusercontent.com">

  


  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-efd2f2257c96.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-6b1e37da2254.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" medi

In [23]:
pp(json_data[0])

KeyError: 0

In [None]:
# category 별 분포.
# 첫 질문이 is it a city? 로만 시작해도 절반은 거르고 갈 수 있음
categories = [(item['category'], len(item['words'])) for item in json_data]
print(categories)

In [None]:
json_data[1]['words']