In [2]:
import glob
import json
import os
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, Tuple

import geojson
import torch
import transformers
from dotenv import load_dotenv
from transformers import (AutoTokenizer,
                          pipeline,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          AutoConfig, GenerationConfig)

In [3]:
# Project root: the parent of the (resolved) current working directory.
ROOT = Path.cwd().resolve().parent

In [4]:
# Read environment variables from `config.env` located in the project root.
load_dotenv(dotenv_path=ROOT / 'config.env')

True

In [5]:
# Checkpoint used for generation (alternative noted by the author: "meta-llama/Meta-Llama-3-8B").
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct'
# Hugging Face access token taken from the environment; None when the variable is unset.
HF_TOKEN = os.getenv('HF_TOKEN')
# Folder the generated answers are written to; created (with parents) if missing.
SAVE_FOLDER = (ROOT / 'llama_answers_dataset_en_instruction').resolve()
SAVE_FOLDER.mkdir(parents=True, exist_ok=True)

In [6]:
# Building area should be rounded to 3 decimal places.
# If the answer uses numerical characteristics of the building, round all numbers in the answer.
# The number of residents in a building should always be an integer in the answer.
# If the answer concerns numerical characteristics of the building, round all numbers in the answer mathematically, e.g. 115.4 becomes 115 and 234.8 becomes 235.

In [50]:
# Keys of a feature's `properties` whose values are rounded to whole numbers
# by `preprocess_context`.
NUMERICAL_DATA = ['building_area', 'living_area', 'storeys_count', 'resident_number', 'population_balanced', 'lift_count', 'building_year']


def preprocess_context(data: Dict) -> Dict:
    """Return a deep copy of ``data`` with the ``NUMERICAL_DATA`` fields rounded.

    Args:
        data: mapping with a ``'features'`` list; every feature carries a
            ``'properties'`` mapping (or ``None``).

    Returns:
        A new structure; ``data`` itself is never modified. For every feature,
        each property listed in ``NUMERICAL_DATA`` that can be converted to a
        finite number is replaced by ``round(float(value))`` (an ``int``).
        Missing/``None`` values, NaN, infinities and non-numeric strings are
        left untouched instead of raising.

    Raises:
        KeyError: if ``data`` has no ``'features'`` key or a feature has no
            ``'properties'`` key.
    """
    preprocessed = deepcopy(data)
    for chunk in preprocessed['features']:
        properties = chunk['properties']
        if not properties:
            # A feature may carry `"properties": null`; nothing to round then.
            continue
        for num_col in NUMERICAL_DATA:
            value = properties.get(num_col)
            if value is None:
                continue
            try:
                # Built-in round(): ties go to the nearest even integer.
                properties[num_col] = round(float(value))
            except (TypeError, ValueError, OverflowError):
                # NaN, +/-inf or a non-numeric value: keep the raw value rather
                # than aborting the whole context.
                continue
    return preprocessed
        


def get_prompt(question: str, context: Dict, *args, **kwargs) -> str:
    """Build the Llama 3 prompt string for one question and its context.

    Args:
        question: the user's question, inserted after ``Вопрос:``.
        context: data the model must answer from; it is interpolated into the
            prompt via its string representation.
        *args: accepted but not used.
        **kwargs: optional overrides:
            ``system_prompt`` replaces the default system sentence;
            ``additional_rules`` replaces the default rule list.

    Returns:
        The prompt with a system section (system prompt followed by the rules)
        and a user section (context and question), ending with an open
        assistant header.
    """
    base_system_prompt = 'You are a smart AI assistant, developed to help users with their questions.'
    # No placeholders in this text, so a plain (non-f) string is equivalent.
    base_rules = '''Answer the question following rules below. For answer you must use provided by user context. Your answer should be in Russian language.
    Rules:
    The answer should have three sentences maximum. The answer should be short, without any explanation, and redundant or unrelated information.
    Add a unit of measurement to an answer.
    If there are several organizations in the building, all of them should be mentioned in the answer.
    The building's address (street, house number, building) in the user's question should exactly match a building address from the context.
    For answer you should take only that infromation from context, which exactly match a building address (street, house number, building) from the user's question.
    If provided by user context for a given address has "null" or "None" for the property, it means the data about this property of the building is absent.
    In questions about building failure, 0 in the context's corresponding field means "no", and 1 - means "yes".
    If data for an answer is absent, answer that data was not provided or absent and mention for what field there was no data.
    If you do not know how to answer the questions, say so.'''
    system_prompt = kwargs['system_prompt'] if 'system_prompt' in kwargs else base_system_prompt
    rules = kwargs['additional_rules'] if 'additional_rules' in kwargs else base_rules
    # NOTE(review): the leading newline and the indentation below are part of
    # the string sent to the model; kept exactly as they were.
    return f"""
            <|begin_of_text|><|start_header_id|>system<|end_header_id|>
            {system_prompt} {rules}<|eot_id|>
            <|start_header_id|>user<|end_header_id|>
            Контекст :{context} Вопрос: {question}<|eot_id|>
            <|start_header_id|>assistant<|end_header_id|>
            """


def save_answer_as_json(answer: Dict) -> None:
    """Write ``answer`` to a new ``llama_ans_<N>.json`` file inside ``SAVE_FOLDER``.

    ``N`` starts at the number of ``*.json`` files already in the folder plus
    one and is increased until the name is free, so an existing answer is never
    overwritten (a plain count collides as soon as the numbering has gaps or
    the folder holds other JSON files).

    Args:
        answer: JSON-serialisable mapping to store.
    """
    f_pref = 'llama_ans'
    # Path.glob replaces the undocumented glob.glob1 helper.
    file_number = sum(1 for _ in SAVE_FOLDER.glob('*.json')) + 1
    target = SAVE_FOLDER / f'{f_pref}_{file_number}.json'
    while target.exists():
        file_number += 1
        target = SAVE_FOLDER / f'{f_pref}_{file_number}.json'
    with open(target, 'w', encoding='utf-8') as out_file:
        # The file is UTF-8, so keep non-ASCII (Cyrillic) text readable instead
        # of escaping it as \uXXXX; json.load returns the same data either way.
        json.dump(answer, out_file, ensure_ascii=False)


def get_query(idx: int) -> Tuple[Dict, Dict]:
    """Load the question set and the building context that share index ``idx``.

    Args:
        idx: number used in both file names:
            ``ROOT/data/datasets/data_<idx>.json`` and
            ``ROOT/data/buildings/buildings_part_<idx>.geojson``.

    Returns:
        ``(questions, manual_context)``: the parsed JSON content and the
        parsed GeoJSON content.

    Raises:
        FileNotFoundError: if either file does not exist.
    """
    questions_path = Path(ROOT, 'data', 'datasets', f'data_{idx}.json')
    buildings_path = Path(ROOT, 'data', 'buildings', f'buildings_part_{idx}.geojson')
    # Explicit UTF-8: the data contains Cyrillic text and open()'s default
    # encoding depends on the platform locale.
    with open(questions_path, encoding='utf-8') as json_data:
        questions = json.load(json_data)
    with open(buildings_path, encoding='utf-8') as buildings_data:
        manual_context = geojson.load(buildings_data)
    return questions, manual_context


def multi_ans(model: Any, amount: int = 10, **kwargs) -> None:
    """Generate and store answers for the question files ``0 .. amount - 1``.

    For every index the questions and the context are loaded with
    ``get_query``; a prompt is built for each selected question, passed to
    ``model`` and the result is stored with ``save_answer_as_json``.

    Args:
        model: callable invoked as ``model(prompt, temperature=...)``; its
            result is read as ``result[0]["generated_text"]``.
        amount: number of question/context file pairs to process.
        **kwargs: ``temperature`` — generation temperature (default ``0.5``).
    """
    generation_temperature = kwargs.get('temperature', .5)
    for i in range(amount):
        queries, context = get_query(i)
        # The context is the same for every question of this file, so it is
        # preprocessed (deep-copied and rounded) once instead of per question.
        prepared_context = preprocess_context(context)
        total_questions = list(queries.keys())
        # NOTE(review): this takes the FIRST fifth of the questions in the file.
        # If the intent was "one variant out of every five", the selection
        # should probably be `total_questions[::5]` — confirm before changing.
        for q_id in range(len(total_questions) // 5):
            question_number = total_questions[q_id]
            question_response_pair: Dict = queries[question_number]
            query, response = question_response_pair['query'], question_response_pair['response']

            # Form a prompt from the query and the prepared context.
            prompt = get_prompt(question=query, context=prepared_context)
            answer = model(prompt, temperature=generation_temperature)

            json_ans = {'query': query,
                        # Keep only the text after the last header marker.
                        'llama_answer': answer[0]["generated_text"].split("<|end_header_id|>")[-1],
                        'ideal_ans': response,
                        'chunk': i,
                        'question_number': question_number}
            save_answer_as_json(json_ans)
    # Path.glob replaces the undocumented glob.glob1 helper.
    saved_total = sum(1 for _ in SAVE_FOLDER.glob('*.json'))
    print(f'Answers have been saved to {SAVE_FOLDER}. Amount: {saved_total}')


In [8]:
# 8-bit quantization settings; they only take effect when passed to
# `from_pretrained` (that argument is commented out in the model-loading cell).
quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
# `token=HF_TOKEN` is passed here the same way as for the model weights.
# `force_download=True` was dropped: it forced a network download on every
# run of the cell even when the files were already cached.
model_config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True, max_new_tokens=12000, token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Load the model weights in half precision and let `device_map="auto"` place
# them on the available devices.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # quantization_config=quantization_config, ## Uncomment, if quantization is required
    torch_dtype=torch.float16,
    device_map="auto",
    token=HF_TOKEN,
    trust_remote_code=True
)
# The generation-config attribute is `pad_token_id` (singular); the previous
# `pad_token_ids` assignment created an unused attribute. Fall back to the EOS
# token when the tokenizer defines no pad token, matching the pipeline cell.
model.generation_config.pad_token_id = (
    tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
)
model.eval()
print('Model is ready.')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model is ready.


In [10]:
# Build the text-generation pipeline around the already loaded model.
# The factory is called through the module (`transformers.pipeline`) because
# the result is stored under the name `pipeline`: calling the bare imported
# name would fail when this cell is run a second time, as the name would then
# refer to the pipeline object instead of the factory.
pipeline = transformers.pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_length=12000,
    device_map='auto',
)

In [11]:
# MULTI ans testing
# multi_ans(pipeline, amount=12)

In [51]:
# Load the questions and the building context stored under index 5.
questions, manual_context = get_query(idx=5)

In [52]:
# The question file holds multiple variants of the questions, so a single
# entry is picked by its position in the key order.
question_id = list(questions)[25]
question_response_pair = questions[question_id]
manual_question, target = question_response_pair['query'], question_response_pair['response']

print(f'Question_id: {question_id}', f'Question: {manual_question}', sep='\n')

Question_id: 51_3
Question: Какой тип проекта застройки дома по адресу "Санкт-Петербург, Шепетовская, 3"?


In [53]:
# Round the numeric fields of the context and embed it, together with the
# question, into the prompt.
query = get_prompt(manual_question, preprocess_context(manual_context))

In [54]:
# Run generation for the single prompt with temperature 0.2.
answer = pipeline(query, temperature=0.2)
# Keep only the text that follows the last header marker in the output.
llama_reply = answer[0]["generated_text"].split("<|end_header_id|>")[-1]

print(f'Вопрос: {manual_question}')
print(f'Верный ответ: {target}')
print(f'Ответ: {llama_reply}')

Вопрос: Какой тип проекта застройки дома по адресу "Санкт-Петербург, Шепетовская, 3"?
Верный ответ: Для дома по адресу "Санкт-Петербург, Шепетовская, 3" данная информация отсутствует
Ответ: 
             Тип проекта застройки дома по адресу "Санкт-Петербург, Шепетовская, 3" - "1-528".
