In [1]:
import glob
import json
import os
import re
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, Tuple

import geojson
import pandas as pd
import torch
from dotenv import load_dotenv
from transformers import (AutoConfig,
                          AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          GenerationConfig,
                          pipeline)
# Alias of the pipeline factory that stays valid even after the name
# ``pipeline`` is rebound to a pipeline instance further down.
from transformers import pipeline as build_pipeline

In [2]:
# Project root: the parent of the current working directory, as an absolute path.
ROOT = Path.cwd().resolve().parent

In [3]:
# Read environment variables from config.env in the project root
# (presumably including HF_TOKEN, which is looked up below - confirm).
load_dotenv(dotenv_path=ROOT / 'config.env')

True

In [4]:
# Hugging Face checkpoint used below (alternative: "meta-llama/Meta-Llama-3-8B").
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct'
# Access token read from the environment; None when the variable is not set.
HF_TOKEN = os.getenv('HF_TOKEN')
# Folder that collects the generated answers; created if it does not exist yet.
SAVE_FOLDER = (ROOT / 'llama_answers_dataset_en_instruction').resolve()
SAVE_FOLDER.mkdir(parents=True, exist_ok=True)

In [5]:
# Property names whose values are rounded to whole numbers before prompting.
NUMERICAL_DATA = ['building_area', 'living_area', 'storeys_count', 'resident_number', 'population_balanced', 'lift_count', 'building_year']


def preprocess_context(data: Dict) -> Dict:
    """Return a copy of ``data`` with the numeric properties rounded to integers.

    For every entry of ``data['features']`` the properties listed in
    ``NUMERICAL_DATA`` are converted with ``round(float(value))``. The input
    object is never modified.

    Values that cannot be rounded (``None``, NaN, infinity, non-numeric text)
    and features whose ``properties`` entry is missing or empty are left
    untouched instead of aborting the whole preprocessing.

    Args:
        data: Mapping with a ``'features'`` list; each feature may carry a
            ``'properties'`` mapping.

    Returns:
        A deep copy of ``data`` with the rounded values.

    Raises:
        KeyError: If ``data`` has no ``'features'`` key.
    """
    preprocessed = deepcopy(data)
    for chunk in preprocessed['features']:
        properties = chunk.get('properties')
        if not properties:
            # A feature may have no properties at all (e.g. ``null``).
            continue
        for num_col in NUMERICAL_DATA:
            value = properties.get(num_col)
            if value is None:
                continue
            try:
                properties[num_col] = round(float(value))
            except (TypeError, ValueError, OverflowError):
                # float() rejects non-numeric input; round() rejects NaN
                # (ValueError) and infinity (OverflowError). Keep the value.
                continue
    return preprocessed
        


def get_prompt(question: str, context: Dict, *args, **kwargs) -> str:
    """Build a Llama 3 chat-format prompt from a question and its context.

    Args:
        question: User question, placed after the ``Вопрос:`` marker.
        context: Context shown to the model; it is interpolated into the
            prompt through its string representation.
        *args: Accepted and ignored.
        **kwargs: Optional overrides:
            ``system_prompt`` replaces the built-in persona text,
            ``additional_rules`` replaces the built-in numbered rules.

    Returns:
        The prompt text consisting of a system section, a user section and an
        opened assistant header.
    """
    persona = '''Your name is Larry, You are smart AI assistant, You have high experitce in field of city building, urbanistic and Structure of St. Petersburg.'''
    answering_rules = '''Answer the question following rules below. For answer you must use provided by user context.
    Rules:
    1. The answer should have three sentences maximum.
    2. Add a unit of measurement to an answer.
    3. If there are several organizations in the building, all of them should be mentioned in the answer.
    4. The building's address (street, house number, building) in the user's question should exactly match a building address from the context.
    5. For answer you should take only that infromation from context, which exactly match a building address (street, house number, building) from the user's question.
    6. If provided by user context for a given address has "null" or "None" for the property, it means the data about this property of the building is absent.
    7. In questions about building failure, 0 in the context's corresponding field means "no", and 1 - means "yes".
    8. If data for an answer is absent, answer that data was not provided or absent and mention for what field there was no data.
    9. If you do not know how to answer the questions, say so.
    10. Before give an answer to the user question, provide explanation. Mark the answer with keyword "ANSWER", and explanation with "EXPLANATION".
    11. You must use only provided information for the answer.
    12. Your answer should be in Russian language.'''
    system_prompt = kwargs['system_prompt'] if 'system_prompt' in kwargs else persona
    rules = kwargs['additional_rules'] if 'additional_rules' in kwargs else answering_rules

    # Every prompt line carries the same twelve-space indentation; the prompt
    # starts with an empty line and ends with an indentation-only line.
    pad = ' ' * 12
    prompt_lines = (
        '',
        f'{pad}<|begin_of_text|><|start_header_id|>system<|end_header_id|>',
        f'{pad}{system_prompt} {rules}<|eot_id|>',
        f'{pad}<|start_header_id|>user<|end_header_id|>',
        f'{pad}Контекст :{context} Вопрос: {question}<|eot_id|>',
        f'{pad}<|start_header_id|>assistant<|end_header_id|>',
        pad,
    )
    return '\n'.join(prompt_lines)


def save_answer_as_json(answer: Dict) -> None:
    """Write ``answer`` to a new ``llama_ans_<n>.json`` file inside ``SAVE_FOLDER``.

    The number starts at "count of JSON files in the folder + 1" and is
    increased until the file name is unused, so an existing answer is never
    overwritten when the numbering has gaps (e.g. after files were deleted).

    Args:
        answer: JSON-serialisable mapping to store.
    """
    f_pref = 'llama_ans'
    index = sum(1 for _ in SAVE_FOLDER.glob('*.json')) + 1
    target = SAVE_FOLDER / f'{f_pref}_{index}.json'
    while target.exists():
        index += 1
        target = SAVE_FOLDER / f'{f_pref}_{index}.json'
    with open(target, 'w', encoding='utf-8') as pth:
        # The file is UTF-8, so non-ASCII text is stored readably instead of
        # being escaped as \uXXXX sequences.
        json.dump(answer, pth, ensure_ascii=False)


def get_query(idx: int) -> Tuple[Dict, Dict]:
    """Load the questions and the buildings context of dataset chunk ``idx``.

    Both files are opened explicitly as UTF-8 so that the result does not
    depend on the platform's default text encoding.

    Args:
        idx: Chunk number used in the file names ``data_<idx>.json`` and
            ``buildings_part_<idx>.geojson`` below ``ROOT / 'data'``.

    Returns:
        A ``(questions, manual_context)`` pair: the parsed JSON file and the
        parsed GeoJSON file.

    Raises:
        FileNotFoundError: If one of the two files does not exist.
    """
    questions_path = Path(ROOT, 'data', 'datasets', f'data_{idx}.json')
    buildings_path = Path(ROOT, 'data', 'buildings', f'buildings_part_{idx}.geojson')
    with open(questions_path, encoding='utf-8') as json_data:
        questions = json.load(json_data)
    with open(buildings_path, encoding='utf-8') as buildings_data:
        manual_context = geojson.load(buildings_data)
    return questions, manual_context


def multi_ans(model: Any, amount: int = 10, **kwargs) -> None:
    """
    Generate and store answers for several dataset chunks.

    For every chunk index in ``range(amount)`` the questions and the buildings
    context are loaded with ``get_query``. Each selected question is turned into
    a prompt with ``get_prompt``, passed to ``model`` and the result is written
    with ``save_answer_as_json``.

    Args:
        model: Callable invoked as ``model(prompt, temperature=...)``; its result
            is read as ``result[0]["generated_text"]``.
        amount: Number of chunks to process, starting with chunk 0.
        **kwargs: ``temperature`` - value forwarded to ``model`` (default 0.5).
    """
    generation_temperature = kwargs.get('temperature', .5)
    for i in range(amount):
        queries, context = get_query(i)
        total_questions = list(queries.keys())
        # Only the first fifth of the question keys is processed.
        # NOTE(review): presumably meant to pick one of several variants per
        # question - confirm that the leading keys are the intended ones.
        selected_questions = total_questions[:len(total_questions) // 5]
        if not selected_questions:
            continue

        # The context is the same for every question of a chunk, so it is
        # preprocessed once per chunk instead of once per question.
        prepared_context = preprocess_context(context)
        for question_key in selected_questions:
            question_response_pair: Dict = queries[question_key]
            query, response = question_response_pair['query'], question_response_pair['response']

            # Form a prompt from query and context
            prompt = get_prompt(question=query, context=prepared_context)
            answer = model(prompt, temperature=generation_temperature)

            json_ans = {'query': query,
                        # Keep only the text after the last header marker.
                        'llama_answer': answer[0]["generated_text"].split("<|end_header_id|>")[-1],
                        'ideal_ans': response,
                        'chunk': i,
                        'question_number': question_key}
            save_answer_as_json(json_ans)
    saved_total = len(list(SAVE_FOLDER.glob("*.json")))
    print(f'Answers have been saved to {SAVE_FOLDER}. Amount: {saved_total}')


def get_question_context_ans_strategy(idx: int) -> Tuple[str, str, str]:
    """RAG simulation with a CSV dataset: fetch one question/context/answer row.

    The CSV columns are renamed to ``Question``, ``Context`` and ``Answer`` and
    the row labelled 0 is dropped, so the remaining rows are labelled from 1;
    ``idx`` is therefore shifted by one for the lookup.

    Args:
        idx: Zero-based position of the example among the remaining rows.

    Returns:
        A ``(question, context, answer)`` tuple taken from the selected row.

    Raises:
        KeyError: If there is no row labelled ``idx + 1``.
    """
    strategy_dataset = (
        pd.read_csv(ROOT / 'data' / 'strategy_questions.csv')
        .rename(columns={'Примеры 29.05': 'Question', 'Unnamed: 1': 'Context', 'Unnamed: 2': 'Answer'})
        .drop(0)
    )
    # Explicit label-based lookup of the three renamed columns.
    question, context, answer = strategy_dataset.loc[idx + 1, ['Question', 'Context', 'Answer']]
    return question, context, answer


def process_str(s: str) -> str:
    """Collapse each whitespace run (line breaks, tabs, spaces) into one space and trim both ends."""
    # ``\s`` already covers newline, tab and carriage return, so one pass is enough.
    return re.sub(r"\s+", " ", s).strip()

In [6]:
# 8-bit quantization settings; they only take effect if passed to the model loader.
quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
# The model weights are loaded with ``token=HF_TOKEN``; the same token is passed
# here so that config and tokenizer are fetched with the same credentials.
model_config = AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True, max_new_tokens=12000,
                                          force_download=True, token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # quantization_config=quantization_config, ## Uncomment, if quantization is required
    torch_dtype=torch.float16,
    device_map="auto",
    token=HF_TOKEN,
    trust_remote_code=True
)
# The generation config attribute is ``pad_token_id`` (singular). When the
# tokenizer defines no pad token, fall back to the EOS token id.
model.generation_config.pad_token_id = (
    tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
)
model.eval()
print('Model is ready.')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model is ready.


In [8]:
# Build the text-generation pipeline through the ``build_pipeline`` alias of the
# transformers factory. The result is still bound to ``pipeline`` for the cells
# below, but the factory is no longer shadowed, so this cell can be re-run.
pipeline = build_pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    max_length=12000,
    device_map='auto',
    )

In [9]:
# Batch run: generate answers for several dataset chunks (disabled; uncomment to run)
# multi_ans(pipeline, amount=12)

In [10]:
# The dataset file holds several variants of the questions; a single one is
# picked here by its position in the key list.
questions, manual_context = get_query(5)

question_id = list(questions)[25]
question_response_pair = questions[question_id]
manual_question = process_str(question_response_pair['query'])
target = process_str(question_response_pair['response'])

print(f'Question_id: {question_id}')
print(f'Question: {manual_question}')

Question_id: 51_3
Question: Какой тип проекта застройки дома по адресу "Санкт-Петербург, Шепетовская, 3"?


In [11]:
# Assemble the prompt from the selected question and the rounded buildings context.
query = get_prompt(
    question=manual_question,
    context=preprocess_context(manual_context),
)

In [12]:
answer = pipeline(query, temperature=.2)
# Keep only the text after the last header marker and normalise whitespace.
answer_edited = process_str(answer[0]["generated_text"].split("<|end_header_id|>")[-1])

# The prompt asks the model to mark its reply with "ANSWER" and "EXPLANATION".
# Splitting on the bare keyword also works when the colon is missing; the
# leftover ": " is stripped so that it does not show up in the printed answer.
answer_text = answer_edited.split("ANSWER")[-1].lstrip(": ")
explanation_text = answer_edited.split("ANSWER")[0]

print(f'● Question: {manual_question}')
print(f'● Estimated answer: {target}')
print(f'● Answer: {answer_text}')
print(f'● Explanation: {explanation_text}')

● Question: Какой тип проекта застройки дома по адресу "Санкт-Петербург, Шепетовская, 3"?
● Estimated answer: Для дома по адресу "Санкт-Петербург, Шепетовская, 3" данная информация отсутствует
● Answer: : Для дома по адресу "Санкт-Петербург, Шепетовская, 3" тип проекта застройки не указан, т.к. в контексте для этого поля указано "null".
● Explanation: EXPLANATION: The context provides information about several buildings in St. Petersburg, including their addresses, administrative units, and other characteristics. To answer the question, we need to find the building with the address "Санкт-Петербург, Шепетовская, 3" and check its "project_type" property. 


In [13]:
question, context, correct_ans = get_question_context_ans_strategy(0)
question = process_str(question)
context = process_str(context)
query = get_prompt(question=question, context=context)
answer = pipeline(query, temperature=.015)

# Keep only the text after the last header marker and normalise whitespace.
answer_edited = process_str(answer[0]["generated_text"].split("<|end_header_id|>")[-1])

# Split on the bare "ANSWER" keyword so a missing colon is tolerated, then strip
# the leftover ": " instead of printing it in front of the answer.
answer_text = answer_edited.split("ANSWER")[-1].lstrip(": ")
explanation_text = answer_edited.split("ANSWER")[0]

print(f'● Question: {question}')
print(f'● Expected response: {correct_ans}')
print(f'● Answer: {answer_text}')
print(f'● Explanation: {explanation_text}')

● Question: Какая средняя обеспеченность дошкольными учреждениями?
● Expected response: Средняя обеспеченность дошкольными учреждениями в конце 2012 года в целом по Санкт-Петербургу составила
95,65% (от числа детей в возрасте от 1 до 6 лет), средний показатель обеспеченности общеобразовательными
учреждениями - 102,74%. Дефицита потребности в общеобразовательных учреждениях в целом по городу не
отмечается. Однако в районах активной жилой застройки наблюдается дисбаланс обеспеченности, как по детским
садам, так и по школам. В Санкт-Петербурге сформирована высокоразвитая сфера дополнительного образования:
более 85% детей охвачены программами дополнительного образования.
● Answer:  Средняя обеспеченность дошкольными учреждениями в конце 2012 года в целом по Санкт-Петербургу составила 95,65% (от числа детей в возрасте от 1 до 6 лет).
● Explanation: EXPLANATION: В контексте предоставленной информации мы находим информацию о средней обеспеченности дошкольными учреждениями в Санкт-Петербурге. 