In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM
import langchain 
from langchain.prompts import PromptTemplate

from langchain_huggingface import HuggingFacePipeline

from langchain_openai import ChatOpenAI

import time

## Данные

In [None]:
# Example submission file; it is copied later and filled with generated comments.
submit_example = pd.read_csv("submit_example.csv")

# Training split.
train_tests = pd.read_excel('train/tests.xlsx')
train_solutions = pd.read_excel('train/solutions.xlsx')
train_tasks = pd.read_excel('train/tasks.xlsx')

# Test split. All three workbooks are read from the same `test/` folder so they
# agree with `test/tasks.xlsx` and with the `test/solutions.xlsx` path used when
# the submission is generated.
# NOTE(review): assumes tests.xlsx and solutions.xlsx live next to tasks.xlsx — confirm.
test_tests = pd.read_excel('test/tests.xlsx')
test_solutions = pd.read_excel('test/solutions.xlsx')
test_tasks = pd.read_excel('test/tasks.xlsx')

## SambaNova

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the SambaNova key from Kaggle's secret storage instead of hardcoding it.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("SAMBANOVA_API_KEY")
api_key = secret_value_0

# The OpenAI-style chat client is pointed at the SambaNova endpoint via `base_url`.
llm = ChatOpenAI(
    model="Meta-Llama-3.1-405B-Instruct",
    base_url="https://api.sambanova.ai/v1/",
    api_key=api_key,
    streaming=True,
)


## Промпт и пайплайн

In [None]:
# Prompt template: task statement, reference solution, the student's wrong
# solution and four example teacher comments that set the expected answer style.
template = """Тебе будут даны УСЛОВИЕ ЗАДАЧИ, ОБРАЗЕЦ правильного решения, НЕПРАВИЛЬНОЕ РЕШЕНИЕ ученика, а также ПРИМЕРЫ комментариев преподавателя на другие неправильные решения. Ты должен выступить в роли учителя: проанализировать и прокомментировать ошибки в решении простой алгоритмической задачи. По стилю ответа придерживайся ПРИМЕРОВ комментариев преподавателя.

УСЛОВИЕ ЗАДАЧИ:
{task}

ОБРАЗЕЦ правильного решения:
{correct_example}

НЕПРАВИЛЬНОЕ РЕШЕНИЕ ученика:
{student_solution}

ПРИМЕРЫ комментария:
1.
{comment1}

2.
{comment2}

3.
{comment3}

4.
{comment4}

НИ ЗА ЧТО НЕ ВЫДАВАЙ ПРАВИЛЬНЫЙ ВАРИАНТ ОТВЕТА, ТОЛЬКО НАМЕКАЙ, КАК В ПРИМЕРАХ.
"""

# The declared variables must match the placeholders in the template exactly:
# the student's code is passed as `student_solution`, not `solution`.
prompt = PromptTemplate(
    input_variables=[
        "task",
        "correct_example",
        "student_solution",
        "comment1",
        "comment2",
        "comment3",
        "comment4",
    ],
    template=template,
)

# Chain: fill the template, then send the resulting prompt to the LLM.
llm_chain = prompt | llm

In [None]:
# Print every teacher comment from the training data together with its
# position, so rows can be hand-picked as examples for the prompt.
for i1, comment in enumerate(train_solutions['author_comment']):
    print(i1, '\n', comment)

## Запуск пайплайна

In [None]:
# Row labels in `train_solutions` of the teacher comments used as style examples.
FEW_SHOT_COMMENT_ROWS = (45, 40, 23, 34)
# Pause between consecutive LLM requests, in seconds
# (presumably to stay within the API rate limit — confirm).
REQUEST_PAUSE_SECONDS = 6

new_submit = submit_example.copy()

# The example comments are the same for every request, so read them once.
comment1, comment2, comment3, comment4 = (
    train_solutions.loc[row, 'author_comment'] for row in FEW_SHOT_COMMENT_ROWS
)

# Run the chain for every test solution.
# NOTE(review): row i of `new_submit` is assumed to correspond to row i of
# `test_solutions` — confirm against the submission format.
for i in range(len(test_solutions)):
    student_solution = test_solutions.loc[i, "student_solution"]
    task_id = test_solutions.loc[i, "task_id"]

    # Find the task once; columns 2 and 3 (by position) are used as the task
    # statement and the reference solution.
    task_rows = test_tasks.loc[test_tasks['id'] == task_id]
    task = task_rows.iloc[0, 2]
    correct_example = task_rows.iloc[0, 3]

    output = llm_chain.invoke({
        "task": task,
        'correct_example': correct_example,
        'student_solution': student_solution,
        'comment1': comment1,
        'comment2': comment2,
        'comment3': comment3,
        'comment4': comment4,
    })

    new_submit.loc[i, 'author_comment'] = output.content
    print(f'\n{i}.{output.content}')
    time.sleep(REQUEST_PAUSE_SECONDS)

In [None]:
# Display the submission frame after the generated comments were written into it.
new_submit

## Генерация сабмита
Модифицированный baseline

In [None]:
from typing import Callable

import torch
from transformers import BertModel, BertTokenizer

# Checkpoint used to turn comments into sentence embeddings.
model_name = "DeepPavlov/rubert-base-cased-sentence"

# Load the tokenizer and the encoder once; both are used as module-level
# objects by `get_sentence_embedding`.
print("Loading models...", end="")
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
print("OK")


def get_sentence_embedding(sentence: str) -> torch.Tensor:
    """Encode `sentence` with the module-level tokenizer and model.

    The text is tokenized with truncation at 128 tokens and passed through
    `model` without gradient tracking. The last-layer hidden state at token
    position 0 is returned with singleton dimensions squeezed out.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Keep only the vector at the first token position.
    first_token_vector = hidden_states[:, 0, :]
    return first_token_vector.squeeze()

In [None]:
def string2embedding(string: str) -> torch.Tensor:
    """Parse a whitespace-separated string of numbers into a 1-D tensor."""
    values = list(map(float, string.split()))
    return torch.Tensor(values)


def embedding2string(embedding: torch.Tensor) -> str:
    """Serialize a 1-D tensor as its values joined by single spaces."""
    return " ".join(map(str, embedding.tolist()))

In [None]:
def _get_cosine_similarity(pred_df: pd.DataFrame, true_df: pd.DataFrame) -> float:
    """Return the mean cosine similarity between predicted and reference embeddings.

    Rows are compared by position: row k of `pred_df` against row k of
    `true_df`. Both frames must have an "author_comment_embedding" column whose
    values are strings accepted by `string2embedding`.

    Args:
        pred_df: Frame with the predicted embeddings.
        true_df: Frame with the reference embeddings; its length sets the
            number of compared rows and the divisor of the mean.

    Returns:
        The sum of per-row cosine similarities divided by `len(true_df)`.

    Raises:
        ValueError: If a predicted and a reference embedding differ in length.
    """
    predictions = pred_df["author_comment_embedding"]
    true_values = true_df["author_comment_embedding"]
    total_cos_sim = 0.0

    for idx in range(len(true_values)):
        pred_value = string2embedding(predictions.iloc[idx])
        gt_value = string2embedding(true_values.iloc[idx])

        if len(pred_value) != len(gt_value):
            raise ValueError(f"Embeddings have different sizes: {len(pred_value)} != {len(gt_value)}")

        # Called through `torch.nn.functional` because the notebook never
        # imports a bare `cosine_similarity` name.
        cos_sim_value = torch.nn.functional.cosine_similarity(
            pred_value.unsqueeze(0), gt_value.unsqueeze(0)
        )
        # The result has a single element; accumulate it as a plain float.
        total_cos_sim += cos_sim_value.item()
    return float(total_cos_sim / len(true_df))


def calculate_score(submit_path: str, gt_path: str) -> float:
    """Score a submission file against a ground-truth workbook.

    Submission rows are matched to ground-truth rows by id
    (`solution_id` in the submission, `id` in the ground truth) and put in
    ground-truth order before the positional embedding comparison runs.

    Args:
        submit_path: Path to the submission CSV.
        gt_path: Path to the ground-truth Excel file.

    Returns:
        `(mean_cosine_similarity - 0.6) / 0.4`, so a mean similarity of 0.6
        maps to 0 and a mean similarity of 1.0 maps to 1.

    Raises:
        ValueError: If the submission repeats a `solution_id` or lacks a row
            for some ground-truth id.
    """
    submit_df = pd.read_csv(submit_path)
    true_df = pd.read_excel(gt_path)
    submit_df = submit_df[submit_df["solution_id"].isin(true_df["id"])]

    if submit_df["solution_id"].duplicated().any():
        raise ValueError("Submission contains duplicated solution_id values")

    # A left merge keeps the ground-truth key order, so row k of `aligned_df`
    # belongs to the same solution as row k of `true_df`.
    aligned_df = true_df[["id"]].merge(
        submit_df, how="left", left_on="id", right_on="solution_id"
    )
    missing = aligned_df["solution_id"].isna()
    if missing.any():
        raise ValueError(
            f"Submission is missing {int(missing.sum())} of {len(true_df)} ground-truth solutions"
        )

    return (_get_cosine_similarity(aligned_df, true_df) - 0.6) / 0.4

In [None]:
def generate_submit(test_solutions_path: str, save_path: str, use_tqdm: bool = True) -> None:
    """Build the submission CSV from the comments stored in `new_submit`.

    For every row of the test solutions workbook, the comment is taken from the
    module-level `new_submit` frame (row label i, column "author_comment"),
    embedded with `get_sentence_embedding` and serialized with
    `embedding2string`.

    Args:
        test_solutions_path: Path to the Excel file with test solutions; its
            first column is written out as `solution_id`.
        save_path: Destination path of the resulting CSV.
        use_tqdm: Show a tqdm progress bar while iterating.
    """
    test_solutions = pd.read_excel(test_solutions_path)
    bar = range(len(test_solutions))
    if use_tqdm:
        import tqdm

        bar = tqdm.tqdm(bar, desc="Predicting")

    # Collect rows in a list and build the frame once instead of growing a
    # DataFrame one row at a time.
    records = []
    for i in bar:
        idx = test_solutions.iloc[i, 0]
        # NOTE(review): assumes row i of `new_submit` matches row i of the
        # workbook read above — confirm.
        text = new_submit.loc[i, 'author_comment']
        embedding = embedding2string(get_sentence_embedding(text))
        records.append((idx, text, embedding))

    submit_df = pd.DataFrame(
        records, columns=["solution_id", "author_comment", "author_comment_embedding"]
    )
    submit_df.to_csv(save_path, index=False)

In [None]:
# Write the final submission (comments plus their embeddings) to new_submit.csv,
# with a progress bar enabled.
generate_submit('/kaggle/input/hse-ai-assistant-hack/test/solutions.xlsx', 'new_submit.csv',True)