In [1]:
import time
import openai
from sentence_transformers import SentenceTransformer, util
import torch
from dotenv import load_dotenv
import os
import pandas as pd
import pandas.io.formats.style
from transformers import AutoModel, AutoTokenizer
import chromadb
import tiktoken
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from bs4 import BeautifulSoup
import requests
import re
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from typing import List
from langchain.prompts import ChatPromptTemplate
# from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
import torch.nn.functional as F
import sqlite3
from sentence_transformers import CrossEncoder






In [2]:
PATH_CHROMA = "../../chroma_db"
client = chromadb.PersistentClient(path=PATH_CHROMA)
collection = client.get_or_create_collection(name="interview_data")
embedding_model = "Snowflake/snowflake-arctic-embed-l-v2.0"
sentence_model = SentenceTransformer(embedding_model).to(torch.device("cuda"))
load_dotenv()
OPEN_AI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPEN_AI_API_KEY


In [None]:
# query = "append is a method used to add elements to the list in python. It adds element on the end of the list"
# query_wrong = "append is a method which adds element at the front of the list"

# results = collection.query(
#     query_embeddings=[sentence_model.encode(query, convert_to_numpy=True)],
#     n_results=3
# )

# query = "append is a method used to add elements to the list in python."
# query_wrong = "append which is used to print elements to the console"

# query_emb = sentence_model.encode(query, convert_to_tensor=True)
# query_wrong_emb = sentence_model.encode(query_wrong, convert_to_tensor=True)
# doc_embs = sentence_model.encode(results["documents"][0], convert_to_tensor=True)

# similarities = util.cos_sim(query_emb, doc_embs)
# similarities_wrong = util.cos_sim(query_wrong_emb, doc_embs)

# for i, (score, score_wrong) in enumerate(zip(similarities[0], similarities_wrong[0])):
#     print(f"Dokument {i+1}: Similarity = {score.item():.4f}, Similarity wrong = {score_wrong.item():.4f}")

In [3]:
class MetaDatas(BaseModel):
    difficulty: str
    type_question: str
    question: str

In [4]:
def get_ideal_answer(difficulty, type_question, question, user_answer):
    metadatas = MetaDatas(difficulty=difficulty, type_question=type_question, question=question)

    results = collection.query(
        query_embeddings=[sentence_model.encode(user_answer, convert_to_numpy=True)],
        n_results=3,
        where={
            "question": metadatas.question
        },
        include=["embeddings", "documents", "metadatas", "distances"]
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # answer_emb = sentence_model.encode(user_answer, convert_to_tensor=True).to(device)
    # doc_embs_list = results["embeddings"][0]
    # doc_embs = torch.tensor(doc_embs_list, dtype=torch.float32, device=device)
    # similarities = util.cos_sim(answer_emb, doc_embs)
    # best_index = torch.argmax(similarities).item()

    # ideal_answer = results["documents"][0][best_index]


    # return ideal_answer, similarities[0][best_index]

    ideal_answer = results["documents"][0][0]
    return ideal_answer, torch.tensor([1.0])


In [5]:
def get_data_about_answer(ideal_answer, cosine, question, answer, result):
    print("question: " + question)
    print("answer: " + answer)
    print("ideal_answer: " + ideal_answer)
    print(f"cosine: {cosine.item()}")
    print(f"grade: {result.grade}")
    print("explanation: " + result.explanation_of_grade)
    print("follow up: " + result.follow_up_question)

In [6]:
question = "What is catastrophic forgetting in fine-tuning LLMs?"
user_answer = "overfitting happens if model fits to well to training data"
ideal_answer, cosine_similarity = get_ideal_answer("Easy", "llm", question, user_answer)

In [7]:
print(ideal_answer, cosine_similarity)

It is a phenomenon where a model loses the ability to perform well on previously learned tasks after being fine-tuned on new tasks. tensor([1.])


In [8]:
SYSTEM_MESSAGE = "You are an interviewer who checks person's knowledge in LLM."


In [9]:
class MessageHistory(BaseModel):
    role: str
    content: str

class RedisMock:
    def __init__(self):
        self.messages = []
        self.introduction_message = []

    def add_message(self, role, content):
        message = MessageHistory(role=role, content=content)
        self.messages.append(message)

    def add_introduction_message(self, role, content):
        message = MessageHistory(role=role, content=content)
        self.introduction_message.append(message)

    def get_all_messages(self):
        return self.introduction_message + self.messages
    
    def get_history(self):
        return "\n".join(f"{m.role}: {m.content}" for m in self.get_all_messages())

    def get_recent_messages(self, limit=5):
        return self.get_all_messages()[-limit:]

    def print_last(self):
        print(f"{self.get_all_messages()[-2].role}: {self.get_all_messages()[-2].content}")
        print(f"{self.get_all_messages()[-1].role}: {self.get_all_messages()[-1].content}")

    def clear_messages(self):
        self.messages = []

    def __str__(self):
        history = ""
        for message in self.get_all_messages():
            history += f"{message.role}: {message.content} \n"

        return history
    


### 1. Start interview

### 1.1 Introduction

In [10]:
def build_prompt_with_instruction(
        history_obj: RedisMock,
        current_user_prompt: str,
        format_instruction: str,
        limit: int = 10,
) -> ChatPromptTemplate:
    chat: list[tuple[str, str]] = [("system", SYSTEM_MESSAGE)]

    recent = history_obj.get_recent_messages(limit=limit)
    chat.extend((m.role, m.content) for m in recent)

    chat.append(("user", current_user_prompt))
    chat.append(("system", format_instruction))

    return ChatPromptTemplate.from_messages(chat)

class StartInterviewModel(BaseModel):
    introduction: str

parser = PydanticOutputParser(pydantic_object=StartInterviewModel)
format_instructions_start_interview = parser.get_format_instructions().replace("{", "{{").replace("}", "}}")

redisMock = RedisMock()

START_INTERVIEW_INSTRUCTION = f"Return the result strictly in this JSON format: \n{format_instructions_start_interview}"

prompt_human = f"""
        Start interview by introducing yourself that you are interviewer in Interview Better company.
     """

redisMock.add_introduction_message("user", prompt_human)
prompt = build_prompt_with_instruction(redisMock, prompt_human, START_INTERVIEW_INSTRUCTION)
start_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1, api_key=OPEN_AI_API_KEY) 
chain = prompt | start_llm | parser
result = chain.invoke({})


redisMock.add_introduction_message("assistant", result.introduction)

In [157]:

# prompt_human = f"""
#         Start interview by introducing yourself that you are interviewer in Interview Better company.
#      """

# redisMock.add_introduction_message("user", prompt_human)
# prompt = build_prompt_with_instruction(redisMock, prompt_human, START_INTERVIEW_INSTRUCTION)
# start_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1, api_key=OPEN_AI_API_KEY) 
# chain = prompt | start_llm | parser
# result = chain.invoke({})


# redisMock.add_introduction_message("assistant", result.introduction)

In [11]:
print(redisMock)

user: 
        Start interview by introducing yourself that you are interviewer in Interview Better company.
      
assistant: Hello, my name is [Your Name], and I am an interviewer at Interview Better company. Today, I will be assessing your knowledge in the field of Large Language Models (LLMs). 



### 1.2 Finding question to ask in interview

In [2]:
PATH_DB = "../../documents.db"

def get_random_questions_by_type(search_question_type, limit=10):
    conn = sqlite3.connect(PATH_DB)
    cursor = conn.cursor()

    cursor.execute(f"""
        SELECT question FROM documents
        WHERE type_question LIKE ?
        ORDER BY RANDOM()
        LIMIT {limit};
    """, (f"%{search_question_type}%", ))

    results = cursor.fetchall()
    conn.close()

    return results

In [13]:
questions = [question[0] for question in get_random_questions_by_type("llm")]

In [None]:
get_random_questions_by_type("llm")

In [14]:
print(questions)

['How does Chain-of-Thought (CoT) prompting improve reasoning in LLMs?', 'How can LLMs mitigate catastrophic forgetting during fine-tuning?', 'How does gradient checkpointing reduce memory usage in training large LLMs?', 'How does prompt engineering influence LLM performance, and what strategies can be used to optimize it?', 'What are the key steps involved in the Retrieval-Augmented Generation (RAG) pipeline?', 'What is the role of contextual embeddings in LLMs, and how do they differ from static embeddings?', 'How does beam search improve upon greedy decoding in LLMs?', 'What is catastrophic forgetting in fine-tuning LLMs?', 'How does meta-learning benefit LLMs?', 'What is knowledge distillation, and how is it used in LLMs?']


### 2. Continue Interview

In [15]:
class GradedAnswer(BaseModel):
    grade: int = 0
    explanation_of_grade: str = ""
    follow_up_question: str = ""


parser_graded = PydanticOutputParser(pydantic_object=GradedAnswer)
format_instructions_graded = parser_graded.get_format_instructions().replace("{", "{{").replace("}", "}}")

In [16]:
FINAL_INSTRUCTION = f"""
Return the result strictly in this JSON format:

{format_instructions_graded}

Make sure the JSON object includes exactly these fields:
- "grade" (integer)
- "explanation_of_grade" (string)
- "follow_up_question" (string)""".strip()

EVALUATION_INSTRUCTION = """
Evaluate user's answer on the question from 1 to 10, based on the user answer, ideal answer and cosine similarity calculated between both.
If user's answer does not contain whole information about ideal answer, provide follow-up question to suggest what is missing in the answer.
If user's answer contain whole information provide in follow_up_question field: \"DONE\".
If user answers that he doesn't know, also put \"DONE\".
Grade user better if his voice emotion is positive.
Remember to keep the JSON format.
""".strip()


In [17]:
print(redisMock)

user: 
        Start interview by introducing yourself that you are interviewer in Interview Better company.
      
assistant: Hello, my name is [Your Name], and I am an interviewer at Interview Better company. Today, I will be assessing your knowledge in the field of Large Language Models (LLMs). 



In [None]:
class ConversationController:
    def __init__(self, model_name: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2', threshold: float = -1):
        self.reranker = CrossEncoder(model_name)
        self.threshold = threshold

        self.pending_follow_up: str | None = None
        self.missed_in_a_row: int = 0
        self.count_followups: int = 0

    def reset_conversation(self):
        self.missed_in_a_row = 0
        self.count_followups = 0
        self.pending_follow_up = None

    def register_assistant(self, follow_up_q: str):
        if follow_up_q == "DONE":
            self.pending_follow_up = None
            self.missed_in_a_row = 0
        else:
            self.pending_follow_up = follow_up_q
        
        self.count_followups += 1

    def register_user(self, user_ans: str) -> bool:
        if self.count_followups >= 5:
            return True

        if self.pending_follow_up:
            if not self._is_related(user_ans, self.pending_follow_up):
                self.missed_in_a_row += 1
            else:
                self.pending_follow_up = None
                self.missed_in_a_row = 0

        if self.missed_in_a_row >= 2:
            self.pending_follow_up = None
            self.missed_in_a_row = 0
            return True 
        return False

    def _is_related(self, user_ans: str, follow_up_q: str) -> bool:
        score = self.reranker.predict([(follow_up_q, user_ans)])[0]
        print(user_ans, follow_up_q)
        print(score)
        print(f"Re-ranker score: {score:.3f}")
        return score >= self.threshold
    


In [20]:
# controller = ConversationController()



In [21]:
# question = "Do you need a vector store for all text-based LLM use cases?"
# user_answer = "A"

# controller.register_assistant(question)
# controller.register_user(user_answer)



In [22]:
# controller.missed_in_a_row

In [19]:
def add_grade_information(graded_result: GradedAnswer):
    return f"""
        grade: {graded_result.grade}
        explanation of this grade: {graded_result.explanation_of_grade}
        follow up question: {graded_result.follow_up_question}
    """


def conversate_llm(
        question: str,
        follow_up_question: str | None,
        user_answer: str,
        ideal_answer: str,
        emotion: str,
        history_limit: int = 10
) -> GradedAnswer:
    key_information = (
        f"Original question: {question}\n"
        f"Follow up question (if is not None then user answers to this question): {follow_up_question}\n"
        f"User answer: {user_answer}\n"
        f"Ideal answer: {ideal_answer}\n"
        f"User's emotion: {emotion}\n"
    )
    redisMock.add_message("user", key_information)
    current_prompt_human = f"{EVALUATION_INSTRUCTION}\n\n{key_information}"

    prompt = build_prompt_with_instruction(
        history_obj=redisMock,
        current_user_prompt=current_prompt_human,
        format_instruction=FINAL_INSTRUCTION,
        limit=history_limit,
    )

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1, api_key=OPEN_AI_API_KEY)
    chain = prompt | llm | parser_graded
    graded: GradedAnswer = chain.invoke({})

    redisMock.add_message("assistant", add_grade_information(graded))

    return graded

In [20]:
class FinalGrade(BaseModel):
    grade: int = 0
    feedback: str = ""


parser_final_grade = PydanticOutputParser(pydantic_object=FinalGrade)
format_instructions_graded = parser_final_grade.get_format_instructions().replace("{", "{{").replace("}", "}}")

SUMMARIZE_INSTRUCTION = f"""You summarize interview process. 
You want to give user feedback about the interview process
User wants to get feedback about what he should learn to become better at interview process
You will be given entire process of interview asking question and user answering it.
User's first answer is directly answering to Original question and then he is answering follow up questions.
You will have access to grade of assistant and explanations of his grade.
In field grade provide final grade,
In feedback provide whole feedback about user responses.

{format_instructions_graded}
""".strip()

def create_final_grade() -> FinalGrade:
    messages = redisMock.get_history()
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", "{instruction}\n\n{history}")
    ])
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1, api_key=OPEN_AI_API_KEY)
    chain = prompt_template | llm | parser_final_grade
    graded: FinalGrade = chain.invoke({
        "instruction": SUMMARIZE_INSTRUCTION,
        "history": messages
    })
    redisMock.clear_messages()

    return graded


In [41]:
# question = "Do you need a vector store for all text-based LLM use cases?"

# user_answer = "It is used to prevent overfitting in algorithm"
# ideal_answer, cosine_similarity = get_ideal_answer("Easy", "llm", question, user_answer)
# emotion = "happy"

# graded_result = conversate_llm(question, None, user_answer, ideal_answer, emotion)

# result = chain.invoke({
#     "question": question,
#     "user_answer": user_answer,
#     "ideal_answer": ideal_answer,
#     "emotion": "happy",
# })

# redisMock.print_last()

In [42]:
# print(redisMock)

In [22]:
# question = "Do you need a vector store for all text-based LLM use cases?"
# user_answer = "For chatbots or conversational agents that rely on predefined rule-based responses, a vector store might not be necessary. It is useful if we want our llm to provide answers based on some specific documents. As an alternative for vector databases we can use decision trees. It is very useful for retrieving information from a large text corpus because we can semantically find best answer for our prompts instead of using basic text NLP algorithms"
# ideal_answer, cosine_similarity = get_ideal_answer("Easy", "llm", question, user_answer)
# emotion = "happy"

# graded_result = conversate_llm(question, user_answer, ideal_answer, emotion)

# result = chain.invoke({
#     "question": question,
#     "user_answer": user_answer,
#     "ideal_answer": ideal_answer,
#     "emotion": "happy",
    
# })
controller = ConversationController()
emotion = "happy"

for question in questions:
    original_question = question
    graded_answer = GradedAnswer()
    controller.reset_conversation()
    controller.register_assistant(question)
    user_answer = input(question)
    
    if user_answer == "":
        break


    if controller.register_user(user_answer):
        break

    ideal_answer, cosine_similarity = get_ideal_answer("Easy", "llm", question, user_answer)
    graded_answer = conversate_llm(question, None, user_answer, ideal_answer, emotion)

    print(f"feedback: {graded_answer.explanation_of_grade}")
    print(f"followup question: {graded_answer.follow_up_question}")

    follow_up_active = True
    should_break = False
    follow_up_question = graded_answer.follow_up_question

    while follow_up_active:
        controller.register_assistant(follow_up_question)
        user_answer = input(follow_up_question)
        
        if user_answer == "":
            should_break = True
            break

        if controller.register_user(user_answer):
            break

        graded_answer = conversate_llm(original_question, follow_up_question, user_answer, ideal_answer, emotion)

        print(f"grade: {graded_answer.grade} feedback: {graded_answer.explanation_of_grade}")
        print(f"followup question: {graded_answer.follow_up_question}")

        if graded_answer.follow_up_question.strip().upper() == "DONE":
            follow_up_active = False
            print("-----------------------------\n")
        else:
            follow_up_question = graded_answer.follow_up_question

    print("Finished asking follow ups")
    finalGrade: FinalGrade = create_final_grade()
    print(f"This is final grade: {finalGrade.grade}")
    print(f"Interviewer feedback: {finalGrade.feedback}")
    print(50 * "-")

    if should_break == True:
        break




It improves LLMs by specifying them how should it think, so by that model knew what the user specifically wanted How does Chain-of-Thought (CoT) prompting improve reasoning in LLMs?
-0.18216534
Re-ranker score: -0.182
feedback: The user's answer captures the essence of CoT prompting by mentioning that it specifies how the model should think. However, it lacks details about generating intermediate steps, understanding logical flow, and identifying errors in reasoning, which are crucial aspects of the ideal answer.
followup question: Can you explain how generating intermediate steps helps in understanding the logical flow of information?
If model knew step by step what it should do, so it would do specifically what user wanted Can you explain how generating intermediate steps helps in understanding the logical flow of information?
-10.359896
Re-ranker score: -10.360
Finished asking follow ups
This is final grade: 6
Interviewer feedback: Your answer provided a basic understanding of how C

In [218]:
get_data_about_answer(ideal_answer, cosine_similarity, question, user_answer, graded_result)

question: Do you need a vector store for all text-based LLM use cases?
answer: For chatbots or conversational agents that rely on predefined rule-based responses, a vector store might not be necessary. It is useful if we want our llm to provide answers based on some specific documents. As an alternative for vector databases we can use decision trees. It is very useful for retrieving information from a large text corpus because we can semantically find best answer for our prompts instead of using basic text NLP algorithms
ideal_answer: For chatbots or conversational agents that rely on predefined rule-based responses, a vector store might not be necessary. Instead, these systems often use decision trees or intent recognition to guide responses. Nonetheless, if the chatbot needs to retrieve information from a large text corpus dynamically, integrating a vector store could improve its capabilities.
cosine: 1.0
grade: 8
explanation: The user's answer contains most of the key points from th

In [219]:
print(redisMock)

user: 
        Start interview by introducing yourself that you are interviewer in Interview Better company.
      
assistant: Hello, I am the interviewer from Interview Better company. Today, I will be assessing your knowledge in large language models. 
user: 
        Evaluate user's answer on the question from 1 to 10, based on the user answer, ideal answer and cosine similarity calculated between both.
        If user's answer does not contain whole information about ideal answer, provide follow-up question to suggest what is missing in the answer.
        If user's answer contain whole information provide in follow_up_question field: "DONE"
        Grade user better if his voice emotion is positive.
        Remember to keep the JSON format.
    
        Question: Do you need a vector store for all text-based LLM use cases?
        User answer: For chatbots or conversational agents that rely on predefined rule-based responses, a vector store might not be necessary. It is useful if w

### 3. Finalize Interview