### Quickly review outputs of current model to find good samples for an eval dataset

In [1]:
from anki_ai.domain.model import Deck
import pandas as pd
from collections import defaultdict
from anki_ai.domain.model import Note
from anki_ai.service_layer.services import get_chat_completion, ChatCompletionService, strip_tags
from typing import cast
from tqdm import tqdm
from pydantic import BaseModel


In [2]:
# Deck holding the notes to be reviewed; it is filled from a text file in the next cell.
deck = Deck()

In [3]:
# Load the notes under review from the text export.
# NOTE(review): presumably the model-rewritten deck, since a later cell compares
# it against "Selected Notes v7.txt" as the original — confirm.
deck.read_txt("../data/new_deck.txt")

In [4]:
def missing_br_tag(text: str) -> bool:
    """Heuristically flag text whose code block lacks ``<br>`` line breaks.

    Not precise by any means and needs improvement: it only checks whether a
    triple-backtick fence and a ``<br>`` tag each appear *somewhere* in the
    text, not whether the tag is inside the fenced block.

    Args:
        text: Note field content to inspect.

    Returns:
        True if ``text`` contains a triple-backtick fence but no ``<br>`` tag,
        False otherwise (including when there is no code fence at all).
    """
    # Always return a real bool; text without a code fence is simply "not missing".
    return "```" in text and "<br>" not in text

In [5]:
# Load the original notes so each reviewed note can be shown next to its source.
orig_deck = Deck()
orig_deck.read_txt("../data/Selected Notes v7.txt")

# How many notes have been seen so far for each distinct tag combination.
tag_counter = defaultdict(int)

for note in deck[:100]:
    tags_str = ",".join(note.tags)
    tag_counter[tags_str] += 1

    # Print at most two notes per tag combination to add some variety.
    if tag_counter[tags_str] >= 3:
        continue

    # Look the original up by GUID and take the first match.
    orig_note = orig_deck.get(note.guid)[0]
    print(f"Note GUID: {note.guid}\n")
    print(f"Front: {orig_note.front}\nBack:  {orig_note.back}\nTags:  {orig_note.tags}\n")
    print(f"Front: {note.front}\nBack:  {note.back}\nTags:  {note.tags}\n")
    print("###################\n")

Note GUID: D?H@y-%%r

Front: "<img src=""paste-d0ff77498ff8dde85ba00ae8b7c4bb6032d8483d.jpg"">"
Back:  Headboard&nbsp;
Tags:  ['english']

Front: Headboard
Back:  Headboard
Tags:  ['english']

###################

Note GUID: IjfKk}wnb@

Front: "<img src=""paste-334a3566ffa4cab66033c10810e8d06af8fda194.jpg"">"
Back:  Towel
Tags:  ['english']

Front: 
Back:  Towel
Tags:  ['english']

###################

Note GUID: Azd65{j+,q

Front: Command to create a soft link
Back:  ```bash<br>$ ln -s &lt;file&gt; &lt;link&gt;<br>```
Tags:  ['linux']

Front: Create soft link
Back:  ```bash<br>$ ln -s <file> <link><br>```
Tags:  ['linux']

###################

Note GUID: BGL!8$wV<W

Front: In the `ln -s` command, what is the order of file name and link name?
Back:  ```bash<br>$ ln -s &lt;file_name&gt; &lt;link_name&gt;<br>```
Tags:  ['linux']

Front: `ln -s` argument order
Back:  File name, then link name
Tags:  ['linux']

###################

Note GUID: uB?^NlWGiZ

Front: Command to view (but not cha

Let's create an LLM judge that can identify these types of errors for us.

In [6]:
# System prompt for the LLM judge: formatting requirements followed by few-shot
# examples of well- and badly-formatted notes. Kept as a raw string so the
# backslashes in the examples reach the model verbatim.
# NOTE(review): the "good" examples contain raw "<file>"/"<path>" although the
# requirements ask for "&lt;" — confirm which form the judge should enforce.
# NOTE(review): bad Example 3's reasoning is about content rather than
# formatting, which sits oddly with "Only check formatting" — confirm intent.
SYSTEM_MSG = r"""
Your job is to evaluate Anki notes, and classify notes that are not formatted correctly.

Requirements:
* Only check formatting
* Notes should be in HTML format; for instance: newline should be "<br>", "<" should be "&lt;", etc.
* Preserve images and media on the original note
* Use code block: ```<language><br><command><br>```
* Use inline code format for very short commands: `iw`, `d`, etc.

Examples of good notes:

Example 1:

    Front: Create soft link
    Back:  ```bash<br>$ ln -s <file> <link><br>```
    Tags:  ['linux']

Example 2:

    Front: Zip destination option
    Back:  ```bash<br>$ unzip <file> -d <path><br>```
    Tags:  ['linux']

Example 3:

    Front: Extract zip files
    Back:  ```bash<br>$ unzip <file><br>```
    Tags:  ['linux']

Example 4:

    Front: List directory content
    Back:  ```bash<br>$ ls <path><br>```
    Tags:  ['linux']

Examples of bad notes: 

Example 1:

    Front: Return to previous directory
    Back:  ```bash $ cd -```
    Tags:  ['linux']

    Reasoning: Missing newlines (<br> tags) in code block

Example 2: 

    Front: Remove delimiters
    Back:  ```ds <delimiter>```
    Tags:  ['nvim']

    Reasoning: Using triple backtick quotes without specifying the language and adding newlines (<br> tag) in code block

Example 3: 

    Front: Change Anki delimiters
    Back:  ```\
    Tags:  ['nvim']
    
    Reasoning: Mentioning the command is an Anki command when, in fact, it's a nvim command

Example 4: 

    Front: Text object for a sentence
    Back:  ```\
    Tags:  ['nvim']
    
    Reasoning: Missing command and not closing code block
"""

In [7]:


def review_note(note: Note, chat: ChatCompletionService) -> None:
    """Ask the LLM judge for a free-text review of one note and print it.

    The note's front, back and tags are sent as the user message together with
    ``SYSTEM_MSG`` as the system message. The note and the judge's raw reply
    are printed; nothing is returned.

    Note that a later cell defines another ``review_note`` (structured output);
    once that cell runs it shadows this definition.

    Args:
        note: Note to review; its ``front``, ``back`` and ``tags`` are read.
        chat: Chat-completion client used to query the judge model.
    """
    user_msg = f"""Front: {note.front}\nBack: {note.back}\nTags: {note.tags}"""

    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": user_msg},
    ]

    chat_response = chat.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=messages,  # type: ignore
        temperature=0,
    )
    # `cast` only informs the type checker; the content is not validated at runtime.
    result: str = cast(str, chat_response.choices[0].message.content)

    print(user_msg)
    print(f"Eval: {result}\n")
    print("#######################\n")

In [8]:
# Spot-check the free-text judge on the first ten notes of the deck;
# each call prints the note followed by the judge's reply.
chat = get_chat_completion()
for note in deck[:10]:
    review_note(note, chat)

Front: Headboard
Back: Headboard
Tags: ['english']
Eval: This note is not formatted correctly.

Reasoning: The front and back of the note should contain different information. The front should be a question or a prompt, and the back should be the answer or the information to be remembered. In this case, both the front and back contain the same information, which is "Headboard". 

Additionally, the note is missing a code block or any other relevant information that would make it useful for memorization. 

Corrected note:

Front: What is a headboard?
Back:  ```html<br>A headboard is a piece of furniture that is placed at the head of a bed.<br>```
Tags:  ['english']

#######################

Front: 
Back: Towel
Tags: ['english']
Eval: This note is not formatted correctly.

Reasoning: 
- The front and back of the note should be in HTML format, with the front being a question or a prompt and the back being the answer or the information to be remembered.
- The back of the note is a single wo

In [20]:

class Review(BaseModel):
    """Structured verdict for a single note, parsed from the judge's JSON reply."""

    # True when the judge deems the note acceptable (the system prompt asks it
    # to check formatting only).
    is_correct: bool
    # The judge's explanation for its verdict.
    reasoning: str

def review_note(note: Note, chat: ChatCompletionService, verbose: bool = False) -> Review:
    """Ask the LLM judge for a structured review of one note.

    The note's front, back and tags are sent as the user message together with
    ``SYSTEM_MSG`` as the system message. The reply is constrained to the JSON
    schema of ``Review`` and parsed into a ``Review`` instance.

    Args:
        note: Note to review; its ``front``, ``back`` and ``tags`` are read.
        chat: Chat-completion client used to query the judge model.
        verbose: When True, print the note and the parsed review.

    Returns:
        The parsed ``Review`` (``is_correct`` flag plus ``reasoning``).

    Raises:
        pydantic.ValidationError: If the reply is not valid JSON matching
            the ``Review`` schema.
    """
    user_msg = f"""Front: {note.front}\nBack: {note.back}\nTags: {note.tags}"""

    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": user_msg},
    ]
    # Extra request options asking the server to constrain the output to the
    # Review schema. NOTE(review): these look like vLLM guided-decoding
    # parameters — confirm against the serving backend.
    extra_body = {
        "guided_json": Review.model_json_schema(),
        "guided_whitespace_pattern": r"[\n\t ]*",
    }

    chat_response = chat.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=messages,  # type: ignore
        temperature=0,
        extra_body=extra_body,
    )
    # `cast` only informs the type checker; validation happens on the next line.
    json_data: str = cast(str, chat_response.choices[0].message.content)
    result = Review.model_validate_json(json_data)

    if verbose:
        print(user_msg)
        print(f"Eval: {result}\n")
        print("#######################\n")

    return result

In [79]:
# Run the structured judge over (up to) the first `n` notes and report the
# share it marks as correct.
chat = get_chat_completion()
correct_cnt = 0
reviewed_cnt = 0  # notes actually judged; can be smaller than n for a short deck

n = 1_000
for note in tqdm(deck[:n]):
    result = review_note(note, chat)
    reviewed_cnt += 1

    if result.is_correct:
        correct_cnt += 1

# Divide by the number of notes really reviewed rather than the requested cap,
# so a deck with fewer than `n` notes does not deflate the percentage.
if reviewed_cnt:
    print(f"{correct_cnt/reviewed_cnt:.2%} correct")
else:
    print("No notes were reviewed")

100%|█████████████████████████████████████████████| 1000/1000 [12:12<00:00,  1.36it/s]

45.50% correct





### Todo

- [ ] Align LLM judge with human preference
- [ ] Use a _reflection_ agentic workflow to improve notes