### Quickly review outputs of current model to find good samples for an eval dataset

In [1]:
import json
import random
import re
from collections import defaultdict
from json.decoder import JSONDecodeError
from typing import cast

import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm

from anki_ai.domain.model import Deck, Note
from anki_ai.service_layer.services import (
    ChatCompletionService,
    get_chat_completion,
    replace_br_with_newline,
)

In [2]:
# Deck holding the notes under review; `read_txt` is called on it in the next cell.
deck = Deck()

In [3]:
# Read the notes to review from a text file; the path is relative to the working directory.
deck.read_txt("../data/new_deck.txt")

In [4]:
def missing_br_tag(text):
    """Heuristically flag text that uses a code fence but contains no ``<br>`` tag.

    Not precise by any means and needs improvement: both checks are plain
    substring tests over the whole text, so a ``<br>`` anywhere (even outside
    the code block) makes the text pass.

    Args:
        text: Content of one note field.

    Returns:
        bool: True when the text contains a triple-backtick fence and no
        ``<br>``; False otherwise, including when there is no fence at all.
    """
    return "```" in text and "<br>" not in text

In [5]:
# Second deck, read from the "Selected Notes v7" file; each reviewed note is
# looked up in it by GUID so both versions can be printed one after the other.
orig_deck = Deck()
orig_deck.read_txt("../data/Selected Notes v7.txt")

# Number of notes seen so far for each distinct combination of tags.
tag_counter = defaultdict(int)

for note in deck[:100]:
    tags_str = ",".join(note.tags)
    tag_counter[tags_str] += 1

    # add some variety: only the first two notes of each tag combination are shown
    if tag_counter[tags_str] >= 3:
        continue

    orig_note = orig_deck.get(note.guid)[0]
    print(f"Note GUID: {note.guid}\n")
    # Same layout for both versions: the note from `orig_deck` first, then the one from `deck`.
    for shown in (orig_note, note):
        print(f"Front: {shown.front}\nBack:  {shown.back}\nTags:  {shown.tags}\n")
    print("###################\n")

Note GUID: D?H@y-%%r

Front: "<img src=""paste-d0ff77498ff8dde85ba00ae8b7c4bb6032d8483d.jpg"">"
Back:  Headboard&nbsp;
Tags:  ['english']

Front: Headboard
Back:  Headboard
Tags:  ['english']

###################

Note GUID: IjfKk}wnb@

Front: "<img src=""paste-334a3566ffa4cab66033c10810e8d06af8fda194.jpg"">"
Back:  Towel
Tags:  ['english']

Front: 
Back:  Towel
Tags:  ['english']

###################

Note GUID: Azd65{j+,q

Front: Command to create a soft link
Back:  ```bash<br>$ ln -s &lt;file&gt; &lt;link&gt;<br>```
Tags:  ['linux']

Front: Create soft link
Back:  ```bash<br>$ ln -s <file> <link><br>```
Tags:  ['linux']

###################

Note GUID: BGL!8$wV<W

Front: In the `ln -s` command, what is the order of file name and link name?
Back:  ```bash<br>$ ln -s &lt;file_name&gt; &lt;link_name&gt;<br>```
Tags:  ['linux']

Front: `ln -s` argument order
Back:  File name, then link name
Tags:  ['linux']

###################

Note GUID: uB?^NlWGiZ

Front: Command to view (but not cha

Let's create an LLM judge that can identify these kinds of errors for us.

In [6]:
# System prompt for the LLM judge: formatting requirements followed by
# few-shot examples of good and bad notes.
# NOTE(review): the "good" examples contain raw `<file>`-style placeholders
# although the requirements ask for "<" to be written as "&lt;" -- confirm
# which form is intended.
# NOTE(review): bad examples 3 and 4 have a `Back` that is only "```\", and the
# reasoning of example 3 is about content rather than formatting -- these look
# truncated; verify against the intended notes.
SYSTEM_MSG = r"""
Your job is to evaluate Anki notes, and classify notes that are not formatted correctly.

Requirements:
* Only check formatting
* Notes should be in HTML format; for instance: newline should be "<br>", "<" should be "&lt;", etc.
* Preserve images and media on the original note
* Use code block: ```<language><br><command><br>```
* Use inline code format for very short commands: `iw`, `d`, etc.

Examples of good notes:

Example 1:

    Front: Create soft link
    Back:  ```bash<br>$ ln -s <file> <link><br>```
    Tags:  ['linux']

Example 2:

    Front: Zip destination option
    Back:  ```bash<br>$ unzip <file> -d <path><br>```
    Tags:  ['linux']

Example 3:

    Front: Extract zip files
    Back:  ```bash<br>$ unzip <file><br>```
    Tags:  ['linux']

Example 4:

    Front: List directory content
    Back:  ```bash<br>$ ls <path><br>```
    Tags:  ['linux']

Examples of bad notes: 

Example 1:

    Front: Return to previous directory
    Back:  ```bash $ cd -```
    Tags:  ['linux']

    Reasoning: Missing newlines (<br> tags) in code block

Example 2: 

    Front: Remove delimiters
    Back:  ```ds <delimiter>```
    Tags:  ['nvim']

    Reasoning: Using triple backtick quotes without specifying the language and adding newlines (<br> tag) in code block

Example 3: 

    Front: Change Anki delimiters
    Back:  ```\
    Tags:  ['nvim']
    
    Reasoning: Mentioning the command is an Anki command when, in fact, it's a nvim command

Example 4: 

    Front: Text object for a sentence
    Back:  ```\
    Tags:  ['nvim']
    
    Reasoning: Missing command and not closing code block
"""

In [7]:
def review_note(note: Note, chat: ChatCompletionService) -> None:
    """Ask the LLM judge for a free-text evaluation of one note and print it.

    This first version only prints; nothing is returned. It is redefined later
    in the notebook by a `review_note` that requests structured output.

    Args:
        note: Note whose `front`, `back` and `tags` are sent to the judge.
        chat: Chat-completion client; its `create` method is called once.
    """
    user_msg = f"""Front: {note.front}\nBack: {note.back}\nTags: {note.tags}"""

    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": user_msg},
    ]

    chat_response = chat.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=messages,  # type: ignore
        temperature=0,  # same sampling setting on every call
    )
    result: str = cast(str, chat_response.choices[0].message.content)

    print(user_msg)
    print(f"Eval: {result}\n")
    print("#######################\n")

In [8]:
# Smoke-test the judge: print its evaluation for the first ten notes of the deck.
chat = get_chat_completion()
for note in deck[:10]:
    review_note(note, chat)

Front: Headboard
Back: Headboard
Tags: ['english']
Eval: This note is not formatted correctly.

Reasoning: The front and back of the note should contain different information. The front should be a question or a prompt, and the back should be the answer or the information to be remembered. In this case, both the front and back contain the same information, which is "Headboard". 

Additionally, the note is missing a code block or any other relevant information that would make it useful for memorization. 

Corrected note:

Front: What is a headboard?
Back:  ```html<br>A headboard is a piece of furniture that is placed at the head of a bed.<br>```
Tags:  ['english']

#######################

Front: 
Back: Towel
Tags: ['english']
Eval: This note is not formatted correctly.

Reasoning: 
- The front and back of the note should be in HTML format, with the front being a question or a prompt and the back being the answer or the information to be remembered.
- The back of the note is a single wo

In [9]:
# Structured verdict for one note. `review_note` passes this model's JSON schema
# to the chat API as `guided_json` and validates the reply against it.
# Documented with comments on purpose: the schema is generated from this class.
class Review(BaseModel):
    # GUID of the reviewed note; `review_note` overwrites whatever the model
    # produced here with `note.guid`.
    guid: str
    # Judge's verdict; the evaluation loop counts notes where this is True.
    is_correct: bool
    # Judge's explanation (presumably left empty for notes judged correct -- TODO confirm).
    reasoning: str


def review_note(
    note: Note, chat: ChatCompletionService, verbose: bool = False
) -> "Review | None":
    """Ask the LLM judge for a structured review of one note.

    The reply is constrained to the JSON schema of `Review` through the
    `guided_json` entry of `extra_body`.

    Args:
        note: Note whose `front`, `back` and `tags` are sent to the judge.
        chat: Chat-completion client; its `create` method is called once.
        verbose: When True, print the prompt and the parsed review.

    Returns:
        The validated `Review`, with `guid` set to `note.guid`, or None when
        the reply is not valid JSON (the decode error is printed).
    """
    user_msg = f"""Front: {note.front}\nBack: {note.back}\nTags: {note.tags}"""

    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": user_msg},
    ]
    extra_body = {
        "guided_json": Review.model_json_schema(),
        "guided_whitespace_pattern": r"[\n\t ]*",
    }

    chat_response = chat.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=messages,  # type: ignore
        temperature=0,
        extra_body=extra_body,
    )
    # The cast suggests `content` is typed as optional; a missing reply is
    # treated like an empty one so it takes the JSON-error path below.
    content_str: str = cast(str, chat_response.choices[0].message.content) or ""
    try:
        content_dict = json.loads(content_str)
    except JSONDecodeError as e:
        print(e)
        return None

    # The schema makes the model emit a `guid` too; replace it with the real one.
    content_dict["guid"] = note.guid
    result = Review.model_validate(content_dict)

    if verbose:
        print(user_msg)
        print(f"Eval: {result}\n")
        print("#######################\n")

    return result

In [10]:
chat = get_chat_completion()
correct_cnt = 0

n = 200  # upper bound on the number of notes sent to the judge
results = []

# Shuffle with a dedicated, seeded generator so the sample is the same on every run.
notes_shuffled = list(deck)
random.Random(42).shuffle(notes_shuffled)

skipped_cnt = 0
for note in tqdm(notes_shuffled[:n]):
    result = review_note(note, chat)
    if result is None:
        # review_note returns None when the judge's reply could not be parsed.
        skipped_cnt += 1
        continue
    results.append(result)

    if result.is_correct:
        correct_cnt += 1

# Divide by the number of reviews actually obtained: the deck may hold fewer
# than `n` notes and unparseable replies are excluded.
if results:
    print(f"{correct_cnt/len(results):.2%} correct")
else:
    print("No notes were reviewed")
if skipped_cnt:
    print(f"{skipped_cnt} notes skipped (unparseable judge reply)")

100%|██████████████████████████████████████████████| 200/200 [02:56<00:00,  1.13it/s]

56.50% correct





In [11]:
# One plain dict per review. `model_dump()` replaces the deprecated `.dict()`
# on pydantic v2 models; None entries (failed reviews) are ignored.
dict_data = [item.model_dump() for item in results if item is not None]
df_scores = pd.DataFrame(dict_data)
df_scores.head()

Unnamed: 0,guid,is_correct,reasoning
0,"H-u[y)GJ1,",False,Missing newlines (<br> tags) in code block
1,hQ8ss+MV9X,True,
2,o.bB)eu](<,True,
3,bB:Kyv*1Es,True,
4,DYW.*!vn5h,False,Missing newlines (<br> tags) in code block


In [12]:
# One record per note in the deck, as returned by each note's `dict()` method.
note_records = [note.dict() for note in deck]
df_notes = pd.DataFrame(note_records)
df_notes.head()

Unnamed: 0,guid,front,back,tags,notetype,deck_name
0,D?H@y-%%r,Headboard,Headboard,[english],KaTeX and Markdown Basic (Color),Default
1,IjfKk}wnb@,,Towel,[english],KaTeX and Markdown Basic (Color),Default
2,"""G1Z_~#;mLc""",,Jug,[english],KaTeX and Markdown Basic (Color),Default
3,"Azd65{j+,q",Create soft link,```bash<br>$ ln -s <file> <link><br>```,[linux],KaTeX and Markdown Basic (Color),Default
4,BGL!8$wV<W,`ln -s` argument order,"File name, then link name",[linux],KaTeX and Markdown Basic (Color),Default


In [13]:
# Attach each judge verdict to its note; the inner join on `guid` keeps only
# the notes that have a review.
x = df_notes.merge(df_scores, how="inner", on="guid")
# Keep only the notes whose tags do not include "life".
has_no_life_tag = x["tags"].apply(lambda note_tags: "life" not in note_tags)
x = x[has_no_life_tag]
print(x.shape)
x.head(50)

(196, 8)


Unnamed: 0,guid,front,back,tags,notetype,deck_name,is_correct,reasoning
0,z!DVV]7Ab_,Set file permissions flag,```bash<br>$ chmod <mode> <file><br>```,[linux],KaTeX and Markdown Basic (Color),Default,True,
1,Gh34afD.qb,Delay process,```bash<br>$ sleep <time><br>```,[linux],KaTeX and Markdown Basic (Color),Default,True,
2,"O`qIqf,Pdf",Return to previous directory,```bash<br>$ cd -<br>```,[linux],KaTeX and Markdown Basic (Color),Default,True,
3,t~<4Xq:P|Q,Cosine similarity formula,$ \\[cos(\theta) = {\mathbf{A} \cdot \mathbf{B...,"[leech, linear-algebra]",KaTeX and Markdown Basic (Color),Default,False,Missing code block and newlines (<br> tags) in...
4,"""D#(>QA!lKJ""",Manhattan distance norm,L1-norm,[math],KaTeX and Markdown Basic (Color),Default,True,
5,"""mV[I%O(Z#b""",Vector length via Euclidean norm,The length of a vector can be computed by usin...,[geometry],KaTeX and Markdown Cloze (Color),Default,False,Missing code block and newlines (<br> tags)
6,q[Gg^irw5-,Goal of boosting,Reduce bias and variance,[ml],KaTeX and Markdown Basic (Color),Default,True,
7,jn2![C{4rn,When probability is small,Odds ≈ Probability,[probability],KaTeX and Markdown Cloze (Color),Default,True,
8,rJ!PVSQ/>l,Remove delimiters,```ds <delimiter>```,[nvim],KaTeX and Markdown Basic (Color),Default,False,Using triple backtick quotes without specifyin...
9,"""E$=Vf#>Cq*""",Parentheses delimiter spacing,"`(`, `{`, `[`",[nvim],KaTeX and Markdown Basic (Color),Default,False,Missing newlines (<br> tags) in code block


In [14]:
def validate_interactive_session(session_text):
    """Check that a text looks like a Python interactive-session transcript.

    The first line must be a prompt line (``>>>`` or ``...``). Every later
    line is either another prompt line or an output line; output may be
    anything that does not start with a prompt marker.

    Args:
        session_text: Transcript with lines separated by ``"\\n"``; leading and
            trailing whitespace is stripped before checking.

    Returns:
        tuple[bool, str]: ``(True, "Valid interactive session format")`` on
        success, otherwise ``(False, message)`` where the message names the
        1-based number of the first offending line.
    """
    # A prompt line is the marker alone (e.g. the bare "..." closing a block)
    # or the marker followed by a space and the code.
    input_pattern = r"^>>>( .*)?$"
    # The dots are escaped: an unescaped "..." would match any three characters.
    continuation_pattern = r"^\.\.\.( .*)?$"
    output_pattern = r"^(?!>>>)(?!\.\.\.)"

    def is_prompt_line(line):
        return bool(
            re.match(input_pattern, line) or re.match(continuation_pattern, line)
        )

    lines = session_text.strip().split("\n")
    for i, line in enumerate(lines, 1):
        if is_prompt_line(line):
            continue
        if i == 1:
            # A session cannot start with output.
            return False, f"Line {i}: Expected input (>>> or ...), got: {line}"
        if not re.match(output_pattern, line):
            # Starts with a prompt marker but is malformed, e.g. ">>>x".
            return False, f"Line {i}: Invalid output format: {line}"

    return True, "Valid interactive session format"


def validate_hybrid_markdown(content):
    """Run rule-based formatting checks on one note field.

    The text is split into fenced code blocks (triple backticks) and the text
    around them. Dollar signs inside code blocks are never treated as LaTeX
    delimiters, so shell prompts such as ``$ ls`` are not flagged.

    Checks, in the order their messages are reported:
      1. a double backslash inside a ``$...$`` span outside code blocks
         (one message per offending span);
      2. a code block opened with ```` ```python ```` that is not a valid
         interactive session according to `validate_interactive_session`;
      3. an odd number of ``$`` outside code blocks;
      4. an odd number of triple-backtick delimiters.

    Args:
        content: Text of the note field.

    Returns:
        list[str]: One message per issue found; empty when nothing is flagged.
    """
    issues = []

    # The capturing group makes re.split keep the code blocks: they sit at the
    # odd indices, the text around them at the even indices.
    parts = re.split(r"(```[\s\S]*?```)", content)
    code_blocks = parts[1::2]
    text_outside_code = "".join(parts[0::2])

    # Check for double backslashes in LaTeX blocks
    for block in re.findall(r"\$(.*?)\$", text_outside_code, re.DOTALL):
        if "\\\\" in block:
            issues.append(
                "Double backslash (\\\\) found in LaTeX block. This may cause rendering issues."
            )

    # Python code blocks are checked as interactive sessions
    python_fence = "```python"
    for code_block in code_blocks:
        if not code_block.startswith(python_fence):
            continue
        # Drop the opening ```python and the closing ```
        session_content = code_block[len(python_fence) : -3].strip()
        is_valid, message = validate_interactive_session(
            replace_br_with_newline(session_content)
        )
        if not is_valid:
            issues.append(
                f"Invalid Python interactive session in code block: {message}"
            )

    # Check if the total number of dollar signs outside code blocks is odd
    if text_outside_code.count("$") % 2 != 0:
        issues.append(
            "Unmatched dollar signs outside code blocks. LaTeX may not render correctly."
        )

    # Check for common Markdown syntax errors
    if content.count("```") % 2 != 0:
        issues.append(
            "Unmatched code block delimiters (```). Code blocks may not render correctly."
        )

    # Add more checks as needed...

    return issues

In [15]:
n_reviews = 10

# Print the first `n_reviews` merged rows together with what the rule-based
# validator reports for each side of the note.
for _, note in x.iloc[:n_reviews].iterrows():
    print(f"Front: {note['front']}\nBack: {note['back']}\nTags: {note['tags']}")
    for side in ("front", "back"):
        issues = validate_hybrid_markdown(note[side])
        if not issues:
            print(f"Issue {side}: None")
            continue
        for issue in issues:
            print(f"Issue {side}: {issue}")
    print("\n")

Front: Set file permissions flag
Back: ```bash<br>$ chmod <mode> <file><br>```
Tags: ['linux']
Issue front: None
Issue back: None


Front: Delay process
Back: ```bash<br>$ sleep <time><br>```
Tags: ['linux']
Issue front: None
Issue back: None


Front: Return to previous directory
Back: ```bash<br>$ cd -<br>```
Tags: ['linux']
Issue front: None
Issue back: None


Front: Cosine similarity formula
Back: $ \\[cos(\theta) = {\mathbf{A} \cdot \mathbf{B} \over \|\mathbf{A}\| \|\mathbf{B}\|} \\$
Tags: ['leech', 'linear-algebra']
Issue front: None
Issue back: Double backslash (\\) found in LaTeX block. This may cause rendering issues.


Front: Manhattan distance norm
Back: L1-norm
Tags: ['math']
Issue front: None
Issue back: None


Front: Vector length via Euclidean norm
Back: The length of a vector can be computed by using the Euclidean norm.
Tags: ['geometry']
Issue front: None
Issue back: None


Front: Goal of boosting
Back: Reduce bias and variance
Tags: ['ml']
Issue front: None
Issue back:

### Todo

- [x] Create a helper function to validate code blocks and LaTeX formatting
- [ ] Create a dataset to measure LLM judge's alignment with human preference 
- [ ] Use _reflection_ agentic workflow to improve notes