In [1]:
import sys

from loguru import logger

from anki_ai.domain.model import Deck

In [2]:
logger.remove()
logger.add(sys.stderr, level="ERROR")

1

When exporting notes from Anki, select the following options:
- Notes in plain text (.txt)
- Include HTML and media references
- Include tags
- Include deck name
- Include notetype name
- Include unique identifier

In [3]:
deck = Deck("default")
deck.read_txt("../data/Selected Notes.txt")
print(f"The deck contains {len(deck)} notes")

The deck contains 2241 notes


One issue we need to handle is that our notes contain HTML and media reference, which can confuse the LLM when editing the note. Here is an example:

In [4]:
deck[0].back

'```bash<br>$ ln -s &lt;file_name&gt; &lt;link_name&gt;<br>```'

In [5]:
from html.parser import HTMLParser
from io import StringIO


class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()


def replace_br_with_newline(html_string):
    import re

    return re.sub(r"<br\s*/?>", "\n", html_string)


def strip_tags(html):
    s = MLStripper()
    s.feed(replace_br_with_newline(html))
    return s.get_data()

In [6]:
back_card = deck[0].back
print(f"Original note:\n{back_card}\n")
print(f"Fixed:\n{strip_tags(back_card)}")

Original note:
```bash<br>$ ln -s &lt;file_name&gt; &lt;link_name&gt;<br>```

Fixed:
```bash
$ ln -s <file_name> <link_name>
```


In [7]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "microsoft/Phi-3-medium-4k-instruct"
# model_id = "microsoft/Phi-3-small-8k-instruct"
# model_id = "mistralai/Mistral-7B-Instruct-v0.2"
# model_id ="Qwen/Qwen2-7BInstruct"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)

In [8]:
# ?model.generation_config

In [9]:
# system_msg = """You are an expert in building Anki cards.

# The user will share an existing Anki note with you. Your job is to edit it to be more concise, simple, straightforward, and distinct.

# Do not describe the changes you made. Reply in this format:

# Front: [front section of card 1]
# Back: [back section of card 1]

# When providing commands to be executed in a terminal, please prefix each command with the $ symbol. This helps distinguish executable commands from output or other text. For example:
# $ ls -l
# $ cd /path/to/directory
# $ python script.py"

# Note that the $ symbol represents the command prompt and should not be typed when actually executing the commands.
# """

# for i in range(10):
#     user_msg = f"""Front: {deck[i].front}
#     Back: {deck[i].back}
#     """

#     messages = [
#         {"role": "system", "content": system_msg},
#         {"role": "user", "content": user_msg},
#     ]

#     tokenized_chat = tokenizer.apply_chat_template(
#         messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
#     )
#     new_tokens = model.generate(tokenized_chat, max_new_tokens=150)
#     gen_text = tokenizer.batch_decode(
#         new_tokens[:, tokenized_chat.shape[1] :], skip_special_tokens=True
#     )[0]

#     print("#######################")
#     print(f"Front: {deck[i].front}\nBack: {deck[i].back}")
#     print(gen_text)

### Using vLLM

In [10]:
from vllm import LLM, SamplingParams

In [11]:
llm = LLM(
    model=model_id,
    enable_prefix_caching=False,
    gpu_memory_utilization=0.90,
    max_model_len=4096,
    # cpu_offload_gb=10,
    trust_remote_code=True,
)
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

INFO 08-27 22:26:31 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 08-27 22:26:32 model_runner

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 08-27 22:26:37 model_runner.py:890] Loading model weights took 14.9888 GB
INFO 08-27 22:26:38 gpu_executor.py:121] # GPU blocks: 2398, # CPU blocks: 2048
INFO 08-27 22:26:39 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-27 22:26:39 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-27 22:26:54 model_runner.py:1300] Graph capturing finished in 14 secs.


In [12]:
def generate(system_msg):
    for i in range(10):
        user_msg = f"""Front: {deck[i].front}
        Back: {strip_tags(deck[i].back)}
        """

        conversation = [
            {
                "role": "system",
                "content": system_msg,
            },
            {"role": "user", "content": user_msg},
        ]
        outputs = llm.chat(
            conversation, sampling_params=sampling_params, use_tqdm=False
        )

        print("#######################")
        print(f"Front: {deck[i].front}\nBack: {strip_tags(deck[i].back)}")
        print(outputs[0].outputs[0].text)

In [13]:
system_msg = """
Edit this Anki card:
- Make it concise, simple, distinct
- Follow formatting rules

Front: [edited front]
Back: [edited back]

Terminal commands:
```bash
$ command here
```

Code:
```language
code here
```

No explanations.

Example 1:
Front: What command does extract files from a zip archive?
Back: ```bash
$ unzip <file>
```
Front: Extract files from zip archive
Back: ```bash
$ unzip <file>

Example 2:
Front: What is the command to print manual or get help for a command?
Back: ```bash
$ man ...
```
Front: Print manual or get help for a command
Back: ```bash
$ man ...
```

Example 3: 
Front: What command does create a soft link?
Back: ```bash
$ ln -s <file_name> <link_name>
```
Front: Create a soft link
Back: 
```bash
$ ln -s <file> <link>
```

Example 4:
Front: In the `ln -s` command, what is the order of file name and link name?
Back: ```bash
$ ln -s <file_name> <link_name>
```
Front: ln -s command order of arguments
Back: <file> then <link>
"""

generate(system_msg)

#######################
Front: What command does create a soft link?
Back: ```bash
$ ln -s <file_name> <link_name>
```
Front: Create a soft link
Back: ```bash
$ ln -s
#######################
Front: In the `ln -s` command, what is the order of file name and link name?
Back: ```bash
$ ln -s <file_name> <link_name>
```
Front: ln -s command order of arguments
Back: <file> then
#######################
Front: What command does extract files from a zip archive?
Back: ```bash
$ unzip <file>
```
Front: Extract files from zip archive
Back: ```bash
$ unzip <
#######################
Front: What is the command to list the content of a directory?
Back: ```bash
$ ls <path>
```
Front: List directory content
Back: ```bash
$ ls <path>

#######################
Front: What is the command to print text to the terminal window?
Back: ```bash
$ echo ...
```
Front: Print text to terminal
Back: ```bash
$ echo ...
```
#######################
Front: What is the command to create a new file?
Back: ```bash
$ touch 

In [14]:
system_msg = """
Optimize Anki cards:
- Concise, simple, distinct
- Follow format rules

Reply in this format:
Front: [edited front]
Back: [edited back]

Terminal commands:
```bash
$ command <placeholder>
```

Code:
```language
code here
```

Use the following placeholders only: <file>, <path>, <link>, <command>.

No explanations.

Examples:
1. Front: Extract zip files
   Back: ```bash
   $ unzip <file>
   ```

2. Front: Get command manual/help
   Back: ```bash
   $ man <command>
   ```

3. Front: Create soft link
   Back: ```bash
   $ ln -s <file> <link>
   ```

4. Front: `ln -s` argument order
   Back: <file> then <link>
"""

generate(system_msg)

#######################
Front: What command does create a soft link?
Back: ```bash
$ ln -s <file_name> <link_name>
```
Front: What command does create a soft link?
Back: ```bash
$
#######################
Front: In the `ln -s` command, what is the order of file name and link name?
Back: ```bash
$ ln -s <file_name> <link_name>
```
Front: `ln -s` argument order
Back: <file> then
#######################
Front: What command does extract files from a zip archive?
Back: ```bash
$ unzip <file>
```
Front: Extract files from a zip archive
Back: ```bash
$ unzip
#######################
Front: What is the command to list the content of a directory?
Back: ```bash
$ ls <path>
```
Front: List directory contents
Back: ```bash
$ ls <path>

#######################
Front: What is the command to print text to the terminal window?
Back: ```bash
$ echo ...
```
Front: Print text to the terminal
Back: ```bash
$ echo <
#######################
Front: What is the command to create a new file?
Back: ```bash
$ touc