In [1]:
import warnings
# Ignore every warning for the rest of the session
# (presumably to keep the cell outputs free of library warnings).
warnings.filterwarnings('ignore')

In [2]:
from helpers import template
import outlines
from outlines.samplers import greedy

In [3]:
# Hugging Face model identifier passed to Outlines.
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# Load the model through outlines.models.transformers; `model` is the object
# handed to every outlines.generate.* call in this notebook.
model = outlines.models.transformers(model_name)

### First Structure > Choice:

* Use a choice structure when we want to label something with one of a finite number of possible labels. Below is our prompt:

In [8]:
# Sentiment-classification prompt for a single sample restaurant review.
# `template` is imported from the local `helpers` module; presumably it wraps
# the text in the model's chat format — TODO confirm against helpers.py.
# NOTE(review): the review sentence contains a garbled fragment ("a the").
# It is left untouched because it is part of the text sent to the model.
prompt = template("""
Look at this restaurant review and classify its sentiment. 
Respond only with 'positive' or 'negative':

Review: The pizza a the was delicious, and the service was excellent.
""")

In [9]:
# The generator may only emit one of these labels.
sentiment_labels = ['positive', 'negative']

# Choice-constrained generator over the shared model, using the greedy sampler.
chooser = outlines.generate.choice(model, sentiment_labels, sampler=greedy())

In [10]:
# Run the choice-constrained generator on the review prompt; the cell displays the returned label.
chooser(prompt)

'positive'

### Second Structure > Phone Number:

* Here we want to extract a phone number in a specific format.

In [11]:
# Prompt asking the model to rewrite the dash-separated number below
# in the "(XXX) XXX-XXXX" layout.
phone_prompt = template("""
Extract the phone number from the example,
please use the format: (XXX) XXX-XXXX

206-555-1234

""")

In [12]:
# "(XXX) XXX-XXXX": literal parentheses around 3 digits, a space,
# 3 digits, a hyphen, then 4 digits.
phone_regex = r'\([0-9]{3}\) [0-9]{3}-[0-9]{4}'

In [13]:
# Regex-constrained generator for phone numbers, using the greedy sampler.
phone_generator = outlines.generate.regex(model, phone_regex, sampler=greedy())

In [14]:
# Generate the reformatted phone number; the cell displays the returned string.
phone_generator(phone_prompt)

'(206) 555-1234'

### Third Structure > Email Address:

In [15]:
# Local part: 3-10 ASCII letters/digits; then "@"; then a domain of
# 4-20 lowercase letters; then a literal ".com".
email_regex = r'[a-zA-Z0-9]{3,10}@[a-z]{4,20}\.com'

In [16]:
# Free-form request; the output shape is enforced by the regex, not by the prompt.
email_prompt = template("Give me an email address for someone at amazon")

In [17]:
# Regex-constrained generator for e-mail addresses, using the greedy sampler.
email_generator = outlines.generate.regex(model, email_regex, sampler=greedy())

In [18]:
# Generate an address matching email_regex; the cell displays the returned string.
email_generator(email_prompt)

'Icyblue@amazon.com'

### Fourth Structure > HTML Image Tag:

* Here we use a regex to generate an HTML image tag based on the file name we provide.

In [19]:
# Hand-written sample tag showing the target shape of the generated output.
example = '<img src="large_dinosaur.png" alt="Image of Large Dinosaur">'

In [20]:
# <img> tag whose src is a \w+ file name ending in .png, .jpg or .gif,
# and whose alt text is one or more word characters or spaces.
img_tag_regex = r'<img src="\w+\.(png|jpg|gif)" alt="[\w ]+">'

In [21]:
import re

# Check the pattern against the hand-written sample and print the matched text.
sample_match = re.search(img_tag_regex, example)
print(sample_match[0])

<img src="large_dinosaur.png" alt="Image of Large Dinosaur">


In [22]:
# Regex-constrained generator for <img> tags.
# NOTE(review): no `sampler` is passed here, unlike the greedy generators in the
# other cells, so the library default applies — presumably stochastic, meaning
# the output may differ between runs. TODO confirm.
img_tag_generator = outlines.generate.regex(model, img_tag_regex)

In [23]:
# Build the prompt first, then ask the regex-constrained generator for a tag.
img_tag_prompt = template(
    """Generate a basic html image tag for the file 'big_fish.png', 
        make sure to include an alt tag"""
)
img_tag = img_tag_generator(img_tag_prompt)

In [24]:
# Show the raw generated markup as plain text.
print(img_tag)

<img src="big_fish.png" alt="Big Fish">


In [25]:
from IPython.display import HTML, display

# Render the generated markup as HTML in the notebook.
# The string is model output, but it was produced under img_tag_regex,
# which only permits a single <img> tag with \w file names and [\w ] alt text.
display(HTML(img_tag))

### Fifth Structure > Tic-Tac-Toe:

In [26]:
# Assemble the board pattern from its parts instead of one long literal.
_ttt_cell = r'[XO ]'                       # one square: X, O or blank
_ttt_row = r'\|'.join([_ttt_cell] * 3)     # three squares separated by "|"
_ttt_divider = r'\n-\+-\+-\n'              # the "-+-+-" line between rows
ttt_regex = _ttt_divider.join([_ttt_row] * 3)

In [27]:
# Regex-constrained generator for a full 3x3 board, using the greedy sampler.
ttt_generator = outlines.generate.regex(model, ttt_regex, sampler=greedy())

In [28]:
# The prompt describes the board layout and ends mid-thought so the model
# continues with a board.
# NOTE(review): unlike the other cells, this prompt is passed as a raw string
# rather than through `template(...)` — presumably intentional (completion-style
# prompt); confirm.
ttt_out = ttt_generator("""
We'll be representing an ASCII tic-tac-toe board like this:
```
 | | 
-+-+-
 | | 
-+-+-
 | | 
```
With X,O or a blank space being valid entries.
Here is an example game that is currently in progress:
"""
)

In [29]:
# print() so the embedded newlines render as a board rather than as "\n" escapes.
print(ttt_out)

X|O|O
-+-+-
 | | 
-+-+-
 | | 


### Sixth Structure > CSV:

* Generate CSV content straight from the model and load it into a pandas DataFrame.

In [30]:
# CSV structure: a fixed header line followed by 1-3 data rows.
# Each row is: a 3-letter upper-case code, an amount, and a cost with exactly
# two decimals, terminated by a newline. Amount and the integer part of cost
# share the same shape: optional leading 1s followed by one digit.
# Fix: the cost column previously read `1]*[0-9]` (missing "["), which forced
# every cost to start with a literal "1" and allowed stray "]" characters.
csv_regex = r'Code,Amount,Cost\n([A-Z]{3},[1]*[0-9],[1]*[0-9]\.[0-9]{2}\n){1,3}'

In [31]:
# Regex-constrained generator for the CSV text.
# NOTE(review): no `sampler` is passed here, unlike the greedy generators in the
# other cells, so the library default applies — presumably stochastic, meaning
# the rows may differ between runs. TODO confirm.
csv_generator = outlines.generate.regex(model, csv_regex)

In [32]:
# Ask for a small inventory table; the column names and the row count are
# enforced by csv_regex rather than by this wording.
csv_out = csv_generator(
    template(
        """Create a CSV file for 2-3 store inventory items.
           Include a column 'Code', 'Amount', and 'Cost'.
        """)
)

In [33]:
from io import StringIO
import pandas as pd
# Wrap the generated text in an in-memory file object so pandas can parse it
# directly; the resulting DataFrame is the cell's displayed output.
pd.read_csv(StringIO(csv_out))

Unnamed: 0,Code,Amount,Cost
0,ABC,12,15.0
1,ABB,2,10.5
2,ACB,3,12.0


### Seventh Structure > GSM8K and Making REGEX easier:

* Implementing a structure for GSM8K, which is a common LLM evaluation benchmark that uses grade school questions to see if LLMs can answer them correctly.

In [34]:
from outlines.types import sentence, digit
from outlines.types.dsl import to_regex

# Write between 1-2 sentences (repeat(1,2) becomes the {1,2} quantifier in the regex)
reasoning = "Reasoning: " + sentence.repeat(1,2)
# Answer in 1-4 digits
answer = "So the answer is: " + digit.repeat(1,4)

# Display the regex string that the combined structure compiles to
to_regex(reasoning + "\n" + answer)

'Reasoning:\\ (([A-Z].*\\s*[.!?])){1,2}\\\nSo\\ the\\ answer\\ is:\\ ((\\d)){1,4}'

In [35]:
# Compile the DSL structure to a regex string, then build a
# regex-constrained generator that uses the greedy sampler.
gsm8k_regex = to_regex(reasoning + "\n" + answer)
gsm8k_generator = outlines.generate.regex(model, gsm8k_regex, sampler=greedy())

In [36]:
# Sample grade-school word problem.
# Note: the triple-quoted literal begins and ends with a newline character.
question = """
Sally has 5 apples, then received 2 more, how many apples does Sally have?
"""

In [37]:
# One-shot prompt: a worked example in the exact "Reasoning: ... / So the
# answer is: N" layout, followed by the real question.
# Fixes: "and the end" -> "at the end", "Joes gives" -> "Joe gives".
# The question is also stripped so it sits on the same line as "Question:",
# matching the example (the `question` literal carries its own leading and
# trailing newlines).
question_prompt = template(f"""
Please answer the question at the end using the following format:
Example:
Question: Tom has 3 cucumbers, Joe gives him 2 more cucumbers,
          how many does Tom have?
Reasoning: Tom started with 3 cucumbers, then received 2 more.
           This means he has 5 cucumbers.
So the answer is: 5

Here is the question you need to answer:
Question: {question.strip()}
""")

In [38]:
# Generate reasoning plus answer under the GSM8K structure.
result = gsm8k_generator(question_prompt)

# print() so the newline between the reasoning and the answer line is rendered.
print(result)

Reasoning: Sally started with 5 apples, then received 2 more. This means she has 5 + 2 = 7 apples.
So the answer is: 7
