In [13]:
import pandas as pd
from typing import List, Dict
import random
import pathlib
import jinja2
import openai
from dotenv import load_dotenv

## Few Shot classifier Logic

I initially developed the logic of the few-shot classifier here.  
I used this notebook to develop the backbone of my classifier. 

The final approach of my GPT few-shot classifier can be found in [gpt_intent_classifier.py](../gpt_intent_classifier.py)

PS: Some function signatures have changed in the final deliverable to adhere to a more object-oriented architecture. 

In [14]:
def create_label_intents(unique_intents: List[str]) -> List[Dict[str, str]]:
    """
    Generate a list of labels and intents based on a list of unique intents.

    Parameters:
        unique_intents (list): A list of unique intents.

    Returns:
        list: A list of dictionaries, where each dictionary contains a label and its corresponding intent.

    Example:
        >>> unique_intents = ['flight', 'airfare', 'ground_service']
        >>> labels = create_label_intents(unique_intents)
        >>> print(labels)
        [{'label': 0, 'intent': 'flight'}, {'label': 1, 'intent': 'airfare'}, {'label': 2, 'intent': 'ground_service'}]
    """
    labels = []
    label_counter = 0
    
    for intent in unique_intents:
        labels.append({"label": label_counter, "intent": intent})
        label_counter += 1
    
    return labels

def create_examples(user_prompts: List[str], intents: List[str], labels: List[Dict[str, str]]) -> List[Dict[str, any]]:
    """
    Create a list of examples with text and corresponding labels based on user prompts and intents.

    Parameters:
        user_prompts (list): A list of user prompts.
        intents (list): A list of intent descriptions.
        labels (list): A list of dictionaries containing label-intent mappings.

    Returns:
        list: A list of dictionaries, where each dictionary contains text and its corresponding label.

    Example:
        >>> user_prompts = ['Today was a horrible day', 'Yesterday was a great day']
        >>> intents = ['negative', 'positive']
        >>> labels = [{'label': 0, 'intent': 'negative'}, {'label': 1, 'intent': 'positive'}]
        >>> examples = create_examples(user_prompts, intents, labels)
        >>> print(examples)
        [{'text': 'Today was a horrible day', 'label': 0}, {'text': 'Yesterday was a great day', 'label': 1}]
    """
    examples = []
    
    for prompt, intent in zip(user_prompts, intents):
        label = next(item["label"] for item in labels if item["intent"] == intent)
        examples.append({"text": prompt, "label": label})
    
    return examples

def get_training_examples(examples: List[Dict[str, any]], max_per_label: int = 10) -> List[Dict[str, any]]:
    """
    Get a subset of examples for each label, limiting to a maximum number of examples per label.

    Parameters:
        examples (list): A list of examples, where each example is a dictionary containing text and label.
        max_per_label (int): Maximum number of examples per label.

    Returns:
        list: A list of dictionaries, containing a subset of examples for each label.

    Example:
        >>> training_examples = get_training_examples(examples, max_per_label=10)
        >>> print(len(training_examples))
        30  # Assuming there are 3 unique labels with 10 examples each
    """
    training_examples = []
    label_count = {}
    
    # Initialize label_count with all possible labels as keys and values set to 0
    all_labels = set(example["label"] for example in examples)
    for label in all_labels:
        label_count[label] = 0

    # Shuffle examples to ensure randomness
    random.shuffle(examples)

    for example in examples:
        label = example["label"]
        if label not in label_count:
            label_count[label] = 0

        # Check if the maximum limit for the label has been reached
        if label_count[label] < max_per_label:
            training_examples.append(example)
            label_count[label] += 1

        # Break the loop if all labels reached the maximum limit
        if all(count >= max_per_label for count in label_count.values()):
            break

    return training_examples

def get_prompt_template(template_file_path: str) -> str: 
    path = pathlib.Path(template_file_path)
    with path.open() as f:
        prompt_template = jinja2.Template(f.read())
        
    return prompt_template

def training_file_to_dataframe(train_ds_path: str, custom_headers = ['user_prompt', 'intents']) -> pd.DataFrame:
    # Read the TSV file into a pandas DataFrame
    train_ds = pd.read_csv(train_ds_path, sep='\t', header=None, names=custom_headers)

    return train_ds

def construct_prompt_from_template(template_file_path: str, training_examples: list, labels: list, text_to_classify: str) -> str:
    prompt_template = get_prompt_template(template_file_path = template_file_path)
    prompt = prompt_template.render(
        examples=training_examples,
        labels=labels,
        text=text_to_classify,
    )

    return prompt

def get_prediction_labels(prompt: str) -> str:
    load_dotenv() # openai api key
    
    completion = openai.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    )
    
    pred_labels = completion.choices[0].message.content
    return pred_labels

def handle_response(response):
    pass

def convert_pred_labels_to_intents(pred_labels: str, labels: list()) -> list():
    # Convert string to list of integers
    pred_labels = [int(x) for x in pred_labels.strip('[]').split(',')]
    # Map each integer to the corresponding label value
    predicted_intents = [labels[label]["intent"] for label in pred_labels]
    return predicted_intents

In [15]:

train_ds_path = '../data/atis/train.tsv'
template_file_path = '../data/few_shot_template.jinja2'

train_ds = training_file_to_dataframe(train_ds_path = train_ds_path)
train_ds

Unnamed: 0,user_prompt,intents
0,i want to fly from boston at 838 am and arrive...,flight
1,what flights are available from pittsburgh to ...,flight
2,what is the arrival time in san francisco for ...,flight_time
3,cheapest airfare from tacoma to orlando,airfare
4,round trip fares from pittsburgh to philadelph...,airfare
...,...,...
4629,what is the airfare for flights from denver to...,airfare
4630,do you have any flights from denver to baltimo...,flight
4631,which airlines fly into and out of denver,airline
4632,does continental fly from boston to san franci...,flight


In [16]:
labels = create_label_intents(unique_intents= train_ds['intents'].unique().tolist())
labels



[{'label': 0, 'intent': 'flight'},
 {'label': 1, 'intent': 'flight_time'},
 {'label': 2, 'intent': 'airfare'},
 {'label': 3, 'intent': 'aircraft'},
 {'label': 4, 'intent': 'ground_service'},
 {'label': 5, 'intent': 'airport'},
 {'label': 6, 'intent': 'airline'},
 {'label': 7, 'intent': 'distance'},
 {'label': 8, 'intent': 'abbreviation'},
 {'label': 9, 'intent': 'ground_fare'},
 {'label': 10, 'intent': 'quantity'},
 {'label': 11, 'intent': 'city'},
 {'label': 12, 'intent': 'flight_no'},
 {'label': 13, 'intent': 'capacity'},
 {'label': 14, 'intent': 'flight+airfare'},
 {'label': 15, 'intent': 'meal'},
 {'label': 16, 'intent': 'restriction'},
 {'label': 17, 'intent': 'airline+flight_no'},
 {'label': 18, 'intent': 'ground_service+ground_fare'},
 {'label': 19, 'intent': 'airfare+flight_time'},
 {'label': 20, 'intent': 'cheapest'},
 {'label': 21, 'intent': 'aircraft+flight+flight_no'}]

In [17]:
examples = create_examples(user_prompts= train_ds['user_prompt'].tolist(), intents= train_ds['intents'].tolist(), labels= labels)
examples

[{'text': 'i want to fly from boston at 838 am and arrive in denver at 1110 in the morning',
  'label': 0},
 {'text': 'what flights are available from pittsburgh to baltimore on thursday morning',
  'label': 0},
 {'text': 'what is the arrival time in san francisco for the 755 am flight leaving washington',
  'label': 1},
 {'text': 'cheapest airfare from tacoma to orlando', 'label': 2},
 {'text': 'round trip fares from pittsburgh to philadelphia under 1000 dollars',
  'label': 2},
 {'text': 'i need a flight tomorrow from columbus to minneapolis', 'label': 0},
 {'text': 'what kind of aircraft is used on a flight from cleveland to dallas',
  'label': 3},
 {'text': 'show me the flights from pittsburgh to los angeles on thursday',
  'label': 0},
 {'text': 'all flights from boston to washington', 'label': 0},
 {'text': 'what kind of ground transportation is available in denver',
  'label': 4},
 {'text': 'show me the flights from dallas to san francisco', 'label': 0},
 {'text': 'show me the f

In [18]:
training_examples = get_training_examples(examples= examples, max_per_label= 3)
training_examples


[{'text': 'nonstop flights denver to kansas city', 'label': 0},
 {'text': 'show me all flights from charlotte to philadelphia', 'label': 0},
 {'text': 'show me all flights from miami to new york', 'label': 0},
 {'text': 'what kind of ground transportation is there in dallas', 'label': 4},
 {'text': 'what is the ground transportation available in the city of philadelphia',
  'label': 4},
 {'text': "i 'd like to see all the economy fares from baltimore to philadelphia",
  'label': 2},
 {'text': 'what is the lowest fare united charges between boston and san francisco',
  'label': 2},
 {'text': 'what is the lowest fare from denver to pittsburgh', 'label': 2},
 {'text': 'what ground transportation is available at the atlanta airport',
  'label': 4},
 {'text': 'list all the airlines that fly into general mitchell international',
  'label': 6},
 {'text': 'does the philadelphia airport have a name', 'label': 5},
 {'text': 'please tell me how many nonstop flights there are from boston to atlant

In [19]:
# USER PRMPT
user_prmt = 'what types of ground transportation are available in denver'
prompt = construct_prompt_from_template(template_file_path= template_file_path, training_examples= training_examples, labels= labels, text_to_classify = user_prmt)
print('\n Prompt: ')
print(prompt)



 Prompt: 
I want you to classify text for me. Your response must always be a python-like list of the 3 most possible class labels like: [0, 12, 15]
See below all the possible labels and their description

"""
intent: flight
label: 0
"""

"""
intent: flight_time
label: 1
"""

"""
intent: airfare
label: 2
"""

"""
intent: aircraft
label: 3
"""

"""
intent: ground_service
label: 4
"""

"""
intent: airport
label: 5
"""

"""
intent: airline
label: 6
"""

"""
intent: distance
label: 7
"""

"""
intent: abbreviation
label: 8
"""

"""
intent: ground_fare
label: 9
"""

"""
intent: quantity
label: 10
"""

"""
intent: city
label: 11
"""

"""
intent: flight_no
label: 12
"""

"""
intent: capacity
label: 13
"""

"""
intent: flight+airfare
label: 14
"""

"""
intent: meal
label: 15
"""

"""
intent: restriction
label: 16
"""

"""
intent: airline+flight_no
label: 17
"""

"""
intent: ground_service+ground_fare
label: 18
"""

"""
intent: airfare+flight_time
label: 19
"""

"""
intent: cheapest
label: 20
""

In [20]:
response = get_prediction_labels(prompt= prompt)
response

'[4, 18, 5]'

In [21]:
## VALIDATE RESPONSE





In [22]:
pred_intents = convert_pred_labels_to_intents(pred_labels = response, labels=labels)
print('\n pred_intents: \n')
print(pred_intents)


 pred_intents: 

['ground_service', 'ground_service+ground_fare', 'airport']
