In [6]:
import pandas as pd
# Load data
data_path = "./dataset/restaurant/"
df_train = pd.read_csv(data_path + "train.csv")
df_test = pd.read_csv(data_path + "test.csv")
# keep attributes.Ambience, attributes.AgesAllowed, attributes.GoodForDancing, attributes.RestaurantsReservations, attributes.RestaurantsPriceRange2, stars, review, attributes.NoiseLevel, id, name, label for training dataset
columns_to_keep = [
        'name', 
        'review',
        'label'
    ]
df_train = df_train[columns_to_keep]
df_test = df_test[columns_to_keep[:-1]]

import string
import re

# remove sepcial characters
def remove_special_characters(text):
    # remove change line
    text = text.replace('\\n', ' ')
    # Replace special characters with a space
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return text

df_train['review'] = df_train['review'].apply(remove_special_characters)
df_test['review'] = df_test['review'].apply(remove_special_characters)

print("text after removing special characters:", df_train['review'][0])

# remove urls and html tags
def remove_urls(text):
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    return text

df_train['review'] = df_train['review'].apply(remove_urls)
df_test['review'] = df_test['review'].apply(remove_urls)

print("text after removing urls and html tags:", df_train['review'][0])

# remove extra whitespaces
def remove_extra_whitespaces(text):
    text = re.sub(' +', ' ', text)
    return text

df_train['review'] = df_train['review'].apply(remove_extra_whitespaces)
df_test['review'] = df_test['review'].apply(remove_extra_whitespaces)

print("text after removing extra whitespaces:", df_train['review'][0])

text after removing special characters: So  we stopped here on our way to the Side Quest  which is just up the street    We did not know what to expect  but came on a 3  taco night     I m so glad we did    Mostly  the Rush Inn is a bar ish place  but there s plenty of back and side seating at booths all through  The place has a dark  rich wood tone  and the patrons were decent folks   The tacos were EXCEPTIONAL  and their selection is quite good  with all sorts of stuff I intend to try as soon as possible  I think this place would make a nice casual dinner lunch stop  and if you need to get beverage  the barkeep and wait staff will keep em coming    So  for a quick tasty bite  don t walk slow  RUSH IN TO THE RUSH INN  My boyfriend and I heard about this little place near Rush Inn that offered board games and poutine on Friday  When we arrived  they didn t have food because the chef didn t show up     so we ventured to this place a block down  Great vibe inside  A typical bar scene  bu

In [7]:
# print labels
print("labels:", df_train['label'].unique())

labels: ['american (traditional)' 'american (new)' 'mexican' 'chinese'
 'mediterranean' 'italian' 'thai' 'asian fusion' 'canadian (new)'
 'japanese']


# Various Sample Prompt

In [47]:
import ollama
from sklearn.metrics import accuracy_score
def chat_with_context(history):
    response = ollama.chat(model='llama3', messages=history)
    return response['message']['content']
chat_history = []

num_samples = 5
# Updated Task Description for Restaurant Type Prediction
task_description = "We need to predict the type restaurant based on its name and one customer review. The categories are 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'."

generate_samples_request = (
    f"As an advanced language model you should create {num_samples} samples for the task outlined below.\n"
    "Generate samples that are likely to be correctly classified into one of the specified categories and samples that might be misclassified according to the task instructions.\n\n"
    f"### Task Description:\n{task_description}\n\n"
    "### Requirements for Samples:\n"
    "1. Each sample must present a unique and intricate challenge.\n"
    "2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results for some samples.\n"
    "3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.\n"
    "4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.\n"
    "Generate the samples keeping these requirements in mind.\n###"
)

chat_history.append({'role': 'user', 'content': task_description})
chat_history.append({'role': 'user', 'content': generate_samples_request})

samples_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': samples_response})
print("Generated Samples:")
print(samples_response)

# Analyze Samples with Chain of Thought Request
analyze_samples_request = (
    f"Based on the provided samples: {samples_response}\n"
    "Please think step by step and derive general principles for classifying restaurants into the specified categories: 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'.\n"
    "Focus on creating broad, example-agnostic guidelines that can be applied universally to classify any restaurant based on its name and customer reviews. Conclude your analysis with clear, concise bullet points outlining:\n"
    "- The key indicators or characteristics that typically define each category.\n"
    "- Common pitfalls that might lead to misclassifications and strategies for avoiding these errors.\n"
    "- Criteria or rules under which circumstances each label should be predicted.\n"
    "These principles should help in accurately predicting the cuisine type of a restaurant without relying on specific examples."
)

chat_history.append({'role': 'user', 'content': analyze_samples_request})

# Retrieve and print the analysis from the language model
analysis_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': analysis_response})
print("Analysis of Restaurant Classification Guidelines:")
print(analysis_response)

# Generate Optimized Prompt
generate_prompt_request = (
    f"Based on the following analysis: {analysis_response}\nGenerate an optimized prompt for predicting "
    "the type of restaurant. Ensure the model responds only with one of the specified categories: 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'.\n\n"
    "### Requirements for Optimized Prompt:\n"
    "1. The prompt must include a clear description of the task and the labels.\n"
    "2. It should provide a comprehensive criteria for classifying restaurants as one of the restaurant types: 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese' based on the name and customer review.\b"
    "3. The prompt must ensure that the model responds strictly with one of 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'.\b"
    "4. The prompt should help the model avoid common pitfalls and misclassifications identified during the analysis.\n"
    "5. Ensure the language is unambiguous and tailored to maximize the model's prediction accuracy.\n"
    "6. Encourage the model to think step by step.\n"
    "Respond with no other explanation but only the content of the prompt that is ready for the model to predict\n"
    "Prompt:"
)

chat_history.append({'role': 'user', 'content': generate_prompt_request})

optimized_prompt_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': optimized_prompt_response})
print("Optimized Prompt:")
print(optimized_prompt_response)


# Function to predict the cuisine type of a given restaurant
def get_prediction(restaurant_name, customer_review):
    prompt = (
        "Follow the guidelines of the prompt:\n"
        f"{optimized_prompt_response}\n\n"
        f"Restaurant Name: {restaurant_name}\n"
        f"Customer Review: {customer_review}\n"
        "### Requirements:\n"
        "1. Respond with only the restaurant type: 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'.\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only the cuisine name:"
    )
    response = ollama.generate(model='llama3', prompt=prompt)

    prediction = response['response'].strip().lower().replace("**", "").replace("'", "").replace('"', '')
    # if prediction not in ['American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese']:
    #     prediction = "Invalid Prediction"
    return prediction

predictions = []

for i, row in df_train.iterrows():
    if i == 10000:
        break
    if i % 1000 == 0:
        print(f"Processed {i+1} samples")
    restaurant_name = row['name']
    customer_review = row['review']
    prediction = get_prediction(restaurant_name, customer_review)
    predictions.append(prediction)
    # print(f"Predicted Restaurant Type: {prediction}\n")
    # print(f"Actual Restaurant Type: {row['label']}\n")

accuracy = accuracy_score(df_train['label'][:len(predictions)], predictions)
print(f"Accuracy: {accuracy}")

Generated Samples:
Here are five samples that meet the requirements:

**Sample 1:**
Restaurant Name: Fusion Station
Customer Review: "The menu is a mix of Korean BBQ and American comfort food. I had the bulgogi beef sliders with kimchi slaw, and it was amazing! The atmosphere is modern and lively."

**Challenge:** This sample combines Asian fusion elements (Korean BBQ) with American comfort food, making it challenging to categorize based solely on the name.

**Sample 2:**
Restaurant Name: Bella Vita
Customer Review: "This cozy Italian place serves up authentic pasta dishes like nonna used to make. The fettuccine Alfredo is creamy and rich."

**Challenge:** The name suggests Mediterranean or Italian cuisine, but the customer review focuses specifically on traditional Italian food, making it unclear which category best fits.

**Sample 3:**
Restaurant Name: Tamarindo
Customer Review: "I tried the chiles rellenos with roasted vegetables, and it was love at first bite. The patio has a great

# Basic Prompt

In [50]:
def get_prediction(restaurant_name, customer_review):
    prompt = (
        "Predict the type of restaurant based on the given restaurant name and customer review.\n"
        f"Restaurant Name: {restaurant_name}\n"
        f"Customer Review: {customer_review}\n"
        "### Requirements:\n"
        "1. Respond with only the restaurant type: 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'.\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only the cuisine name:"
    )
    response = ollama.generate(model='llama3', prompt=prompt)

    prediction = response['response'].strip().lower()
    # if prediction not in ['American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese']:
    #     prediction = "Invalid Prediction"
    return prediction

predictions = []

for i, row in df_train.iterrows():
    if i == 10000:
        break
    if i % 100 == 0:
        print(f"Processed {i+1} samples")
    restaurant_name = row['name']
    customer_review = row['review']
    prediction = get_prediction(restaurant_name, customer_review)
    predictions.append(prediction)
    # print(f"Predicted Restaurant Type: {prediction}\n")
    # print(f"Actual Restaurant Type: {row['label']}\n")

accuracy = accuracy_score(df_train['label'][:len(predictions)], predictions)
print(f"Accuracy on the first 100 samples: {accuracy}")

Processed 1 samples
Processed 101 samples
Processed 201 samples
Processed 301 samples
Processed 401 samples
Processed 501 samples
Processed 601 samples
Processed 701 samples
Processed 801 samples
Processed 901 samples
Processed 1001 samples
Processed 1101 samples
Processed 1201 samples
Processed 1301 samples
Processed 1401 samples
Processed 1501 samples
Processed 1601 samples
Processed 1701 samples
Processed 1801 samples
Processed 1901 samples
Processed 2001 samples
Processed 2101 samples
Processed 2201 samples
Processed 2301 samples
Processed 2401 samples
Processed 2501 samples
Processed 2601 samples
Processed 2701 samples
Processed 2801 samples
Processed 2901 samples
Processed 3001 samples
Processed 3101 samples
Processed 3201 samples
Processed 3301 samples
Processed 3401 samples
Processed 3501 samples
Processed 3601 samples
Processed 3701 samples
Processed 3801 samples
Processed 3901 samples
Processed 4001 samples
Processed 4101 samples
Processed 4201 samples
Processed 4301 samples


# zero shot COT prompt

In [53]:
def get_prediction(restaurant_name, customer_review):
    prompt = (
        "Predict the type of restaurant based on the given restaurant name and customer review.\n"
        "Let's think step by step:\n"
        f"Restaurant Name: {restaurant_name}\n"
        f"Customer Review: {customer_review}\n"
        "### Requirements:\n"
        "1. Respond with only the restaurant type: 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'.\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only the cuisine name:"
    )
    response = ollama.generate(model='llama3', prompt=prompt)

    prediction = response['response'].strip().lower()
    # if prediction not in ['American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese']:
    #     prediction = "Invalid Prediction"
    return prediction

predictions = []

for i, row in df_train.iterrows():
    if i == 10000:
        break
    if i % 100 == 0:
        print(f"Processed {i+1} samples")
    restaurant_name = row['name']
    customer_review = row['review']
    prediction = get_prediction(restaurant_name, customer_review)
    predictions.append(prediction)
    # print(f"Predicted Restaurant Type: {prediction}\n")
    # print(f"Actual Restaurant Type: {row['label']}\n")

accuracy = accuracy_score(df_train['label'][:len(predictions)], predictions)
print(f"Accuracy on the first 100 samples: {accuracy}")

Processed 1 samples
Processed 101 samples
Processed 201 samples
Processed 301 samples
Processed 401 samples
Processed 501 samples
Processed 601 samples
Processed 701 samples
Processed 801 samples
Processed 901 samples
Processed 1001 samples
Processed 1101 samples
Processed 1201 samples
Processed 1301 samples
Processed 1401 samples
Processed 1501 samples
Processed 1601 samples
Processed 1701 samples
Processed 1801 samples
Processed 1901 samples
Processed 2001 samples
Processed 2101 samples
Processed 2201 samples
Processed 2301 samples
Processed 2401 samples
Processed 2501 samples
Processed 2601 samples
Processed 2701 samples
Processed 2801 samples
Processed 2901 samples
Processed 3001 samples
Processed 3101 samples
Processed 3201 samples
Processed 3301 samples
Processed 3401 samples
Processed 3501 samples
Processed 3601 samples
Processed 3701 samples
Processed 3801 samples
Processed 3901 samples
Processed 4001 samples
Processed 4101 samples
Processed 4201 samples
Processed 4301 samples


# few shot prompting

In [57]:
# use the first three samples to generate the few-shot text
few_shot_texts = df_train['review'][:3]
few_shot_labels = df_train['label'][:3]
def get_prediction(restaurant_name, customer_review):
    prompt = (
        "Here are three samples in the dataset to help you predict the type of restaurant based on the given restaurant name and customer review.\n"
        f"sample 1: {few_shot_texts[0]}. Label: {few_shot_labels[0]}\n"
        f"sample 2: {few_shot_texts[1]}. Label: {few_shot_labels[1]}\n"
        f"sample 3: {few_shot_texts[2]}. Label: {few_shot_labels[2]}\n"
        "Predict the type of restaurant based on the given restaurant name and customer review.\n"
        f"Restaurant Name: {restaurant_name}\n"
        f"Customer Review: {customer_review}\n"
        "### Requirements:\n"
        "1. Respond with only the restaurant type: 'American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese'.\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only the cuisine name:"
    )
    response = ollama.generate(model='llama3', prompt=prompt)

    prediction = response['response'].strip().lower()
    # if prediction not in ['American (Traditional)', 'American (New)', 'Mexican', 'Chinese', 'Mediterranean', 'Italian', 'Thai', 'Asian Fusion', 'Canadian (New)', 'Japanese']:
    #     prediction = "Invalid Prediction"
    return prediction

predictions = []

for i, row in df_train.iterrows():
    if i == 10000:
        break
    if i % 100 == 0:
        print(f"Processed {i+1} samples")
    restaurant_name = row['name']
    customer_review = row['review']
    prediction = get_prediction(restaurant_name, customer_review)
    predictions.append(prediction)
    # print(f"Predicted Restaurant Type: {prediction}\n")
    # print(f"Actual Restaurant Type: {row['label']}\n")

accuracy = accuracy_score(df_train['label'][:len(predictions)], predictions)
print(f"Accuracy on the first 100 samples: {accuracy}")

Processed 1 samples
Processed 101 samples
Processed 201 samples
Processed 301 samples
Processed 401 samples
Processed 501 samples
Processed 601 samples
Processed 701 samples
Processed 801 samples
Processed 901 samples
Processed 1001 samples
Processed 1101 samples
Processed 1201 samples
Processed 1301 samples
Processed 1401 samples
Processed 1501 samples
Processed 1601 samples
Processed 1701 samples
Processed 1801 samples
Processed 1901 samples
Processed 2001 samples
Processed 2101 samples
Processed 2201 samples
Processed 2301 samples
Processed 2401 samples
Processed 2501 samples
Processed 2601 samples
Processed 2701 samples
Processed 2801 samples
Processed 2901 samples
Processed 3001 samples
Processed 3101 samples
Processed 3201 samples
Processed 3301 samples
Processed 3401 samples
Processed 3501 samples
Processed 3601 samples
Processed 3701 samples
Processed 3801 samples
Processed 3901 samples
Processed 4001 samples
Processed 4101 samples
Processed 4201 samples
Processed 4301 samples
