# Environment Setting

In [None]:
pip install dspy-ai

In [None]:
!pip install --upgrade openai

In [None]:
import openai
from openai import OpenAI
import os
import pandas as pd
import dspy
import math

In [None]:
# Load the training and test review tables from CSV files in the working directory.
trainset = pd.read_csv('trainset.csv')
testset = pd.read_csv('testset.csv')

In [None]:
# Load the validation table. 'correct_labels_cleaned.csv' is also the file written by
# the last cell of this notebook.
validation = pd.read_csv('correct_labels_cleaned.csv')

In [None]:
# Display the column labels of the training table.
trainset.columns

Index(['Unnamed: 0', 'restaurant_name', 'Name', 'Profile Location', 'Score',
       'Date', 'Elite 24', 'Friends', 'Reviews', 'Photos', 'reserved',
       'pictures', 'checkin', 'Comment', 'Helpful', 'Thanks', 'Love this',
       'Oh no', 'Reply Date', 'Reply Content'],
      dtype='object')

# the simplest answer

In [None]:
def summarization(text):
    """Baseline summarizer: ask gpt-4o-mini for the issues and praise in one review.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Review text; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    system_message = {
        "role": "system",
        "content": """You are a helpful assistant for analyzing online reviews for restaurants.
                                              You will be provided a review for a restaurant in new york city""",
    }
    user_message = {
        "role": "user",
        "content": f"Summarize the business issues and good feedback from the review:\n\n{text}",
    }
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        max_tokens=100,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [None]:
# Baseline run on the training reviews. Use the baseline `summarization` function
# defined in this section; `summarize_text` is only defined in later sections, so
# calling it here fails (or silently uses a different prompt) depending on run order.
trainset_00 = trainset.copy()
trainset_00['summary'] = trainset_00['Comment'].apply(summarization)

In [None]:
# Display the column labels of the baseline training table.
trainset_00.columns

Index(['Unnamed: 0', 'restaurant_name', 'Name', 'Profile Location', 'Score',
       'Date', 'Elite 24', 'Friends', 'Reviews', 'Photos', 'reserved',
       'pictures', 'checkin', 'Comment', 'Helpful', 'Thanks', 'Love this',
       'Oh no', 'Reply Date', 'Reply Content', 'topics', 'summary'],
      dtype='object')

In [None]:
# Baseline run on the test reviews. Use the baseline `summarization` function
# defined in this section; `summarize_text` is only defined in later sections, so
# calling it here fails (or silently uses a different prompt) depending on run order.
testset_00 = testset.copy()
testset_00['summary'] = testset_00['Comment'].apply(summarization)

In [None]:
# Display the column labels of the baseline test table.
testset_00.columns

Index(['Unnamed: 0', 'restaurant_name', 'Name', 'Profile Location', 'Score',
       'Date', 'Elite 24', 'Friends', 'Reviews', 'Photos', 'reserved',
       'pictures', 'checkin', 'Comment', 'Helpful', 'Thanks', 'Love this',
       'Oh no', 'Reply Date', 'Reply Content', 'summary'],
      dtype='object')

In [None]:
# Keep only the review text and its generated summary.
trainset_00 = trainset_00[['Comment', 'summary']]

In [None]:
# Write the baseline training output to disk without the row index.
trainset_00.to_csv('trainset_00.csv', index=False)

In [None]:
# Keep only the review text and its summary, then write them to disk.
# The selection and the save are separate statements because `to_csv` returns
# None when given a path; chaining them would rebind `testset_00` to None.
testset_00 = testset_00[['Comment', 'summary']]
testset_00.to_csv('testset_00.csv', index=False)

# V1

In [None]:
def summarize_text(text):
    """V1 summarizer: ask gpt-4o-mini which issues or good feedback a review mentions.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Review text; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant for reviews of restaurants."},
        {"role": "user", "content": f"what business issues or good feedback are mentioned in this review?:\n\n{text}"},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        max_tokens=500,
    )
    reply = completion.choices[0].message
    return reply.content.strip()

In [None]:
def extract_topics(text):
    """V1 topic extractor: ask gpt-4o-mini for the business-process topics in a text.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Text to analyse; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    system_message = {
        "role": "system",
        "content": """You are a helpful assistant for analyzing online reviews for restaurants.
                                              You will be provided a review for a restaurant in new york city""",
    }
    user_message = {
        "role": "user",
        "content": f"Extract key topics of business process mentioned from the following text:\n\n{text}",
    }
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        max_tokens=500,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [None]:
# Full dataset: add a 'summary' column by calling summarize_text on every
# 'Comment' value (one request per row).
trainset['summary'] = trainset['Comment'].apply(summarize_text)

In [None]:
# V1 order: topics are extracted from the generated summary, not from the raw review.
trainset['topics'] = trainset['summary'].apply(extract_topics)

In [None]:
# Rebind `trainset` to just these three columns; every other column is dropped
# for all later cells that use `trainset`.
trainset = trainset[['Comment','summary','topics']]

In [None]:
# Write the V1 training output to disk without the row index.
trainset.to_csv('trainset_v1.csv', index=False)

In [None]:
# V1 pipeline on the test set, working on a copy so `testset` itself is untouched.
testset_v1 = testset.copy()
# Step 1: summarize each raw review.
testset_v1['summary'] = testset_v1['Comment'].apply(summarize_text)
# Step 2: extract topics from the summary produced in step 1.
testset_v1['topics'] = testset_v1['summary'].apply(extract_topics)
# Keep only the review, its summary and its topics, then save without the row index.
testset_v1 = testset_v1[['Comment','summary','topics']]
testset_v1.to_csv('testset_v1.csv', index=False)

# V2

In [None]:
def extract_topics(text):
    """V2 topic extractor: ask gpt-4o-mini for the business-process topics in a text.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Text to analyse; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    system_prompt = "You are a helpful assistant for analyzing online reviews for restaurants. You will be provided a review for a restaurant in new york city"
    user_prompt = f"Extract key topics of business process mentioned from the following text:\n\n{text}"
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=100,
    )
    answer = completion.choices[0].message.content
    return answer.strip()

In [None]:
def summarize_text(text):
    """V2 summarizer: ask gpt-4o-mini which issues or good feedback a text mentions.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Text to summarize; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    system_prompt = "You are a helpful assistant for reviews of restaurants."
    user_prompt = f"what business issues or good feedback are mentioned in this review?:\n\n{text}"
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=500,
    )
    answer = completion.choices[0].message.content
    return answer.strip()

In [None]:
# V2 pipeline on a copy of the training table: the order is reversed relative to V1.
trainset_v2 = trainset.copy()
# Step 1: extract topics straight from the raw review.
trainset_v2['topics'] = trainset_v2['Comment'].apply(extract_topics)
# Step 2: summarize the extracted topics (not the raw review).
trainset_v2['summary'] = trainset_v2['topics'].apply(summarize_text)

In [None]:
# Write the V2 training output to disk without the row index.
trainset_v2.to_csv('trainset_v2.csv', index=False)

In [None]:
# V2 pipeline on a copy of the test table.
testset_v2 = testset.copy()
# Step 1: extract topics straight from the raw review.
testset_v2['topics'] = testset_v2['Comment'].apply(extract_topics)
# Step 2: summarize the extracted topics (not the raw review).
testset_v2['summary'] = testset_v2['topics'].apply(summarize_text)

In [None]:
# Write the V2 test output to disk without the row index.
testset_v2.to_csv('testset_v2.csv', index=False)

# Prompt Engineering: summarize and extract

In [None]:
def summarize_text(text):
    """Plain-prompt summarizer: ask gpt-4o-mini to summarize one review.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Review text; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant for reviews of restaurants."},
        {"role": "user", "content": f"Summarize the following review:\n\n{text}"},
    ]
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=150,
    )
    summary = result.choices[0].message.content
    return summary.strip()

In [None]:
def extract_topics(text):
    """Plain-prompt topic extractor: ask gpt-4o-mini for the key topics of a text.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Text to analyse; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant for analyzing online reviews for restaurants. You will be provided a review for a restaurant in new york city"},
        {"role": "user", "content": f"Extract key topics from the following text:\n\n{text}"},
    ]
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=50,
    )
    topics = result.choices[0].message.content
    return topics.strip()

In [None]:
# Summarize every raw review with the current summarize_text and store it as 'Descriptions'.
trainset['Descriptions'] = trainset['Comment'].apply(summarize_text)

In [None]:
# Extract topics from each generated description and store them as 'Categories'.
trainset['Categories'] = trainset['Descriptions'].apply(extract_topics)

In [None]:
# Rename any 'summary'/'topics' columns to 'Descriptions'/'Categories', in place.
# NOTE(review): the two preceding cells already create 'Descriptions' and
# 'Categories'; if 'summary'/'topics' are still present when this runs, the table
# ends up with duplicate column labels. Confirm whether this cell is meant as an
# alternative to those cells rather than a follow-up.
trainset.rename(columns={'summary':'Descriptions', 'topics':'Categories'}, inplace=True)

In [None]:
# Rebind `trainset` to the review text plus the two generated columns.
trainset = trainset[['Comment','Descriptions','Categories']]

In [None]:
# Write the plain-prompt training output to disk without the row index.
trainset.to_csv('trainset_plain_prompt.csv', index=False)

# Prompt plain

In [None]:
def extract_topics(text):
    """Structured topic extractor: ask gpt-4o-mini for up to four categorized topics.

    The user prompt is the instruction text, the requested "- Category: / -
    Description:" output layout, and the review, concatenated in that order.
    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Review text; it is appended to the end of the prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    instructions = """Extract main business process topics with maximum 4 from the following review.
                    Categorize each issue under the relevant area
                    (Food Quality, Customer Service, Cleanliness, Ambiance, Value for Money, Order Accuracy and efficiency, Waiting time, kids or pets friendly, menu choice),
                    and provide a description under each category. no need mention topics that are not involved in the review."""
    output_format = """Structure each topic as follows:
                  - Category: <category of topics mentioned in the review>
                  - Description: <description of the topics in the review in one sentence>
                  """
    review_part = f"""the review is as follows: {text}"""
    prompt = "".join((instructions, output_format, review_part))
    conversation = [
        {"role": "system", "content": "You are a helpful assistant for analyzing online reviews for restaurants. "},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=150,
    )
    reply = completion.choices[0].message
    return reply.content.strip()

# Prompt improved: category, description and attitude

In [None]:
def extract_topics(text):
    """Structured topic extractor: ask gpt-4o-mini for up to four categorized topics.

    The user prompt is the instruction text, the requested "- Category: / -
    Description:" output layout, and the review, concatenated in that order.
    The layout must spell "Description:" correctly: `clean_extracted_data` in this
    notebook finds description lines by searching for exactly that marker, so a
    misspelled label echoed back by the model would be silently dropped.

    The request goes through the notebook-level ``client`` object, which is not
    created in the cells shown here and must exist before this is called.

    Args:
        text: Review text; it is appended to the end of the prompt.

    Returns:
        The message content of the first returned choice, stripped of
        leading and trailing whitespace.
    """
    instructions = """Extract main business process topics with maximum 4 from the following review.
                    Categorize each issue under the relevant area
                    (Food Quality, Customer Service, Cleanliness, Ambiance, Value for Money, Order Accuracy and efficiency, Waiting time, kids or pets friendly, menu choice),
                    and provide a description under each category. no need mention topics that are not involved in the review."""
    output_format = """Structure each topic as follows:
                  - Category: <category of topics mentioned in the review>
                  - Description: <description of the topics in the review in one sentence>
                  """
    prompt = instructions + output_format + f"""the review is as follows: {text}"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for analyzing online reviews for restaurants. "},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150
    )
    return response.choices[0].message.content.strip()

In [None]:
# Load a previously saved prompt-run table.
# NOTE(review): the next cell rebinds `trainset_prompt` immediately, so this load
# only matters if that cell is skipped.
trainset_prompt = pd.read_csv('trainset_prompt.csv')

In [None]:
# Snapshot the table as it is before the 'topics' column is added.
# NOTE(review): the new column is written to `trainset`, not to `trainset_prompt`;
# confirm this is intended.
trainset_prompt = trainset.copy()
# Run the current extract_topics on every raw review (one request per row).
trainset['topics'] = trainset['Comment'].apply(extract_topics)
# Display each review next to its extracted topics.
trainset[['Comment','topics']]

In [None]:
# Adjust the function to pair categories and descriptions correctly, even if lengths do not match
def clean_extracted_data(topic_text):
    """Split one model reply into parallel lists of categories and descriptions.

    Each line containing ``Category:`` contributes the text after that marker to
    the category list; otherwise a line containing ``Description:`` contributes
    the text after that marker to the description list. Taking the text after
    the marker (rather than deleting an exact ``- Category:`` prefix) also copes
    with replies formatted without the dash, with numbering, or with markdown
    bold around the label.

    Args:
        topic_text: Raw reply text. Any non-string value (for example the NaN
            pandas uses for an empty cell) is treated as "nothing extracted".

    Returns:
        A ``(categories, descriptions)`` tuple of lists of equal length. The
        shorter list is padded with ``'No Category'`` / ``'No Description'``.
    """
    if not isinstance(topic_text, str):
        return [], []

    categories = []
    descriptions = []
    for line in topic_text.splitlines():
        # 'Category:' is checked first, as before, so a line carrying both
        # markers is still counted as a category line.
        for marker, bucket in (('Category:', categories), ('Description:', descriptions)):
            _, found, value = line.partition(marker)
            if found:
                # Drop surrounding whitespace and any markdown '*' left by bold labels.
                bucket.append(value.strip().strip('*').strip())
                break

    # Pad the shorter list so the two lists can be paired index by index.
    max_length = max(len(categories), len(descriptions))
    categories += ['No Category'] * (max_length - len(categories))
    descriptions += ['No Description'] * (max_length - len(descriptions))
    return categories, descriptions

In [None]:
# Apply the updated function to each row.
# Each 'topics' value yields a (categories, descriptions) pair; zip(*...) regroups
# those pairs into one sequence per output column.
validation['Categories'], validation['Descriptions'] = zip(*validation['topics'].copy().apply(clean_extracted_data))

In [None]:
# Apply the updated function to each row.
# NOTE(review): this cell repeats the previous cell verbatim; running both just
# recomputes the same two columns.
validation['Categories'], validation['Descriptions'] = zip(*validation['topics'].copy().apply(clean_extracted_data))

In [None]:
# Write the prompt-run table without the row index, like the other saves in this
# notebook. Saving the index is what adds an 'Unnamed: 0' column when the file is
# read back with pd.read_csv.
trainset.to_csv('trainset_prompt.csv', index=False)

# Validation reformat

In [None]:
# Reload the validation table from 'correct_labels.csv'; this replaces whatever
# `validation` was bound to in earlier cells.
validation = pd.read_csv('correct_labels.csv')

In [None]:
# Keep the review text, the model's topic text, and the c*/d*/'missing' columns
# that the scoring function below sums.
# (Presumably those columns are hand-entered correctness marks; confirm against
# the labelling sheet. 'c5' is kept here but is not read by the scoring function.)
validation = validation[['Comment', 'topics', 'c1', 'd1', 'c2', 'd2', 'c3', 'd3',
      'c4', 'd4', 'missing', 'c5']]

In [None]:
# Adjust the function to pair categories and descriptions correctly, even if lengths do not match
def clean_extracted_data(topic_text):
    """Split one model reply into parallel lists of categories and descriptions.

    Each line containing ``Category:`` contributes the text after that marker to
    the category list; otherwise a line containing ``Description:`` contributes
    the text after that marker to the description list. Taking the text after
    the marker (rather than deleting an exact ``- Category:`` prefix) also copes
    with replies formatted without the dash, with numbering, or with markdown
    bold around the label.

    Args:
        topic_text: Raw reply text. Any non-string value (for example the NaN
            pandas uses for an empty cell) is treated as "nothing extracted".

    Returns:
        A ``(categories, descriptions)`` tuple of lists of equal length. The
        shorter list is padded with ``'No Category'`` / ``'No Description'``.
    """
    if not isinstance(topic_text, str):
        return [], []

    categories = []
    descriptions = []
    for line in topic_text.splitlines():
        # 'Category:' is checked first, as before, so a line carrying both
        # markers is still counted as a category line.
        for marker, bucket in (('Category:', categories), ('Description:', descriptions)):
            _, found, value = line.partition(marker)
            if found:
                # Drop surrounding whitespace and any markdown '*' left by bold labels.
                bucket.append(value.strip().strip('*').strip())
                break

    # Pad the shorter list so the two lists can be paired index by index.
    max_length = max(len(categories), len(descriptions))
    categories += ['No Category'] * (max_length - len(categories))
    descriptions += ['No Description'] * (max_length - len(descriptions))
    return categories, descriptions

In [None]:
# Apply the updated function to each row.
# Each 'topics' value yields a (categories, descriptions) pair; zip(*...) regroups
# those pairs into one sequence per output column.
validation['Categories'], validation['Descriptions']= zip(*validation['topics'].copy().apply(clean_extracted_data))

In [None]:
def _ratio_with_flag(score, count):
    """Return ``(score / count, flagged)``; NaN and flagged when ``count`` is 0.

    ``flagged`` is also True when the ratio falls outside [0, 1], i.e. the summed
    marks do not fit the number of items they are supposed to grade.
    """
    if count == 0:
        return float('nan'), True
    ratio = score / count
    return ratio, not (0 <= ratio <= 1)


def calculate_accuracies_with_percentage(row):
    """Score one validation row and return the accuracies as percentages.

    Category accuracy is the sum of the ``c1``..``c4`` marks divided by the
    number of parsed categories. When ``missing`` holds a value, it is added to
    the marks and the denominator grows by one. Description accuracy is the sum
    of ``d1``..``d4`` divided by the number of parsed descriptions. Empty marks
    are skipped in both sums.

    Args:
        row: A row of the validation table with the mark columns above plus the
            list-valued ``Categories`` and ``Descriptions`` columns.

    Returns:
        A ``pd.Series`` with ``'Category Accuracy (%)'``,
        ``'Description Accuracy (%)'``, ``'Final Score (%)'`` (mean of the two)
        and ``'Data Issue Flag'``. The flag is True when an accuracy cannot be
        computed (nothing was parsed, so the value is NaN) or lies outside
        0-100 %.
    """
    category_columns = ['c1', 'c2', 'c3', 'c4']
    category_length = len(row['Categories'])
    # pd.isna also accepts None, which math.isnan rejects with a TypeError.
    if not pd.isna(row['missing']):
        category_columns.append('missing')
        category_length += 1

    # Rows of a table that also holds text columns arrive with object dtype;
    # convert the marks to numbers before summing so the result is a plain float.
    category_scores = pd.to_numeric(row[category_columns]).sum(skipna=True)
    description_scores = pd.to_numeric(row[['d1', 'd2', 'd3', 'd4']]).sum(skipna=True)
    description_length = len(row['Descriptions'])

    category_accuracy, category_issue = _ratio_with_flag(category_scores, category_length)
    description_accuracy, description_issue = _ratio_with_flag(description_scores, description_length)
    issue_flag = category_issue or description_issue

    # Final score is the average of the two accuracies.
    final_score = (category_accuracy + description_accuracy) / 2

    return pd.Series({
        'Category Accuracy (%)': category_accuracy * 100,
        'Description Accuracy (%)': description_accuracy * 100,
        'Final Score (%)': final_score * 100,
        'Data Issue Flag': issue_flag
    })

In [None]:
# Count how many rows received each final score.
# NOTE(review): `result_with_percentage` is assigned in the cell after this one,
# so that cell has to be run first.
result_with_percentage['Final Score (%)'].value_counts()

Unnamed: 0_level_0,count
Final Score (%),Unnamed: 1_level_1
100.0,50


In [None]:
# Apply the function to the dataset: axis=1 passes one row at a time, and the
# returned Series become the columns of the result.
accuracies_with_percentage = validation.apply(calculate_accuracies_with_percentage, axis=1)

# Combine with the original data (column-wise, aligned on the row index)
result_with_percentage = pd.concat([validation, accuracies_with_percentage], axis=1)
# Display the combined table.
result_with_percentage

In [None]:
# Write the scored validation table without the row index, like the other saves
# in this notebook. Saving the index adds an 'Unnamed: 0' column when the file is
# read back with pd.read_csv.
result_with_percentage.to_csv('correct_labels_cleaned.csv', index=False)