In [231]:
import pandas as pd
import seaborn as sns
import os
from openai import OpenAI
import configurations as c

In [232]:
# Load the raw reviews and normalise column types in one chain:
#   - Branch / Reviewer_Location become categoricals,
#   - Year_Month is parsed as "%Y-%m"; values that do not match become NaT,
#   - Reviewer_Location is exposed under the shorter name "Location".
df = (
    pd.read_csv("data/DisneylandReviews.csv", encoding='latin1')
    .astype({"Branch": "category", "Reviewer_Location": "category"})
    .assign(
        Year_Month=lambda frame: pd.to_datetime(
            frame["Year_Month"], format="%Y-%m", errors="coerce"
        )
    )
    .rename(columns={"Reviewer_Location": "Location"})
)

In [233]:
# Draw a fixed-size random subset of the reviews; the seed keeps the draw
# identical across kernel restarts.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 1
df_sample = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_sample

Unnamed: 0,Review_ID,Rating,Year_Month,Location,Review_Text,Branch
12008,531138321,5,2017-09-01,United Kingdom,We visited for Mickey;s Not So Scary Halloween...,Disneyland_California
42394,25004598,1,NaT,United States,We arrived on 15th February 2009 and were due ...,Disneyland_Paris
24748,155279085,5,2012-12-01,United States,Nothing really new in the Magic Kingdom from t...,Disneyland_California
42609,3460644,1,NaT,United States,My husband and I recently returned from a long...,Disneyland_Paris
10719,594693531,5,2018-07-01,United States,"You will definitely want to plan out your day,...",Disneyland_California
...,...,...,...,...,...,...
18743,283963913,5,2015-06-01,United States,The park was awesome and all of the 60th decor...,Disneyland_California
34250,362480264,4,2016-04-01,United Kingdom,What more can I say my 2 kids 4 and 6 had an a...,Disneyland_Paris
34894,330029779,5,2015-11-01,New Zealand,Cannot fault the visit. The magic totally aliv...,Disneyland_Paris
38297,204781099,4,2014-05-01,United Kingdom,We Went to disneyland from Friday 2nd Monday 5...,Disneyland_Paris


In [234]:
from pydantic import BaseModel
from typing import List, Optional

# One coded observation extracted from a review. Instances appear in
# ReviewAnalysis.coded_elements. All fields are plain strings, so the value
# conventions requested in prompt() are NOT enforced by this model.
class CodedElement(BaseModel):
    # Guest-journey touchpoint; prompt() asks for one of its listed lowercase categories.
    touchpoint: str
    sentiment: str # "positive" or "negative"
    # Short descriptive label of the satisfaction driver (prompt() asks for 3-5 words).
    code: str
    # Quote from the review supporting the code (prompt() asks for it verbatim).
    text_excerpt: str

# Optional visit context captured alongside the coded elements. Every field
# defaults to None, so a reply that omits one of them still validates.
class DemographicInfo(BaseModel):
    # Travel party composition; prompt() gives family, couple, solo, friends as examples.
    travel_party: Optional[str] = None
    first_visit: Optional[str] = None # "Yes", "No", or "Unknown"
    visit_timing: Optional[str] = None # Season, holiday, time of day, day of week

# Top-level structured result for a single review. Passed to the LLM clients as
# the response schema/format and used by load_json_to_dataframe() to validate
# each parsed reply.
class ReviewAnalysis(BaseModel):
    # Review identifier, typed as a string.
    review_id: str
    # All coded observations found in the review.
    coded_elements: List[CodedElement]
    # Visit context for the review; required, though its own fields are optional.
    demographic_info: DemographicInfo

In [235]:
# System prompt sent to the LLM. Kept as an explicit, unindented string so the
# exact text does not depend on how the interpreter stores docstrings.
SYSTEM_PROMPT = """You are tasked with analyzing Disneyland reviews to identify the primary drivers of guest satisfaction across the complete guest journey.

For each review, identify specific mentions that affected the guest experience at different touchpoints, marking them as either positive or negative.

GUEST JOURNEY TOUCHPOINTS:
Use ONLY the following touchpoint categories (do not create your own) - all in lowercase:
- pre-visit (use for planning, booking, website experience, app usage before arrival)
- entry/admission (use for parking, tickets, entry gates, security screening, arrival experience)
- attractions (use for rides, interactive exhibits, wait times, ride operations)
- entertainment (use for parades, shows, fireworks, street performers)
- characters (use for character meet-and-greets, character interactions, photo opportunities)
- staff (use for all employee interactions including ride operators, food service staff, retail staff)
- food/beverage (use for restaurants, snack stands, food quality, dining experience)
- retail (use for shopping experiences, merchandise, souvenirs)
- facilities (use for restrooms, baby care, first aid, accessibility features)
- cleanliness (use for park maintenance, trash management, overall park cleanliness)
- navigation (use for park layout, wayfinding, walking experience, crowding)
- value (use for price-related comments, perceived worth, packages, discounts)
- atmosphere (use for theming, ambiance, music, decorations, overall feel)
- timing (use ONLY when time of visit directly impacts satisfaction, such as "weekday visits are better" or "morning hours have shorter lines")
- post-visit (use for follow-up, memories, overall reflections after leaving)
- comparison (use when comparing to other Disney parks or similar attractions)

DEMOGRAPHIC INFO:
For demographic information, capture the following when mentioned (even if just mentioned factually without impact on satisfaction):
- Travel party composition (family, couple, solo, friends)
- First visit status (yes/no)
- Visit timing (season, holiday, time of day, day of week)

QUEUE-RELATED CODING GUIDELINES:
For any mention of queues, waiting times, lines, or crowding across ANY touchpoint:
1. Always include one of these terms in your code: "queue", "wait time", "line", or "crowding"
2. Be specific about which service the queue is for (e.g., "short ride queues" rather than just "short queues")
3. Place the queue code under the most relevant touchpoint:
   - attractions: for ride queues, virtual queues, FastPass systems
   - entry/admission: for entry gates, ticket booths, security screening
   - food/beverage: for restaurant waiting, ordering lines, pickup queues
   - entertainment: for show seating, parade viewing spots
   - characters: for character meet-and-greet lines
   - facilities: for restroom queues
   - navigation: for general crowding or movement flow issues

CODES:
Your codes must identify the specific driver of satisfaction or dissatisfaction, capturing the exact aspect that influenced the experience.
Examples of good codes:
- "efficient online ticket purchase" (pre-visit)
- "short security line wait" (entry/admission)
- "minimal ride queue times" (attractions)
- "convenient virtual queue system" (attractions)
- "manageable character line wait" (characters)
- "knowledgeable staff recommendations" (staff)
- "quick food service lines" (food/beverage)
- "reasonable restroom wait times" (facilities)
- "excessive parade viewing crowding" (entertainment)
- "low general crowd density" (navigation)
- "advantageous weekday crowds" (timing)

IMPORTANT DISTINCTION:
- Only code "timing" as a touchpoint when it DIRECTLY impacts satisfaction (e.g., "Going on weekdays was great because there were no lines!")
- When timing is just mentioned factually without directly affecting satisfaction (e.g., "We visited in December"), capture it in demographic_info.visit_timing

For each coded element, provide:
- The specific touchpoint category (from the list above, in lowercase)
- Sentiment as "positive" or "negative" (not "+" or "-")
- A specific descriptive code (3-5 words that precisely identify the satisfaction driver)
- The exact text excerpt from the review that supports this code (do not alter the quote)

Structure your output according to the provided schema.
Here is the review:"""


def prompt():
    """Return the system prompt that tells the LLM how to code a review.

    The text comes from the ``SYSTEM_PROMPT`` constant instead of being read
    back from this function's own ``__doc__``. A docstring is not a stable
    place for runtime data: it is dropped under ``python -OO`` (the old
    implementation would then return ``None``), and Python 3.13+ strips its
    leading indentation while earlier versions keep it, so the model used to
    receive interpreter-dependent text.

    Returns:
        str: The instruction text, without leading indentation or surrounding
        blank lines, ending with "Here is the review:".
    """
    return SYSTEM_PROMPT

In [236]:
# Gemini backend: run this cell if you want chatgpt() to call Google's Gemini.
from google import genai

# The API key is read from the local `configurations` module (imported as `c`).
secret = c.GEMINI_API_KEY
# Module-level client used by chatgpt() below. The OpenAI cell assigns the same
# `secret` and `client` names, so whichever cell ran last is the active backend.
client = genai.Client(api_key = secret)
def chatgpt(input):
    """Send one formatted review to Gemini and return the reply text.

    Despite its name, this variant calls the module-level Gemini ``client``.
    The request asks for a JSON reply shaped by ``ReviewAnalysis`` and uses
    ``prompt()`` as the system instruction.

    Args:
        input: The review message to analyze. The name shadows the builtin
            ``input`` inside this function; it is kept because it is part of
            the function's signature.

    Returns:
        The ``text`` attribute of the Gemini response.
    """
    generation_config = dict(
        response_mime_type='application/json',
        response_schema=ReviewAnalysis,
        system_instruction=prompt(),
    )
    reply = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=input,
        config=generation_config,
    )
    return reply.text

In [None]:
# OpenAI backend: run this cell if you want chatgpt() to call OpenAI instead.
# The API key is read from the local `configurations` module (imported as `c`).
secret = c.OPENAI_API_KEY
# Rebinds the module-level `client` (the Gemini cell assigns the same name), so
# whichever backend cell ran last is the one chatgpt() talks to.
client = OpenAI(api_key = secret)
def chatgpt(input_text: str) -> str:
    """Send one formatted review to OpenAI and return the reply as JSON text.

    This is the OpenAI counterpart of the Gemini-backed ``chatgpt``. It now
    returns the JSON string of the reply, like that variant does, so the
    result can be passed to ``load_json_to_dataframe`` (which calls
    ``json.loads`` on every collected response). The previous version
    returned the whole message object, which matched neither its own return
    annotation nor what the downstream parser accepts.

    Args:
        input_text: The review message to analyze; sent as the user message,
            with ``prompt()`` as the system message.

    Returns:
        The JSON text of the first choice's message.

    Raises:
        ValueError: If the message carries a refusal instead of content, so
            the failure is reported here rather than as an opaque error when
            the reply is parsed later.
    """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": prompt()},
            {"role": "user", "content": input_text},
        ],
        response_format=ReviewAnalysis,
    )
    message = completion.choices[0].message

    # A refused request has no usable JSON content; fail loudly with the reason.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"OpenAI refused to analyze the review: {refusal}")

    return message.content

In [255]:
# Send one batch of sampled reviews to the LLM and collect the raw replies.
# BATCH_START / BATCH_END are positional bounds into df_sample (end exclusive).
BATCH_START, BATCH_END = 800, 1000

llm_output = []
for i in range(BATCH_START, BATCH_END):
    row = df_sample.iloc[i]  # look the row up once instead of once per field
    # Single-quoted keys keep this f-string valid on Python versions before 3.12.
    # The message is bound to `review` only: also assigning it to `input` would
    # overwrite the builtin input() for the rest of the kernel session.
    review = (
        f"ID: {row['Review_ID']}, Review Text:{row['Review_Text']}, "
        f"Where the reviewer is from: {row['Location']}, Disney Branch: {row['Branch']}"
    )
    response = chatgpt(review)
    print(response)
    print(f"{i}TH REVIEW")
    llm_output.append(response)

{
  "review_id": "74875940",
  "coded_elements": [
    {
      "touchpoint": "value",
      "sentiment": "negative",
      "code": "expensive overall experience",
      "text_excerpt": "It was so expensive"
    },
    {
      "touchpoint": "attractions",
      "sentiment": "negative",
      "code": "excessive ride queue times",
      "text_excerpt": "the lines are too long."
    },
    {
      "touchpoint": "attractions",
      "sentiment": "negative",
      "code": "long ride wait time",
      "text_excerpt": "The wiat time for a ride that was like 1 minute long, was at least 70 minutes"
    },
    {
      "touchpoint": "value",
      "sentiment": "negative",
      "code": "questionable cost worth",
      "text_excerpt": "Was it worth the cost? I doubt it."
    },
    {
      "touchpoint": "staff",
      "sentiment": "positive",
      "code": "staff trying their best",
      "text_excerpt": "The staff try their best I guess."
    },
    {
      "touchpoint": "value",
      "sentiment"

In [256]:
import json
import pandas as pd

def load_json_to_dataframe(returned_responses):
    """Flatten LLM JSON replies into a DataFrame with one row per coded element.

    Each reply is parsed with ``json.loads`` and validated with
    ``ReviewAnalysis.model_validate``. A reply may hold a single review object
    or a list of them. The review's id and demographic fields are repeated on
    every row produced from that review; a review with no coded elements
    contributes no rows.

    Args:
        returned_responses: An iterable of JSON payloads, or a single payload
            given as ``str``/``bytes``/``bytearray``.

    Returns:
        pandas.DataFrame with the columns ``review_id``, ``touchpoint``,
        ``sentiment``, ``code``, ``text_excerpt``, ``travel_party``,
        ``first_visit`` and ``visit_timing``. The columns are present even
        when there are no rows, so an empty batch still has a usable header.

    Raises:
        Whatever ``json.loads`` or ``ReviewAnalysis.model_validate`` raise for
        a malformed or schema-violating reply; such errors are not swallowed.
    """
    columns = [
        'review_id',
        'touchpoint',
        'sentiment',
        'code',
        'text_excerpt',
        'travel_party',
        'first_visit',
        'visit_timing',
    ]

    # A lone payload is treated as a batch of one.
    if isinstance(returned_responses, (str, bytes, bytearray)):
        returned_responses = [returned_responses]

    rows = []
    for returned_response in returned_responses:
        payload = json.loads(returned_response)
        # Normalise to a list so single reviews and lists share one code path.
        items = payload if isinstance(payload, list) else [payload]

        for item in items:
            review = ReviewAnalysis.model_validate(item)
            demographics = review.demographic_info
            rows.extend(
                {
                    'review_id': review.review_id,
                    'touchpoint': element.touchpoint,
                    'sentiment': element.sentiment,
                    'code': element.code,
                    'text_excerpt': element.text_excerpt,
                    'travel_party': demographics.travel_party,
                    'first_visit': demographics.first_visit,
                    'visit_timing': demographics.visit_timing,
                }
                for element in review.coded_elements
            )

    # Passing the column list keeps the schema stable for empty results too.
    return pd.DataFrame(rows, columns=columns)

In [257]:
# Flatten the collected LLM replies into one row per coded element.
n800to1000 = load_json_to_dataframe(llm_output)

In [258]:
# Persist this batch's coded rows; index=False keeps the DataFrame index out of the CSV.
n800to1000.to_csv("data/n800to1000.csv", index=False)