*This notebook was run locally, because the API key did not work when running on the server.*

# Imports and define OpenAI client

In [1]:
import json
import os

import numpy as np
import pandas as pd
from openai import OpenAI

# Read the API key from the environment so the real secret never has to be
# pasted into (and saved with) the notebook. The original placeholder is kept
# as the fallback value for backward compatibility.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_OPEN_AI_KEY")

# Shared OpenAI client used by the functions defined below.
client = OpenAI(api_key=OPENAI_API_KEY)

# Define Functions

In [10]:
def get_hotel_vector(customer_review):
    """
    Analyzes a hotel review for the presence of predefined topics and returns a binary vector.

    Given a customer review, this function sends the review to an AI model to check for the
    existence of 8 specific topics: cleanliness, comfort, service, food, location, facilities,
    maintenance, and value for money. For each topic mentioned in the review, the vector holds
    1, otherwise 0.

    A default array of eight zeros is returned, without raising, when:
      * the review is missing (None / NaN) or blank -- in that case no API call is made;
      * the model response is not valid JSON, is not a list, does not have exactly 8
        entries, or contains anything other than 0/1 values.

    Args:
        customer_review (str): The hotel review text.

    Returns:
        numpy.ndarray: An 8-element integer array of 0/1 flags, one per topic.
    """
    n_topics = 8
    # Built fresh on every call so callers can never mutate a shared default.
    default_array = np.zeros(n_topics, dtype=int)

    # Missing reviews would otherwise be sent to the model as the literal text
    # "None"/"nan"; skip the request and return the all-zero vector directly.
    if customer_review is None:
        return default_array
    if isinstance(customer_review, (float, np.floating)) and np.isnan(customer_review):
        return default_array
    review_text = str(customer_review)
    if not review_text.strip():
        return default_array

    # Prompt text is kept verbatim (including its whitespace), since it is
    # what the model actually receives.
    system_prompt = """You are an expert in analyzing hotel reviews. 
      You recieve hotel reviews and check for the existence of 8 topics defined as a python list: [cleanliness, comfort, service, food, location, facilities, maintenance, value for money(if something is free or if you have to pay for it)].
        For each topic that is mentioned or appears in any way in the review you output 1, else you output 0. the only end result should be a python list that contains 0 or 1 for each topic. do not output anything other than this list. if you are unsure, output [0,0,0,0,0,0,0,0].
      """

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": review_text},
        ],
    )

    try:
        result_list = json.loads(completion.choices[0].message.content)
    except (json.JSONDecodeError, TypeError, AttributeError, IndexError):
        # Non-JSON text, empty content (None) or an empty `choices` list.
        return default_array

    # The vector is later subtracted element-wise from another one, so a
    # response with the wrong length or non-binary entries must be rejected.
    if not isinstance(result_list, list) or len(result_list) != n_topics:
        return default_array
    if any(flag not in (0, 1) for flag in result_list):
        return default_array

    return np.array(result_list, dtype=int)

def get_reviews_vectors_dataframe(positive_reviews, negative_reviews):
    """
    Converts lists of positive and negative hotel reviews into a dataframe of review vectors.

    For each review in the positive and negative review lists, this function generates a
    vector using the get_hotel_vector function. The i-th positive review is paired with the
    i-th negative review, and their element-wise difference (positive minus negative) is
    stored alongside them. Intermediate results are saved to 'temp_save_pos.csv' (after the
    positive reviews) and 'temp_save.csv' (after the negative reviews).

    Args:
        positive_reviews (list of str): List of positive hotel review texts.
        negative_reviews (list of str): List of negative hotel review texts, same length
            as positive_reviews.

    Returns:
        pandas.DataFrame: A dataframe with columns 'positive_vectors', 'negative_vectors',
        and 'diff_vectores' (spelling kept as-is, since it is the actual column name).

    Raises:
        ValueError: If the two review lists do not have the same length.
    """
    positive_reviews = list(positive_reviews)
    negative_reviews = list(negative_reviews)

    # Fail fast: a length mismatch would otherwise only surface as a pandas
    # ValueError after every review had already been sent to the model.
    if len(positive_reviews) != len(negative_reviews):
        raise ValueError(
            "positive_reviews and negative_reviews must have the same length "
            f"(got {len(positive_reviews)} and {len(negative_reviews)})"
        )

    positive_vectors = [get_hotel_vector(review) for review in positive_reviews]
    print('finished pos')
    # Checkpoint the positive vectors before starting the second batch of requests.
    pd.DataFrame({'positive_vectors': positive_vectors}).to_csv('temp_save_pos.csv')

    negative_vectors = [get_hotel_vector(review) for review in negative_reviews]
    print('finished neg')
    pd.DataFrame(
        {'positive_vectors': positive_vectors, 'negative_vectors': negative_vectors}
    ).to_csv('temp_save.csv')

    # Element-wise difference per review pair: +1 topic only in the positive
    # part, -1 only in the negative part, 0 in both or neither.
    diff_vectors = [
        pos_vec - neg_vec for pos_vec, neg_vec in zip(positive_vectors, negative_vectors)
    ]
    print('finished diff')

    return pd.DataFrame({
        'positive_vectors': positive_vectors,
        'negative_vectors': negative_vectors,
        'diff_vectores': diff_vectors,
    })

# Load output dataframe from 1_sentiment_and_length_ration_analysis.ipynb

In [19]:
# Read the combined reviews CSV into the main dataframe used by the cells below.
df = pd.read_csv('combined_reviews_with_sentiment_scores.csv')
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,ID,positive_review,negative_review,rating,concat_review,positive_score,negative_score,positive_to_negative_sentiment_ratio,is_good_review,positive_review_length,negative_review_length,positive_to_negative_length_ratio,review_length,hotel_positive_vectors,hotel_negative_vectors,hotel_metrices_vector
0,0,2411,Very nice room,Not too much choice for breakfast and the swim...,6.0,Very nice room. Not too much choice for breakf...,0.171,0.0,100.0,0,3,13,0.230769,17,[0 1 0 0 0 0 0 0],[0 0 0 1 0 0 0 1],[ 0 1 0 -1 0 0 0 -1]
1,1,2272,Top facilities,Heating didn't work. It was warmer after I swi...,8.3,Top facilities. Heating didn't work. It was wa...,0.095,0.0,100.0,1,2,50,0.04,53,[0 0 0 0 0 1 0 0],[0 0 0 1 0 0 1 1],[ 0 0 0 -1 0 1 -1 -1]
2,2,2561,,Badly maintained hotel many switches not funct...,6.3,nan. Badly maintained hotel many switches not ...,0.0,0.242,0.0,0,1,18,0.055556,20,[0 0 0 0 0 0 0 0],[0 0 0 0 0 0 1 1],[ 0 0 0 0 0 0 -1 -1]
3,3,1321,"Beautiful little balconette, gorgeous bathroom...","A few little stains, no coffee machine and emp...",8.0,"Beautiful little balconette, gorgeous bathroom...",0.355,0.116,3.060345,1,15,13,1.153846,28,[1 0 1 0 0 0 0 0],[1 0 0 0 0 0 1 0],[ 0 0 1 0 0 0 -1 0]
4,4,1639,"Everything was good, location was good for sho...",Everything was comforting,10.0,"Everything was good, location was good for sho...",0.451,0.0,100.0,1,25,3,8.333333,28,[0 0 0 0 1 0 0 0],[0 1 0 0 0 0 0 0],[ 0 -1 0 0 1 0 0 0]


# Create a category vector for each review using OpenAI ChatGPT (WARNING: this might take some time):

In [15]:
# Build the topic vectors for every review pair; get_hotel_vector (and thus the
# OpenAI API) is called once per positive and once per negative review text.
results_df = get_reviews_vectors_dataframe(positive_reviews=df['positive_review'].to_list(), negative_reviews=df['negative_review'].to_list())
# Preview the first rows of the generated vectors.
results_df.head()

finished pos
finished neg
finished diff


Unnamed: 0,positive_vectors,negative_vectors,diff_vectores
0,"[0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 1]","[0, 1, 0, -1, 0, 0, 0, -1]"
1,"[0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 1, 0, 0, 1, 1]","[0, 0, 0, -1, 0, 1, -1, -1]"
2,"[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 1, 1]","[0, 0, 0, 0, 0, 0, -1, -1]"
3,"[1, 0, 1, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 0, 0, -1, 0]"
4,"[0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0]","[0, -1, 0, 0, 1, 0, 0, 0]"


# Add the results to the main dataframe and save it:

In [17]:
# Store the three columns of results_df in the main dataframe under hotel_* names.
# NOTE(review): this presumably pairs columns by position (positive, negative, diff)
# rather than by name -- confirm against the pandas version in use.
df[['hotel_positive_vectors', 'hotel_negative_vectors', 'hotel_metrices_vector']] = results_df
# Save the enriched dataframe.
# NOTE(review): to_csv writes the index by default, which is the likely source of the
# 'Unnamed: 0' columns seen when these files are read back -- consider index=False
# if nothing downstream relies on that column.
df.to_csv('combined_reviews_with_sentiment_scores_all.csv')

In [18]:
# Display the final dataframe (the rendered output elides the middle rows).
df

Unnamed: 0,ID,positive_review,negative_review,rating,concat_review,positive_score,negative_score,positive_to_negative_sentiment_ratio,is_good_review,positive_review_length,negative_review_length,positive_to_negative_length_ratio,review_length,hotel_positive_vectors,hotel_negative_vectors,hotel_metrices_vector
0,2411,Very nice room,Not too much choice for breakfast and the swim...,6.0,Very nice room. Not too much choice for breakf...,0.171,0.000,100.000000,0,3,13,0.230769,17,"[0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 1]","[0, 1, 0, -1, 0, 0, 0, -1]"
1,2272,Top facilities,Heating didn't work. It was warmer after I swi...,8.3,Top facilities. Heating didn't work. It was wa...,0.095,0.000,100.000000,1,2,50,0.040000,53,"[0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 1, 0, 0, 1, 1]","[0, 0, 0, -1, 0, 1, -1, -1]"
2,2561,,Badly maintained hotel many switches not funct...,6.3,nan. Badly maintained hotel many switches not ...,0.000,0.242,0.000000,0,1,18,0.055556,20,"[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 1, 1]","[0, 0, 0, 0, 0, 0, -1, -1]"
3,1321,"Beautiful little balconette, gorgeous bathroom...","A few little stains, no coffee machine and emp...",8.0,"Beautiful little balconette, gorgeous bathroom...",0.355,0.116,3.060345,1,15,13,1.153846,28,"[1, 0, 1, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 0, 0, -1, 0]"
4,1639,"Everything was good, location was good for sho...",Everything was comforting,10.0,"Everything was good, location was good for sho...",0.451,0.000,100.000000,1,25,3,8.333333,28,"[0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0]","[0, -1, 0, 0, 1, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7471,23647,"Location, room size, parking, staff",Noise from the highway with windows open.,9.0,"Location, room size, parking, staff. Noise fro...",0.000,0.000,0.000000,1,8,8,1.000000,17,"[0, 1, 1, 1, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 1, 1, 1, 0, 0, 0]"
7472,23652,The Locations is good.\nGood breakfast in COVI...,Very small room for a twin bed room.\nPoor ref...,6.0,The Locations is good.\nGood breakfast in COVI...,0.221,0.168,1.315476,0,12,23,0.521739,36,"[0, 0, 1, 1, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 1, 1, 1, 0, -1, 0]"
7473,24232,Parking space outside the hotel,"When I arrived to check in nobody was there, I...",4.6,Parking space outside the hotel. When I arrive...,0.000,0.126,0.000000,0,5,64,0.078125,70,"[0, 0, 0, 0, 0, 0, 0, 1]","[1, 0, 1, 0, 0, 0, 1, 0]","[-1, 0, -1, 0, 0, 0, -1, 1]"
7474,23728,The staff were polite and proffesional.,The breakfast could be more fresh and should c...,7.0,The staff were polite and proffesional.. The b...,0.166,0.143,1.160839,0,7,65,0.107692,72,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 1, 0, 1, 0, 0, 0, 0]","[0, -1, 1, -1, 0, 0, 0, 0]"
