In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from textblob import TextBlob
import altair as alt
import json

In [3]:
df = pd.read_json("../data/raw_hotelreviews.json")

In [3]:
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,hotel,name,time,rating,site,trip type,room rating,location rating,service rating,text,hotel reply,reply date,timestamp
0,0,0,Ace Hotel Brooklyn,Shayna Petit,3 months ago,5/5,Google,Solo,5.0,5.0,5.0,"This hotel is truly amazing, from the building...","Hello Shayna, Firstly, thank you very much fo...",2 months ago,2022-04-08 17:26:45.147944
1,1,1,Ace Hotel Brooklyn,Ollie B,2 months ago,5/5,Google,Couple,5.0,5.0,5.0,The staff was terrific!. Geneva and Sergio wer...,"Hello Ollie, If this review did not just make...",2 months ago,2022-04-08 17:26:45.147944
2,2,2,Ace Hotel Brooklyn,Jazmyne G,2 months ago,5/5,Google,Vacation · Couple,5.0,5.0,5.0,Amazing stay for our first evening in Brooklyn...,"Hello Jazmyne, I hope that life is treating y...",2 months ago,2022-04-08 17:26:45.147944
3,3,3,Ace Hotel Brooklyn,Florivette Rosario,4 months ago,5/5,Google,,,,,"What an amazing place! Clean, quiet rooms and ...",Hello Florivette! Thank you very much for you...,4 months ago,2022-04-08 17:26:45.147944
4,4,4,Ace Hotel Brooklyn,Jillian Rubin,4 months ago,5/5,Google,Couple,4.0,4.0,5.0,Huge thank you to Darren at the front desk who...,Hello Jillian! Wow-weeeeee: If this note did ...,4 months ago,2022-04-08 17:26:45.147944


In [4]:
examples = [_ for _ in df['text'] ]

In [6]:
# Filter out records where the "text" column is missing
df_filtered = df.dropna(subset=['text'])

# Skip rows with empty 'text' after dropping NaN values
df_filtered = df_filtered[df_filtered['text'].str.strip() != '']

# Export to .jsonl file
with open('data/output.jsonl', 'w') as outfile:
    for _, row in df_filtered.iterrows():
        json.dump({'text': row['text']}, outfile)
        outfile.write('\n')

## Exploratory Data Analysis 

In [5]:
df.drop(columns=['name','reply date','time','Unnamed: 0', 'hotel reply','timestamp'], inplace=True)
df.head(3)
#dropped hotel reply and name as they contained reviewers names, reply_date, time, timestamp, and unamed: 0 were all unecessary

Unnamed: 0,ID,hotel,rating,site,trip type,room rating,location rating,service rating,text
0,0,Ace Hotel Brooklyn,5/5,Google,Solo,5.0,5.0,5.0,"This hotel is truly amazing, from the building..."
1,1,Ace Hotel Brooklyn,5/5,Google,Couple,5.0,5.0,5.0,The staff was terrific!. Geneva and Sergio wer...
2,2,Ace Hotel Brooklyn,5/5,Google,Vacation · Couple,5.0,5.0,5.0,Amazing stay for our first evening in Brooklyn...


In [6]:
df.shape

(10770, 9)

In [7]:
# Checking for missing values in the 'text' column which are the hotel reviews
missing_reviews = df['text'].isnull().sum()
# Removing any rows where the text column is missing
hotel_reviews_df_cleaned = df.dropna(subset=['text'])
print("Original number of reviews:", df.shape[0])
print("Number of missing reviews:", missing_reviews)
print("Number of reviews after cleaning:", hotel_reviews_df_cleaned.shape[0])

Original number of reviews: 10770
Number of missing reviews: 3725
Number of reviews after cleaning: 7045


In [8]:
hotel_reviews_df_cleaned["trip type"].unique()

array(['Solo', 'Couple', 'Vacation · Couple', None, 'Business',
       'Friends · Vacation', 'Couple · Vacation', 'Vacation', 'Family',
       'Friends', 'Vacation · Solo', 'Vacation · Family',
       'Friends · Business', 'Vacation · Friends', 'Family · Vacation',
       'Vacation · Family · Friends · Couple', 'Business · Solo',
       'Vacation · Family · Business', 'Business · Vacation · Friends',
       'Vacation · Couple · Business',
       'Business · Vacation · Couple · Solo',
       'Vacation · Couple · Friends', 'Friends · Vacation · Couple',
       'Solo · Couple', 'Couple · Business',
       'Couple · Friends · Family · Vacation',
       'Family · Friends · Vacation', 'Business · Solo · Vacation',
       'Vacation · Business · Couple', 'Solo · Vacation'], dtype=object)

In [9]:
# Changing 'NaN' to 'None' for grouping purposes
hotel_reviews_df_cleaned['trip type'] = hotel_reviews_df_cleaned['trip type'].fillna('None')

# Splitting trip type into individual categories and exploding the DataFrame
trip_types_expanded = hotel_reviews_df_cleaned['trip type'].str.split(' · ').explode()

data_expanded = hotel_reviews_df_cleaned.loc[trip_types_expanded.index].copy()
data_expanded['trip type'] = trip_types_expanded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_reviews_df_cleaned['trip type'] = hotel_reviews_df_cleaned['trip type'].fillna('None')


# Feature Engineering

In [10]:
#Converted rating column into numeric value for visualization. '5/5'-> 5.0
data_expanded['rating'] = data_expanded['rating'].apply(lambda x: float(x[0]))
data_expanded.head(3)

Unnamed: 0,ID,hotel,rating,site,trip type,room rating,location rating,service rating,text
0,0,Ace Hotel Brooklyn,5.0,Google,Solo,5.0,5.0,5.0,"This hotel is truly amazing, from the building..."
1,1,Ace Hotel Brooklyn,5.0,Google,Couple,5.0,5.0,5.0,The staff was terrific!. Geneva and Sergio wer...
2,2,Ace Hotel Brooklyn,5.0,Google,Vacation,5.0,5.0,5.0,Amazing stay for our first evening in Brooklyn...


# Exploratory Visualizations

Although this isn't related to text classification, I was curious to see the distribution of ratings for the different categories. Of the three rating categories (Room, Service, Location), service had the highest frequency of 1 star reviews.

In [14]:
room_ratings = data_expanded["room rating"].value_counts().reset_index(name="count")
service_ratings = data_expanded["service rating"].value_counts().reset_index(name="count")
location_ratings = data_expanded["location rating"].value_counts().reset_index(name="count")
ratings_count = data_expanded["rating"].value_counts().reset_index(name="count")

In [18]:
room_chart = alt.Chart(room_ratings).mark_bar().encode(
    x = alt.X("room rating:N", title="Room Rating"),
    y = alt.Y("count:Q", title="Number of Reviews"),
    color = alt.Color("room rating:N", legend=None)
).properties(
    title = "Room Rating Distribution"
)
room_chart

In [19]:
service_chart = alt.Chart(service_ratings).mark_bar().encode(
    x = alt.X("service rating:N", title="Service Rating"),
    y = alt.Y("count:Q", title="Number of Reviews"),
    color = alt.Color("service rating:N", legend=None)
).properties(
    title = "Service Rating Distribution"
)
service_chart

In [20]:
location_chart = alt.Chart(location_ratings).mark_bar().encode(
    x = alt.X("location rating:N", title="Location Rating"),
    y = alt.Y("count:Q", title="Number of Reviews"),
    color = alt.Color("location rating:N", legend=None)
).properties(
    title = "Location Rating Distribution"
)
location_chart

In [21]:
ratings_chart = alt.Chart(ratings_count).mark_bar().encode(
    x = alt.X("rating:N", title="Rating"),
    y = alt.Y("count:Q", title="Number of Reviews"),
    color = alt.Color("rating:N", legend=None)
).properties(
    title = "Overall Rating Distribution"
)
ratings_chart

There is an overwhelming majority of 5 star reviews across all of the different ratings categories. What about the average ratings across the self described trip types?

In [22]:
#Average ratings for room, location, and service by trip type
average_ratings_by_trip_type = data_expanded.groupby('trip type').agg({
    'room rating': 'mean',
    'location rating': 'mean',
    'service rating': 'mean',
    'rating': 'mean'
}).reset_index()

average_ratings_by_trip_type

Unnamed: 0,trip type,room rating,location rating,service rating,rating
0,Business,4.2,4.6,4.0,3.865385
1,Couple,3.467742,4.203125,3.738462,3.318841
2,Family,3.388889,4.315789,4.0,3.190476
3,Friends,4.125,4.516129,4.382353,3.916667
4,,4.307692,4.466667,4.133333,4.378629
5,Solo,4.625,4.941176,4.75,4.428571
6,Vacation,3.715596,4.431193,3.6875,3.530769


"Solo" trip type tend to have the highest average ratings across room, location, and service, while "Family" and "Vacation" trip types have lower averages across all rating categories. The "None" category, representing missing or unspecified trip types, also shows relatively high average ratings. This makes sense as most customers who leave reviews either have very postive or very negative experiences.

In [23]:
testdata = pd.read_json("../data/raw_hotelreviews.json")
testdata.head(2)

Unnamed: 0.1,Unnamed: 0,ID,hotel,name,time,rating,site,trip type,room rating,location rating,service rating,text,hotel reply,reply date,timestamp
0,0,0,Ace Hotel Brooklyn,Shayna Petit,3 months ago,5/5,Google,Solo,5.0,5.0,5.0,"This hotel is truly amazing, from the building...","Hello Shayna, Firstly, thank you very much fo...",2 months ago,2022-04-08 17:26:45.147944
1,1,1,Ace Hotel Brooklyn,Ollie B,2 months ago,5/5,Google,Couple,5.0,5.0,5.0,The staff was terrific!. Geneva and Sergio wer...,"Hello Ollie, If this review did not just make...",2 months ago,2022-04-08 17:26:45.147944
