In [None]:
import ast

import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.metrics import mean_squared_error

# Load and prepare the dataset
df = pd.read_csv('reviews.csv')

# Expand the 'ratings' column (one dict literal per row, stored as text) into
# one column per rating aspect.
# SECURITY: this used to call eval() on every cell, which would execute any
# expression placed in the CSV. ast.literal_eval only accepts Python literals
# (dicts, numbers, strings, ...) and raises ValueError on anything else.
json_df = pd.json_normalize(df['ratings'].apply(ast.literal_eval))
df = df.drop(columns=['ratings']).join(json_df)

# Remove unused columns and rows
df.drop(columns=['title', 'author', 'date_stayed', 'num_helpful_votes', 'date', 'id', 'via_mobile'], inplace=True, errors='ignore')
relevant_ratings = ['service', 'cleanliness', 'overall', 'value', 'location', 'sleep_quality', 'rooms']
df.dropna(subset=relevant_ratings, inplace=True)

# Keep only the reviews that did NOT rate the two extra aspects, then drop
# those columns. json_normalize creates a column only when at least one review
# carries that key, so the filter is skipped when the column is absent; this
# matches the errors='ignore' already used when the columns are dropped.
extra_aspects = ['check_in_front_desk', 'business_service_(e_g_internet_access)']
for column in extra_aspects:
    if column in df.columns:
        df = df[df[column].isna()]
df.drop(columns=extra_aspects, inplace=True, errors='ignore')

# Concatenate reviews by 'offering_id': the review texts of a place are joined
# with single spaces and every rating aspect is averaged. The aggregation spec
# is derived from relevant_ratings so the two cannot drift apart.
aggregations = {'text': ' '.join, **dict.fromkeys(relevant_ratings, 'mean')}
df = df.groupby('offering_id').agg(aggregations).reset_index()

# Tokenize the reviews (plain whitespace split; the query must be tokenized
# the same way for its terms to match)
corpus = df['text'].tolist()
tokenized_corpus = [doc.split() for doc in corpus]

# Train the BM25 model; document i of the index corresponds to row i of df
bm25 = BM25Okapi(tokenized_corpus)

# Function to predict the most relevant place
def predict_most_relevant_place(query, bm25, df):
    """Return the row of ``df`` whose document scores highest for ``query``.

    Args:
        query: Search text; it is split on whitespace into tokens.
        bm25: Scorer exposing ``get_scores(tokens)``, which must return one
            score per document in an object that supports ``argmax()``.
        df: DataFrame whose row positions line up with the documents the
            scorer was built from.

    Returns:
        The ``df`` row (via ``iloc``) at the position of the highest score.
        If the scores are a NumPy array, ties go to the earliest position,
        so a query matching nothing yields the first row.
    """
    scores = bm25.get_scores(query.split())
    return df.iloc[scores.argmax()]

# Example query
query = "great service and clean rooms"

# Look up the place whose concatenated reviews score highest for the query
predicted_place = predict_most_relevant_place(query, bm25, df)

# Reference values are the column-wise means over every place in df; the
# "predicted" values are the ratings of the single retrieved place.
actual_ratings = df[relevant_ratings].mean()
predicted_ratings = predicted_place[relevant_ratings]

# Each MSE is taken over one (reference, predicted) pair, so it equals the
# squared difference for that aspect.
mse_scores = {}
for aspect in relevant_ratings:
    mse_scores[aspect] = mean_squared_error(
        [actual_ratings[aspect]],
        [predicted_ratings[aspect]],
    )

# Report the retrieved place, both rating vectors and the per-aspect errors
print("Predicted place:", predicted_place['offering_id'])
print("Actual ratings:", actual_ratings)
print("Predicted ratings:", predicted_ratings)
print("MSE Scores for each rating aspect:")
for aspect, mse in mse_scores.items():
    print(f"{aspect}: {mse}")