<img src="https://www.webintravel.com/wp-content/uploads/2019/04/GettyImages-802970402.jpg" alt="car price prediction" style="width:50%;"/>

# Import libraries

In [1]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
import collections
import re

# Text analysis libraries
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('vader_lexicon')
nltk.download('stopwords')
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Encoding libraries
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

# Visualisation libraries
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

# Dataset split tool
from sklearn.model_selection import train_test_split

# Value assessment libraries
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

import requests
from bs4 import BeautifulSoup as bs

# Machine learning libraries:
from sklearn.ensemble import RandomForestRegressor # model creation and training tool
from sklearn import metrics # tools for the model accuracy evaluation

# Input data files are available in the read-only "../input/" directory

# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mike/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Single seed shared by every step that accepts one (train/validation split,
# random forest), so that the experiments are reproducible
RANDOM_SEED = 42

In [None]:
# Save the packages version so the experiments are reproducible
!pip freeze > requirements.txt

# Load the data

In [5]:
# Load the data
#
# The first directory that exists wins, so the same cell runs unchanged on
# Kaggle and locally. Set the HOTELS_DATA_DIR environment variable to point at
# any other location.
_DEFAULT_DATA_DIR = '/home/mike/Documents/Coding/Data/Booking reviews/'
_DATA_DIR_CANDIDATES = [
    os.environ.get('HOTELS_DATA_DIR'),
    '/kaggle/input/sf-booking/',
    _DEFAULT_DATA_DIR,
]

# Fall back to the original local path when nothing exists, so a missing
# dataset still fails with a FileNotFoundError naming that path.
DATA_DIR = next(
    (path for path in _DATA_DIR_CANDIDATES if path and os.path.isdir(path)),
    _DEFAULT_DATA_DIR,
)

# os.path.join copes with DATA_DIR given with or without a trailing slash
df_train = pd.read_csv(os.path.join(DATA_DIR, 'hotels_train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'hotels_test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'submission.csv'))

In [None]:
# Merge the train and test dataframes so every feature is engineered once for both

df_train['sample'] = 1           # Mark the train lines
df_test['sample'] = 0            # Mark the test lines
df_test['reviewer_score'] = 0    # There is no reviewer_score yet but we will fill it with zeros for now and predict later

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps the same row/column order.
hotels = pd.concat([df_test, df_train], sort=False).reset_index(drop=True)  # Merging

In [None]:
hotels.info()

In [None]:
hotels.head(3)

# EDA

## Duplicates and Nulls

In [None]:
# Count the rows that are exact copies of an earlier row

duplicates = hotels.loc[hotels.duplicated()]
print('Duplicates number: {}'.format(len(duplicates)))

# The number of duplicates is insignificant

In [None]:
# Show where values are missing, then list the affected columns with their counts

null_mask = hotels.isnull()
sns.heatmap(null_mask)

null_data = null_mask.sum()
display(null_data.loc[null_data.gt(0)])

# We will deal with the nulls in 'lat' and 'lng' later.

## Dates and time

In [None]:
# Derive the review year, month, age in days, and day of week from the review date

review_dates = pd.to_datetime(hotels['review_date'], format='%m/%d/%Y')
latest_review = review_dates.max()

hotels = hotels.assign(
    review_year=review_dates.dt.year.astype(int),
    review_month=review_dates.dt.month.astype(int),
    # whole days between each review and the most recent review in the data
    days_since_review=((latest_review - review_dates) / np.timedelta64(1, 'D')).astype(int),
    review_day_of_week=review_dates.dt.dayofweek.astype(int),
).drop(columns=['review_date'])

In [None]:
# The get_weekend(weekday) function takes the elements of the weekday column and
# returns 1 if the day falls on a weekend and 0 if it does not.

def get_weekend(weekday):
    """Return 1 when `weekday` is 5 or 6 (Saturday/Sunday in pandas' dayofweek), else 0."""
    return 1 if weekday in (5, 6) else 0

# Flag weekend reviews; the raw day-of-week number is not kept as a feature
hotels['weekend'] = hotels['review_day_of_week'].map(get_weekend)

hotels = hotels.drop(columns=['review_day_of_week'])

## Geographical locations

In [None]:
# Extract the city, country, and zip code from the hotel address

def _parse_hotel_address(address):
    """Return (city, country, zip) taken from the trailing words of `address`."""
    words = address.split()
    if address.endswith('United Kingdom'):
        # two-word country name; every UK address is labelled London; two-part postcode
        return 'London', 'United Kingdom', words[-4] + ' ' + words[-3]
    if address.endswith('Netherlands'):
        # two-part postcode in front of the city
        return words[-2], words[-1], words[-4] + ' ' + words[-3]
    return words[-2], words[-1], words[-3]

_address_parts = hotels['hotel_address'].map(_parse_hotel_address)
hotels['city'] = _address_parts.str[0]
hotels['country'] = _address_parts.str[1]
hotels['zip'] = _address_parts.str[2]

In [None]:
# Display where the hotels are located

# create a dataframe with the names and coordinates of the hotels
# One row per distinct hotel location is enough for the map; plotting every
# review would draw the same marker many times over.
coordinates = hotels[['hotel_name', 'lat', 'lng']].drop_duplicates()

# create a map
# The Mapbox token is read from the environment instead of being committed to
# the notebook. Without a token the map falls back to OpenStreetMap tiles,
# which do not need one.
# NOTE(review): the token that used to be hard-coded here should be rotated.
mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')
if mapbox_token:
    px.set_mapbox_access_token(mapbox_token)

fig = px.scatter_mapbox(
    coordinates,
    lat="lat", lon="lng",
    hover_name="hotel_name",
    size_max=15, zoom=3.5,
    mapbox_style=None if mapbox_token else "open-street-map",
)
fig.update_layout(
    title={'text': "The map of hotel locations", 'x':0.5}
)
fig.show()

In [None]:
# Leave the generalised zip codes for London

def london_zip_optimiser(zip_code):
    """Return the part of `zip_code` that precedes its first space.

    A code without a space is returned unchanged.
    """
    leading_part, _, _ = zip_code.partition(' ')
    return leading_part

# Keep only the first part of every zip code
hotels['zip'] = hotels['zip'].map(london_zip_optimiser)


# group zip codes in the City zones
def london_zip_group(zip_code):
    """Collapse central-London postcode districts into their zone prefix.

    Codes starting with 'WC' or 'EC' map to that prefix. Codes in the 'SW1' and
    'W1' districts (e.g. 'SW1A', 'W1K') map to 'SW1' / 'W1'. Every other code is
    returned unchanged.

    The negative lookahead keeps two-digit districts such as 'W10'-'W14' and
    'SW10'-'SW19' out of the 'W1' / 'SW1' groups; a bare prefix test would
    merge them with the central districts.
    """
    central_zone = re.match(r'WC|EC|(?:SW1|W1)(?!\d)', zip_code)
    if central_zone:
        return central_zone.group(0)
    return zip_code

# Replace the matching London codes with their zone prefix
hotels['zip'] = hotels['zip'].map(london_zip_group)

In [None]:
# Mark missing coordinates with 0 so that the next cells can detect and replace them
hotels[['lat', 'lng']] = hotels[['lat', 'lng']].fillna(0)

In [None]:
# Fill up the empty lat values with the relevant city coordinates

def lat_fillna(x):
    """Return the latitude for one (lat, city) row, filling placeholders.

    Parameters
    ----------
    x : sequence of two items, (lat, city)
        A DataFrame row (Series), tuple or list. A `lat` of 0 is treated as
        "missing".

    Returns
    -------
    The city-centre latitude when `lat` is 0 and the city is Paris, Vienna or
    Barcelona; otherwise `lat` unchanged.
    """
    # Unpacking iterates over the values, so it works for tuples and for Series
    # alike and avoids the deprecated positional `x[0]` lookup on a
    # label-indexed Series.
    lat, city = x
    city_latitudes = {'Paris': 48.8566, 'Vienna': 48.2082, 'Barcelona': 41.3874}
    if lat == 0 and city in city_latitudes:
        return city_latitudes[city]
    return lat
    
# Evaluate lat_fillna on every (lat, city) pair, in row order
hotels['lat'] = [
    lat_fillna(lat_and_city)
    for lat_and_city in hotels[['lat', 'city']].itertuples(index=False, name=None)
]

In [None]:
# Fill up the empty lng values with the relevant city coordinates

def lng_fillna(x):
    """Return the longitude for one (lng, city) row, filling placeholders.

    Parameters
    ----------
    x : sequence of two items, (lng, city)
        A DataFrame row (Series), tuple or list. A `lng` of 0 is treated as
        "missing".

    Returns
    -------
    The city-centre longitude when `lng` is 0 and the city is Paris, Vienna or
    Barcelona; otherwise `lng` unchanged.
    """
    # Unpacking iterates over the values, so it works for tuples and for Series
    # alike and avoids the deprecated positional `x[0]` lookup on a
    # label-indexed Series.
    lng, city = x
    city_longitudes = {'Paris': 2.3522, 'Vienna': 16.3738, 'Barcelona': 2.1686}
    if lng == 0 and city in city_longitudes:
        return city_longitudes[city]
    return lng
    
# Evaluate lng_fillna on every (lng, city) pair, in row order
hotels['lng'] = [
    lng_fillna(lng_and_city)
    for lng_and_city in hotels[['lng', 'city']].itertuples(index=False, name=None)
]

In [None]:
# The Number of Bed-nights in the Cities in 2016 (e.g. popularity)
# Source: European Cities Marketing Benchmarking Report 2017- https://bit.ly/377Z899

bednights = dict(
    Paris=44016074,
    London=75069660,
    Milan=11257872,
    Vienna=15760254,
    Barcelona=19162580,
    Amsterdam=13834000,
)

# Cities absent from the table get NaN
hotels['city_bednights'] = hotels['city'].map(bednights)


In [None]:
# Keep the 30 most frequent reviewer nationalities; everything else becomes ' Other'

top_reviewer_nationality = hotels['reviewer_nationality'].value_counts().index[:30].tolist()
print(top_reviewer_nationality)
is_top_nationality = hotels['reviewer_nationality'].isin(top_reviewer_nationality)
hotels['reviewer_nationality'] = hotels['reviewer_nationality'].where(is_top_nationality, ' Other')

In [None]:
# Give every remaining reviewer nationality an integer code

nationality_encoder = LabelEncoder()
hotels['reviewer_nationality_encoded'] = nationality_encoder.fit_transform(hotels['reviewer_nationality'])

In [None]:
# Convert the hotel countries into numeric values identical to review nationalities

def encode_some_countries(country):
    """Return the hard-coded integer code for a hotel country.

    Countries outside the table yield None.
    """
    country_codes = {
        'Austria': 1,
        'France': 5,
        'Italy': 12,
        'Netherlands': 14,
        'Spain': 23,
        'United Kingdom': 28,
    }
    return country_codes.get(country)

# Look up the numeric code of each hotel's country
hotels['hotel_country_encoded'] = hotels['country'].map(encode_some_countries)

In [None]:
# Create a new feature 'is_domestic' showing if the reviewer is from the same country as the hotel

# Compare the country names directly. Going through the integer codes only
# works while the hand-copied hotel-country codes happen to match what
# LabelEncoder assigned, and that assignment depends on which nationalities are
# present in the data. Nationalities are stripped before comparing because the
# ' Other' placeholder used in this notebook carries a leading space.
reviewer_country = hotels['reviewer_nationality'].str.strip()
hotels['is_domestic'] = (reviewer_country == hotels['country']).astype(int)


# drop the unneeded columns

hotels = hotels.drop(['hotel_address', 'reviewer_nationality', 'lng', 'lat', 'hotel_country_encoded', 'country'], axis=1)

## Scores

In [None]:
# Have a look at the average score stats

hotels['average_score'].describe()

In [None]:
# Plot the distribution of the hotels' average scores

average_score_plot = dict(
    x='average_score',
    title='Average score distribution',
    histnorm='percent',
    width=500,
    marginal='box',  # additional graph
)
fig = px.histogram(hotels, **average_score_plot)
fig.show()

In [None]:
# The 5.2 Score looks like an outlier. Let's upgrade it to 6.4
# (every average score below 6 is replaced)

hotels['average_score'] = hotels['average_score'].mask(hotels['average_score'] < 6, 6.4)

## Tags

In [None]:
# Extract traveller types from the tags

def def_traveller(tag):
    """Return the second comma-separated item of `tag`, or None if there is none.

    Single quotes are removed, then one leading and one trailing whitespace
    character are trimmed.
    """
    items = tag.split(',')
    if len(items) < 2:
        return None
    unquoted = items[1].replace("'", '')
    return re.sub(r'^\s|\s$', '', unquoted)

# Read the traveller type out of every tag string
hotels['traveller_type'] = hotels['tags'].map(def_traveller)

In [None]:
# Keep the 7 most frequent traveller types and label everything else ' Other'

popular_traveller_type = hotels['traveller_type'].value_counts().nlargest(7).index
print(popular_traveller_type)
is_popular_traveller = hotels['traveller_type'].isin(popular_traveller_type)
hotels['traveller_type'] = hotels['traveller_type'].where(is_popular_traveller, ' Other')

In [None]:
# Extract trip types from the tags

# (substring to look for, label to return) - checked in this order, first hit wins
_TRIP_TYPE_RULES = (
    ('Leisure trip', 'Leisure trip'),
    ('Business trip', 'Business trip'),
    ('Couple', 'Couple'),
    ('Solo', 'Solo traveler'),
    ('Family', 'Family'),
    ('Group', 'Group'),
    ('pet', 'With a pet'),
    ('friends', 'Group'),
)

def def_trip_type(tag):
    """Classify the first ', '-separated item of `tag` into a trip type label.

    Returns ' Other' when the item contains none of the known substrings.
    """
    first_item = tag.split(', ')[0]
    for needle, label in _TRIP_TYPE_RULES:
        if needle in first_item:
            return label
    return ' Other'

# Classify the trip type of every review from its tags
hotels['trip_type'] = hotels['tags'].map(def_trip_type)

In [None]:
# Extract the length of the stay

def get_stay_length(arg):
    """Return the number that follows 'Stayed ' in `arg`.

    Returns 0 unless exactly one such number is found.
    """
    stays = re.findall(r'(?<=Stayed )\d+', arg)
    return int(stays[0]) if len(stays) == 1 else 0

# Read the number of nights out of every tag string
hotels['stay_length'] = hotels['tags'].map(get_stay_length)

In [None]:
# Plot the distribution of the length of stay

stay_length_plot = dict(
    x='stay_length',
    title='Length of stay distribution',
    histnorm='percent',
    width=500,
    marginal='box',  # additional graph
)
fig = px.histogram(hotels, **stay_length_plot)
fig.show()

In [None]:
# Record every stay longer than 7 nights as 8

hotels['stay_length'] = hotels['stay_length'].clip(upper=8)

In [None]:
# Determine if the review was left from a mobile phone

def mob_device(tag):
    """Return 1 if the last comma-separated item of `tag` contains 'submitted'
    (case-insensitive), otherwise 0."""
    last_item = tag.rsplit(',', 1)[-1]
    return int('submitted' in last_item.lower())

# Flag the reviews whose last tag mentions a submission
hotels['from_mobile'] = hotels['tags'].map(mob_device)

In [None]:
# Determine the type of rooms based on tags

def room_type(tag):
    """Return the third comma-separated item of `tag`, or None if there is none.

    Single quotes are removed, then one leading and one trailing whitespace
    character are trimmed.
    """
    items = tag.split(',')
    if len(items) < 3:
        return None
    unquoted = items[2].replace("'", '')
    return re.sub(r'^\s|\s$', '', unquoted)

# Read the raw room description out of every tag string
hotels['room_type'] = hotels['tags'].map(room_type)

# Optimise room type value
# Group the most common types of rooms

# (word to look for, label to return) - checked in this order, first hit wins
_ROOM_TYPE_RULES = (
    ('Suite', ' Suite'),
    ('Double', ' Double Room'),
    ('Single', ' Single Room'),
    ('Triple', ' Triple Room'),
    ('Family', ' Family Room'),
    ('children', ' Family Room'),
    ('King', ' King Room'),
    ('Queen', ' Queen Room'),
    ('Twin', ' Twin Room'),
    ('Apartment', ' Apartment'),
    ('Standard', ' Standard Room'),
    ('Deluxe', ' Deluxe Room'),
    ('Rooms', ' Several Rooms'),
    ('rooms', ' Several Rooms'),
    ('Stayed', ' Other'),
)

def get_room_type(description):
    """Map a room description to a generic room label.

    The description is split on single spaces and matched word by word against
    the rule table. A description with no matching word is returned unchanged.
    """
    words = set(description.split(' '))
    for keyword, label in _ROOM_TYPE_RULES:
        if keyword in words:
            return label
    return description

# Apply the function to the room_type column
# (astype('str') turns missing room types into the text 'None' so that
# get_room_type can split them)
room_types = hotels['room_type'].astype('str').apply(get_room_type)

# Leave the top 12 most popular values. The rest mark as ' Other'
# The label matches the ' Other' that get_room_type returns, so all the
# ungrouped rooms share one category instead of being split between
# 'other' and ' Other'.
popular_room_types = room_types.value_counts().nlargest(12).index
print(popular_room_types)
hotels['room_type'] = room_types.where(room_types.isin(popular_room_types), ' Other')

# We don't need 'tags' anymore
hotels = hotels.drop(['tags'], axis=1)


# Reviews text analysis

In [None]:
# # Stemming the text

# def simple_stemmer(text):
#     ps=nltk.porter.PorterStemmer()
#     text= ' '.join([ps.stem(word) for word in text.split()])
#     return text

# hotels['negative_review']=hotels['negative_review'].apply(simple_stemmer)
# hotels['positive_review']=hotels['positive_review'].apply(simple_stemmer)

In [None]:
# # Remove the stopwords

# # Tokenize the text
# tokenizer=ToktokTokenizer()

# # Set English stopwords
# stopword_list=set(stopwords.words('english'))

# # Function to remove stopwords
# def remove_stopwords(text, is_lower_case=False):
#     tokens = tokenizer.tokenize(text)
#     tokens = [token.strip() for token in tokens]
#     if is_lower_case:
#         filtered_tokens = [token for token in tokens if token not in stopword_list]
#     else:
#         filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
#     filtered_text = ' '.join(filtered_tokens)    
#     return filtered_text

# hotels['negative_review']=hotels['negative_review'].apply(remove_stopwords)
# hotels['positive_review']=hotels['positive_review'].apply(remove_stopwords)

In [None]:
# # Remove special characters

# def remove_special_characters(text, remove_digits=True):
#     pattern=r'[^a-zA-z0-9\s]'
#     text=re.sub(pattern,'',text)
#     return text

# hotels['negative_review']=hotels['negative_review'].apply(remove_special_characters)
# hotels['positive_review']=hotels['positive_review'].apply(remove_special_characters)

In [None]:
# Blank out NEGATIVE reviews that only say there was nothing negative

no_neg_list = [
    'absolutely nothing',
    'all good',
    "can't think of anything",
    'everything was great',
    'everything was perfect',
    'liked everything',
    'n a',
    'na',
    'nil',
    'no',
    'no complaints',
    'no negative',
    'non',
    'none',
    'nothing at all',
    'nothing really',
    'nothing to complain about',
    'nothing to dislike',
]

# normalise case and surrounding whitespace before matching whole reviews
negative_reviews = hotels['negative_review'].str.lower().str.strip()
hotels['negative_review'] = negative_reviews.mask(negative_reviews.isin(no_neg_list), '')

In [None]:
# Visualise the most used words in NEGATIVE reviews

# Build the cloud from every negative review. Using only row 0 would describe a
# single review (and fails outright when that review is empty).
text = ' '.join(hotels['negative_review'].dropna())

# A separate name keeps the `stopwords` corpus imported from nltk usable
cloud_stopwords = set(STOPWORDS)
cloud_stopwords.update(["hotel", "room"])

wordcloud = WordCloud(stopwords=cloud_stopwords).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Most used words in negative reviews",fontsize=18)
plt.axis("off")
# plt.show() has no documented positional argument, so none is passed
plt.show()

In [None]:
# Remove 'No Positive' and similar from POSITIVE reviews
#
# Reviews are lower-cased and stripped first, then any review that is exactly
# one of these phrases is replaced with an empty string.

no_pos_list = ['absolutely nothing', 'all was bad', "can't think of anything",
               'everything was awful', 'everything was bad',
               # the misspelt variant is kept for backward compatibility; the
               # correctly spelt phrase is the one that was meant to match
               "did't like anything", "didn't like anything",
               'n a', 'na', 'nil', 'no', 'nothing',
               'no positive', 'non', 'none',    'nothing at all', 'nothing really',
               'nothing to like', 'nothing that stands out', 'not very much'
              ]

hotels['positive_review'] = hotels['positive_review'].str.lower().str.strip()
hotels['positive_review'] = hotels['positive_review'].apply(lambda x: '' if x in no_pos_list else x)

In [None]:
# Visualise the most used words in POSITIVE reviews

# Build the cloud from every positive review. Using only row 0 would describe a
# single review (and fails outright when that review is empty).
text = ' '.join(hotels['positive_review'].dropna())

# A separate name keeps the `stopwords` corpus imported from nltk usable
cloud_stopwords = set(STOPWORDS)
cloud_stopwords.update(["hotel", "room"])

wordcloud = WordCloud(stopwords=cloud_stopwords,
                      background_color="white",
                     ).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Most used words in positive reviews",fontsize=18)
plt.axis("off")
# plt.show() has no documented positional argument, so none is passed
plt.show()

In [None]:
# Analyse reviews
#
# Score each review text with VADER. The raw score dicts are kept in
# 'rw_neg' / 'rw_pos' because a later cell drops those columns by name.

sent_analyzer = SentimentIntensityAnalyzer()

hotels['rw_neg'] = hotels['negative_review'].apply(sent_analyzer.polarity_scores)
hotels['rw_pos'] = hotels['positive_review'].apply(sent_analyzer.polarity_scores)


# Record the results into the main dataframe as individual features
#
# Each dict is expanded into one column per score component ('n_*' for the
# negative review, 'p_*' for the positive one). Columns are created one at a
# time with plain item assignment: assigning through .loc to a list of column
# labels that do not exist yet has not behaved the same across pandas versions.
for prefix, raw_column in (('n', 'rw_neg'), ('p', 'rw_pos')):
    scores = pd.DataFrame(hotels[raw_column].tolist(), index=hotels.index)
    for component in ('neg', 'neu', 'pos', 'compound'):
        hotels[prefix + '_' + component] = scores[component]

In [None]:
# Create doc2vec vector columns for NEGATIVE reviews

# Tokenise once and reuse the tokens for both training and inference
neg_tokens = hotels["negative_review"].apply(lambda x: x.split(" "))
neg_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(neg_tokens)]

# train a Doc2Vec model with our text data
# The seed ties the model initialisation to RANDOM_SEED like the rest of the
# notebook. NOTE(review): with workers > 1 thread scheduling can still cause
# small run-to-run differences - see the gensim documentation.
neg_model = Doc2Vec(neg_documents, vector_size=5, window=2, min_count=1, workers=4, seed=RANDOM_SEED)

# transform each document into a vector data
# One DataFrame is built from all the vectors at once; expanding row by row
# with .apply(pd.Series) is very slow on a frame of this size.
neg_doc2vec_df = pd.DataFrame(
    [neg_model.infer_vector(tokens) for tokens in neg_tokens],
    index=hotels.index,
)
neg_doc2vec_df.columns = ["neg_doc2vec_vector_" + str(x) for x in neg_doc2vec_df.columns]

# Drop columns left over from an earlier run of this cell so that re-running it
# does not produce duplicate column names
hotels = pd.concat(
    [hotels.drop(columns=neg_doc2vec_df.columns, errors='ignore'), neg_doc2vec_df],
    axis=1,
)

In [None]:
# Create doc2vec vector columns for POSITIVE reviews

# Tokenise once and reuse the tokens for both training and inference
pos_tokens = hotels["positive_review"].apply(lambda x: x.split(" "))
pos_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(pos_tokens)]

# train a Doc2Vec model with our text data
# The seed ties the model initialisation to RANDOM_SEED like the rest of the
# notebook. NOTE(review): with workers > 1 thread scheduling can still cause
# small run-to-run differences - see the gensim documentation.
pos_model = Doc2Vec(pos_documents, vector_size=5, window=2, min_count=1, workers=4, seed=RANDOM_SEED)

# transform each document into a vector data
# One DataFrame is built from all the vectors at once; expanding row by row
# with .apply(pd.Series) is very slow on a frame of this size.
pos_doc2vec_df = pd.DataFrame(
    [pos_model.infer_vector(tokens) for tokens in pos_tokens],
    index=hotels.index,
)
pos_doc2vec_df.columns = ["pos_doc2vec_vector_" + str(x) for x in pos_doc2vec_df.columns]

# Drop columns left over from an earlier run of this cell so that re-running it
# does not produce duplicate column names
hotels = pd.concat(
    [hotels.drop(columns=pos_doc2vec_df.columns, errors='ignore'), pos_doc2vec_df],
    axis=1,
)

In [None]:
# The review texts and the raw sentiment dicts have been turned into features

text_columns = ['negative_review', 'positive_review', 'rw_neg', 'rw_pos']
hotels = hotels.drop(columns=text_columns)

In [None]:
hotels.head(3)

# Assessing the values

In [None]:
# Ordinal Encoder for categories
# Each column gets its own freshly fitted encoder, in the order listed
ordinal_columns = [
    'review_year',
    'hotel_name',
    'room_type',
    'city',
    'trip_type',
    'traveller_type',
    'zip',
]

for ordinal_column in ordinal_columns:
    ord_encoder = ce.OrdinalEncoder(cols=[ordinal_column])
    hotels[ordinal_column] = ord_encoder.fit_transform(hotels[ordinal_column])

# Check the multicollinearity

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
correlations = hotels.corr()
fig, ax = plt.subplots(figsize=(25,15))
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
# Remove a column judged multicollinear from the heatmap above

hotels = hotels.drop(columns=['review_year'])

# Assessing the categories

In [None]:
# Columns treated as categorical in the chi-squared test

cat_cols = [
    'average_score',
    'hotel_name',
    'reviewer_nationality_encoded',
    'room_type',
    'city',
    'review_month',
    'trip_type',
    'zip',
    'traveller_type',
    'weekend',
    'is_domestic',
    'from_mobile',
]

In [None]:
# Chi-squared statistic of each categorical column against the integer part
# of the reviewer score (training rows only)

train_rows = hotels.query('sample == 1')
y = train_rows['reviewer_score'].values.astype('int')
X = train_rows[cat_cols]

plt.rcParams['figure.figsize'] = (15,10)
chi2_statistics = chi2(X, y)[0]
imp_cat = pd.Series(chi2_statistics, index=cat_cols).sort_values()
imp_cat.plot(kind = 'barh')

In [None]:
# Columns treated as numeric in the ANOVA test and in the scaling step

num_cols = [
    'stay_length',
    'review_total_negative_word_counts',
    'total_number_of_reviews',
    'review_total_positive_word_counts',
    'total_number_of_reviews_reviewer_has_given',
    'days_since_review',
    'city_bednights',
    'n_neu',
    'n_pos',
    'n_compound',
    'p_neg',
    'p_neu',
    'p_compound',
    'additional_number_of_scoring',
    'n_neg',
    'p_pos',
]
# the five doc2vec components of the negative reviews, then of the positive ones
num_cols += ['neg_doc2vec_vector_' + str(i) for i in range(5)]
num_cols += ['pos_doc2vec_vector_' + str(i) for i in range(5)]

In [None]:
# ANOVA F-statistic of each numeric column against the integer part of the
# reviewer score (training rows only)

train_rows = hotels.query('sample == 1')
y = train_rows['reviewer_score'].values.astype('int')
X = train_rows[num_cols]

f_statistics = f_classif(X, y)[0]
imp_num = pd.Series(f_statistics, index = num_cols).sort_values()
imp_num.plot(kind = 'barh')

# Object-type values encoding

In [None]:
hotels.shape

In [None]:
# One Hot Encoder for categories with less than 15 values

one_hot_columns = ['room_type', 'city', 'review_month', 'trip_type', 'traveller_type']

encoder = ce.OneHotEncoder(cols=one_hot_columns)
cols = encoder.fit_transform(hotels[one_hot_columns])

# append the indicator columns, then remove the columns they were derived from
hotels = pd.concat([hotels, cols], axis=1).drop(columns=one_hot_columns)

In [None]:
# Binary Encoding for categories with more than 15 values

binary_columns = ['hotel_name', 'reviewer_nationality_encoded', 'zip']

bin_encoder = ce.BinaryEncoder(cols=binary_columns)
type_bin = bin_encoder.fit_transform(hotels[binary_columns])

# append the binary digit columns, then remove the columns they were derived from
hotels = pd.concat([hotels, type_bin], axis=1).drop(columns=binary_columns)

In [None]:
hotels.shape

# Normalisation

In [None]:
# The numeric features are not normally distributed and contain outliers,
# so they are scaled with RobustScaler to limit the outliers' influence

r_scaler = preprocessing.RobustScaler()
scaled_values = r_scaler.fit_transform(hotels[num_cols])
hotels[num_cols] = scaled_values

In [None]:
hotels.head()

# Nothing is missing

# Set up the training

In [None]:
# Separate the labelled (train) rows from the unlabelled (test) rows again

train_data = hotels.loc[hotels['sample'] == 1].drop(columns=['sample'])
test_data = hotels.loc[hotels['sample'] == 0].drop(columns=['sample'])

X = train_data.drop(columns=['reviewer_score'])
y = train_data['reviewer_score'].values            # the target

In [None]:
# Hold out part of the labelled data to validate the model
# 20% of the rows go to the validation split (test_size parameter);
# random_state makes the split repeatable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# check the shapes of the resulting datasets

test_data.shape, train_data.shape, X.shape, X_train.shape, X_test.shape

In [None]:
# Create a model
# 100 trees, n_jobs=-1 to use all available processors, progress messages
# switched on (verbose=1), and a fixed random_state for repeatable results

model = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=RANDOM_SEED)

In [None]:
# Train the model on the training split

model.fit(X_train, y_train)

# Use the trained model to predict the reviewer scores for the held-out validation split.
# Record the predicted values into the 'y_pred' variable

y_pred = model.predict(X_test)

In [None]:
# Create the Mean Absolute Percentage Error (MAPE) function

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean of |(y_true - y_pred) / y_true|, expressed in percent.

    Both arguments are converted to numpy arrays first.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return relative_errors.mean() * 100

In [None]:
# Compare the predicted values (y_pred) with the real ones (y_test)
# MAE is the average absolute deviation of the predictions from the actual
# values; MAPE expresses that deviation as a percentage of the actual value.

mae = metrics.mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAE:', mae)
print('MAPE:', mape)

In [None]:
# Plot the 15 features the random forest relies on most

plt.rcParams['figure.figsize'] = (10,10)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(15)
top_features.plot(kind='barh')

In [None]:
# Leave only the feature columns in the test rows
test_data = test_data.drop(columns=['reviewer_score'])

In [None]:
# Predict the reviewer scores for the unlabelled test rows
predict_submission = model.predict(test_data)

In [None]:
# Put the predictions into the submission template, save it as CSV without the
# index column, and preview the first rows
sample_submission['reviewer_score'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)