In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from scipy.ndimage import gaussian_filter

In [2]:
output_folder = './output/positioning'

In [3]:
df = pd.read_csv('data/Riva_del_garda_hotels.csv', sep=',', index_col=0)
df_revenue = df[['tot_revenue_euro']].copy()
df.drop(columns=['tot_revenue_euro', 'city','%_option_not_specified','%_option_all_inclusive'], inplace=True, errors='ignore')
df

Unnamed: 0_level_0,%_option_breakfast,%_option_half_board,%_option_full_board,%_option_room_only,%_market_business,%_market_business_groups,%_market_leisure_groups,%_market_retail,%_market_other,average_length_of_stay,...,%_march_booking,%_april_booking,%_may_booking,%_june_booking,%_july_booking,%_august_booking,%_september_booking,%_october_booking,%_november_booking,%_december_booking
hotel_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hotel_1,0.059844,0.940156,0.0,0.0,0.0,0.0,0.0,0.003469,0.996531,5.816999,...,0.0,0.141835,0.164131,0.134476,0.114384,0.112977,0.151623,0.162013,0.018561,0.0
hotel_2,0.95791,0.028514,0.009492,0.004084,0.0,0.038521,0.0,0.925497,0.035982,2.438521,...,0.119318,0.000315,0.000442,0.004716,0.09543,0.08775,0.099757,0.111931,0.134283,0.12055
hotel_3,0.902308,0.0,0.0,0.097692,0.02488,0.0,0.038347,0.928555,0.008217,2.034239,...,0.081446,0.076866,0.075465,0.095363,0.089433,0.082643,0.110313,0.095537,0.067757,0.088051
hotel_4,0.773651,0.226349,0.0,0.0,0.0,0.0,0.027906,0.972094,0.0,3.614598,...,0.028664,0.113006,0.116136,0.139444,0.125194,0.116292,0.129473,0.147667,0.074912,0.0
hotel_5,0.873439,0.125729,0.0,0.000833,0.020816,0.0,0.0,0.935054,0.04413,3.306411,...,0.066125,0.108718,0.111234,0.125013,0.13203,0.135314,0.132493,0.110749,0.019206,0.0
hotel_6,0.540786,0.450341,0.008873,0.0,0.233415,0.0,0.1724,0.594185,0.0,4.261093,...,0.0,0.111182,0.139721,0.16113,0.152377,0.163916,0.168321,0.103353,0.0,0.0
hotel_7,0.951606,0.0,0.0,0.048394,0.091927,0.016484,0.0,0.121302,0.770287,2.752959,...,0.022424,0.101783,0.107317,0.103319,0.090832,0.084324,0.104498,0.129441,0.079316,0.105844
hotel_8,0.835665,0.163328,0.001007,0.0,0.000408,0.032811,0.000136,0.966644,0.0,3.02583,...,0.062764,0.110346,0.105382,0.126049,0.114114,0.110459,0.144687,0.118916,0.0,0.058952
hotel_9,0.872622,0.118886,0.0,0.008492,0.002717,0.057405,0.161685,0.778193,0.0,2.898098,...,0.0,0.076855,0.106067,0.150557,0.16473,0.165266,0.153468,0.076739,0.059103,0.047215
hotel_10,0.811709,0.149456,0.0,0.038835,0.002354,0.035893,0.128861,0.784643,0.048249,4.458959,...,0.0,0.15271,0.130794,0.139801,0.118813,0.114285,0.139719,0.162013,0.018917,0.0


In [4]:
df_revenue

Unnamed: 0_level_0,tot_revenue_euro
hotel_id,Unnamed: 1_level_1
hotel_1,237657.7
hotel_2,1410631.0
hotel_3,441777.8
hotel_4,590661.6
hotel_5,379540.7
hotel_6,212423.2
hotel_7,671980.1
hotel_8,1250294.0
hotel_9,426915.3
hotel_10,437452.5


In [5]:
df_revenue.describe()

Unnamed: 0,tot_revenue_euro
count,27.0
mean,528634.3
std,809580.1
min,77.27
25%,169897.3
50%,305804.5
75%,600487.5
max,4186005.0


In [6]:
# scale with StandardScaler
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

In [7]:
# reduce dimensionality with PCA
pca = PCA(n_components=0.95)
df_pca = pd.DataFrame(pca.fit_transform(df), columns=['PC'+str(components) for components in range(len(pca.explained_variance_ratio_))], index=df.index)
df_pca

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11
hotel_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
hotel_1,-2.692694,-4.888863,1.925986,-0.991145,-3.232849,3.779305,-0.528632,-0.39723,0.187279,-1.224275,-0.346987,-0.115336
hotel_2,1.414481,7.074682,1.587043,-1.329944,-1.353886,0.974913,-0.012517,-0.902916,0.425446,-0.096174,1.421514,-0.568776
hotel_3,1.260736,4.806497,0.405137,-0.812561,0.60756,0.63792,0.994056,-0.364419,0.067105,-0.146567,0.10095,-0.025591
hotel_4,-1.034111,0.584663,0.271027,1.209531,-0.457623,0.345469,-0.293198,-0.056251,-0.338259,-0.964932,-0.81322,0.540452
hotel_5,-0.974205,0.874307,0.486883,0.483648,0.166219,-0.604605,0.086586,0.381978,0.820528,-1.419462,-0.602802,-1.204244
hotel_6,-1.36023,-1.582185,-0.90064,2.603425,1.1828,1.579556,2.492698,0.300205,2.470659,2.157868,-0.926006,-1.009347
hotel_7,0.627998,2.448119,0.588994,-0.906544,0.542102,1.151216,-0.865515,-0.588029,1.890397,1.558228,-0.010271,1.752899
hotel_8,-0.060201,1.72781,0.134494,0.203185,-0.138204,-0.191485,-0.303423,0.118476,-0.019348,-0.507105,-0.877418,-0.925842
hotel_9,-1.369207,-0.122314,-0.259128,1.709216,0.563204,-0.319663,0.121956,-0.455838,-1.496356,1.028129,1.747376,-0.143836
hotel_10,-1.458019,-0.690237,0.530489,1.625582,-0.250265,-0.634865,-0.085065,-0.973105,0.009069,-0.727955,-0.218395,0.381472


In [8]:

# plot the PCA
fig = px.scatter(df_pca, x='PC0', y='PC1', text=df_pca.index,
                 title='PCA of Hotels in Riva Del Garda',
                 labels={'PC0': 'Component 1', 'PC1': 'Component 2'},
                 color_discrete_sequence=px.colors.qualitative.Plotly, template='plotly_white')
fig.update_traces(textposition='top center')
fig.update_layout(height=600, xaxis_title='Component 0', yaxis_title='Component 1')
fig.show()

In [9]:
# print the explained variance ratio
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')

Explained variance ratio: [0.30254867 0.19351941 0.10209631 0.074944   0.05812303 0.04537237
 0.04283411 0.0370841  0.02982379 0.02558893 0.02150147 0.01813399]


In [10]:
# on the pca df train a linear regression model to predict df_revenue
model = RandomForestRegressor()
model.fit(df_pca[df_pca.columns[:2]], df_revenue)

# generate a grid of points to predict the revenue and then use as heatmap
x_min, x_max = df_pca['PC0'].min()-1, df_pca['PC0'].max()+1
y_min, y_max = df_pca['PC1'].min()-1, df_pca['PC1'].max()+1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
grid_points = np.c_[xx.ravel(), yy.ravel()]
grid_df = pd.DataFrame(grid_points, columns=['PC0', 'PC1'])
# predict the revenue for the grid points
grid_revenue = model.predict(grid_df)
grid_revenue = grid_revenue.reshape(xx.shape)
# smooth the grid revenue
grid_revenue = gaussian_filter(grid_revenue, sigma=20)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [11]:
# plot the heatmap
fig = px.imshow(grid_revenue, x=np.linspace(x_min, x_max, 100),
                y=np.linspace(y_min, y_max, 100),
                labels={'x': 'Principal Component 0', 'y': 'Principal Component 1', 'color': 'Revenue'},
                title='Predicted Revenue Heatmap',
                color_continuous_scale='Viridis',
                aspect='auto')
fig.update_layout(height=600, width=800)
fig.show()

In [12]:
# plot the scatter plot over the heatmap, use go objects to add the scatter plot
import plotly.graph_objects as go
fig = go.Figure(data=go.Heatmap(z=grid_revenue, x=np.linspace(x_min, x_max, 100),
                                 y=np.linspace(y_min, y_max, 100),
                                 colorscale='BuGn',
                                 colorbar=dict(title='Revenue')))
fig.add_trace(go.Scatter(x=df_pca['PC0'], y=df_pca['PC1'],
                         mode='markers+text',
                         text=df_pca.index,
                         textposition='top center',
                         marker=dict(color='red', size=5, opacity=0.7),
                         name='Hotels'))
fig.update_layout(title='Predicted Revenue Heatmap on latent space with Hotels',
                  xaxis_title='Component 0',
                  yaxis_title='Component 1',
                  height=700, width=1200)
fig.show()
fig.write_html(f'{output_folder}/strategic_positioning.png')

In [13]:
# analyze the PCA components
pca_components = pd.DataFrame(pca.components_, columns=df.columns, index=['PC'+str(i) for i in range(len(pca.components_))])
pca_components = pca_components.T
pca_components['abs'] = pca_components.abs().sum(axis=1)
pca_components = pca_components.sort_values(by='abs', ascending=False)
pca_components = pca_components.drop(columns='abs')
pca_components

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11
%_market_business,-0.024799,-0.002884,-0.03317,0.185742,0.132584,0.223039,0.264698,-0.017508,0.57113,0.552975,-0.23792,-0.110512
%_market_business_groups,-0.043633,0.059876,0.039187,0.210529,-0.184806,-0.138068,-0.377428,-0.391715,-0.188269,0.43946,0.079527,-0.200429
%_october_booking,-0.176246,0.062857,-0.116512,0.12736,-0.26735,-0.002459,-0.043595,-0.124937,0.275762,-0.267985,-0.098171,0.51068
%_market_retail,-0.066952,0.243737,-0.01241,0.259891,-0.13607,-0.207323,0.239727,-0.067096,-0.241595,-0.148519,-0.292979,-0.155936
average_IT_per_booking,0.087518,0.149376,-0.182198,-0.357901,0.099876,0.115407,0.03703,-0.155848,-0.263746,0.14011,-0.364988,0.042727
%_option_full_board,-0.001224,0.062438,0.042833,0.061884,-0.275925,-0.043807,0.101289,0.693499,-0.195128,0.237432,-0.062592,0.192836
%_market_leisure_groups,-0.075141,-0.061109,-0.046918,0.42489,0.147561,0.075228,0.341753,-0.159842,-0.126468,0.016336,0.387147,0.098333
%_market_other,0.080225,-0.216305,0.019982,-0.347123,0.098935,0.163757,-0.277355,0.121043,0.198167,0.042004,0.218528,0.152054
average_minority_foreigners_per_booking,-0.125205,0.076119,-0.072275,0.200237,-0.138267,-0.460011,-0.247154,0.090925,0.376613,-0.04849,0.02736,-0.042885
%_december_booking,-0.00407,0.261437,0.038005,-0.175239,-0.233519,-0.034357,0.087669,-0.029787,0.050088,0.324133,0.318502,0.261468


In [14]:
# visualize the PCA components 0 and 1
fig = px.bar(pca_components.round(2), x=pca_components.round(2).index, y='PC0',
             title='Component 0 Contributions', text='PC0', color='PC0', color_continuous_scale='Bluered', color_continuous_midpoint=0,
             labels={'x': 'Features', 'y': 'Contribution'},
             color_discrete_sequence=px.colors.qualitative.Plotly, template='plotly_white')
fig.update_layout(height=600, xaxis_title='Features', yaxis_title='Contribution')
fig.show()
fig.write_html(f'{output_folder}/pca_component_0_contributions.png')

In [15]:
fig = px.bar(pca_components.round(2), x=pca_components.round(2).index, y='PC1',
             title='Component 1 Contributions', text='PC1', color='PC1', color_continuous_scale='Bluered', color_continuous_midpoint=0,
             labels={'x': 'Features', 'y': 'Contribution'},
             color_discrete_sequence=px.colors.qualitative.Plotly, template='plotly_white')
fig.update_layout(height=600, xaxis_title='Features', yaxis_title='Contribution')
fig.write_html(f'{output_folder}/pca_component_1_contributions.png')
fig.show()