# Ski regressor
## Prices may vary

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error

df = pd.read_csv('Shiny_data1.csv', index_col=0)
df = df.drop(df[df.ski_pass_price == 0].index)


## Skiing is expensive.
### It only really starts to sink in when you have to pay for it yourself. 
### I've scraped a bit of ski resort data from ski-resort-stats.com to help me make more informed decisions. Let's see if we can build a model to predict the ski pass price and make better decisions about where to plan our next holiday.

In [10]:
display(df.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 501 entries, 0 to 514
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   child_friendly       501 non-null    bool   
 1   continent            501 non-null    object 
 2   country              501 non-null    object 
 3   max_altitude         501 non-null    float64
 4   min_altitude         501 non-null    float64
 5   resort_name          501 non-null    object 
 6   season               501 non-null    object 
 7   ski_pass_price       501 non-null    int64  
 8   url                  501 non-null    object 
 9   beginner_slopes      138 non-null    float64
 10  intermediate_slopes  137 non-null    float64
 11  difficult_slopes     136 non-null    float64
 12  t-bar_lifts          138 non-null    float64
 13  chairlifts           138 non-null    float64
 14  gondolas             136 non-null    float64
 15  snowpark             142 non-null    obj

None

A lot of collumns of different lengths! 
We'll stick with the full length collumns as our data is pretty sparse as it is.

In [2]:
df.drop(['country'], axis = 1, inplace=True)
df.drop(['resort_name'], axis = 1, inplace=True)
df.drop(df.iloc[:, 6:16], axis = 1, inplace=True)
df.drop(df.iloc[:,9:11], axis = 1, inplace=True)
print(df.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 501 entries, 0 to 514
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   child_friendly  501 non-null    bool   
 1   continent       501 non-null    object 
 2   max_altitude    501 non-null    float64
 3   min_altitude    501 non-null    float64
 4   season          501 non-null    object 
 5   ski_pass_price  501 non-null    int64  
 6   country_iso     501 non-null    object 
 7   altitude_diff   501 non-null    float64
 8   total_slopes    501 non-null    float64
dtypes: bool(1), float64(4), int64(1), object(3)
memory usage: 29.8+ KB
None


Much better, I suspect we'll get some colinearity problems from altitude-diff and min,max altitude but we'll start with these and see how the model performs.
Let's do some exploratory data analysis.

In [3]:
px.imshow(df.corr(), color_continuous_scale='Agsunset', title="Correlation heatmap of Skidata")

There seems to be a pretty high correlation between min and max altitude as well as between max altitude and the altitude difference which makes sense. My main intuition is that there's a significant relationship between the ski pass price and the altitude difference. 

In [4]:
fig = px.scatter(df,x='altitude_diff', y='ski_pass_price')
fig.show()

The scatter plot shows a correlation between the ski pass price and altitude difference, however there seems to be two distinct distributions. I suspect it's because of disparities in price between Europe and America. Let's plot them seperately and see.

In [5]:
euro = df[df['continent'] == 'Europe'].copy()
america = df[df['continent'] == 'America'].copy()
rest = df[df['continent'] == 'Rest of the world'].copy()

In [6]:
fig = make_subplots(
    rows=2,cols=2,
    subplot_titles=('All', 'Europe', 'America', 'Rest of the World'))

fig.add_trace(go.Scatter(x=df['altitude_diff'], y=df['ski_pass_price'], mode='markers'),row=1,col=1)
fig.add_trace(go.Scatter(x=euro['altitude_diff'], y=euro['ski_pass_price'], mode='markers'),row=1,col=2)
fig.add_trace(go.Scatter(x=america['altitude_diff'], y=america['ski_pass_price'], mode='markers'),row=2,col=1)
fig.add_trace(go.Scatter(x=rest['altitude_diff'], y=rest['ski_pass_price'], mode='markers'),row=2,col=2)

As suspected there seems to be distinct distributions for each 'continent'. This should hopefully be captured by that variable in the regression model. Let's split the data and start modelling!

In [9]:
train, Test = train_test_split(df, train_size = 0.6)
Test, Validate = train_test_split(Test, train_size = 0.5)

In [10]:
f = 'ski_pass_price ~ ' + ' + '.join(df.columns.drop('ski_pass_price'))
model = smf.ols(formula = f, data = train).fit()
model.summary()

0,1,2,3
Dep. Variable:,ski_pass_price,R-squared:,0.775
Model:,OLS,Adj. R-squared:,0.723
Method:,Least Squares,F-statistic:,14.66
Date:,"Thu, 06 May 2021",Prob (F-statistic):,2.13e-52
Time:,18:24:20,Log-Likelihood:,-1120.0
No. Observations:,300,AIC:,2356.0
Df Residuals:,242,BIC:,2571.0
Df Model:,57,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,22.4324,8.659,2.591,0.010,5.375,39.490
child_friendly[T.True],2.4391,9.803,0.249,0.804,-16.870,21.748
continent[T.Europe],-6.7418,5.621,-1.199,0.232,-17.813,4.330
continent[T.Rest of the world],0.7986,3.464,0.231,0.818,-6.025,7.622
season[T.December - April June - August October - November],-6.3112,11.596,-0.544,0.587,-29.154,16.532
season[T.December - March],-4.5143,2.673,-1.689,0.092,-9.779,0.750
season[T.December - May],-0.4950,7.287,-0.068,0.946,-14.849,13.859
season[T.December - depending on snow conditions],0.5925,11.495,0.052,0.959,-22.051,23.236
season[T.July - April],-0.2225,11.519,-0.019,0.985,-22.914,22.468

0,1,2,3
Omnibus:,39.576,Durbin-Watson:,2.331
Prob(Omnibus):,0.0,Jarque-Bera (JB):,227.457
Skew:,0.278,Prob(JB):,4.06e-50
Kurtosis:,7.229,Cond. No.,1.78e+19


There are a lot of predictors in this model especially for the country and season variables. We get a high R-squared but I suspect it won't generalise very well.

In [11]:
predictions = model.predict(Test)

PatsyError: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Error converting data to categorical: observation with value 'May - September' does not match any of the expected levels (expected: ['December - April', 'December - April June - August October - November', ..., 'depending on snow conditions - depending on snow conditions', 'no report'])
    ski_pass_price ~ child_friendly + continent + max_altitude + min_altitude + season + country_iso + altitude_diff + total_slopes
                                                                                ^^^^^^

There's too many categories and not enough data! Looks like we need to drop Season and Country as predictors.

In [12]:
f = 'ski_pass_price ~ ' + ' + '.join(df.columns.drop(['ski_pass_price','season','country_iso']))
model = smf.ols(formula = f, data = train).fit()
model.summary()

0,1,2,3
Dep. Variable:,ski_pass_price,R-squared:,0.558
Model:,OLS,Adj. R-squared:,0.549
Method:,Least Squares,F-statistic:,61.76
Date:,"Thu, 06 May 2021",Prob (F-statistic):,3.38e-49
Time:,18:24:29,Log-Likelihood:,-1221.4
No. Observations:,300,AIC:,2457.0
Df Residuals:,293,BIC:,2483.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,68.6192,9.124,7.521,0.000,50.662,86.576
child_friendly[T.True],-8.8453,8.648,-1.023,0.307,-25.865,8.174
continent[T.Europe],-34.6220,2.244,-15.427,0.000,-39.039,-30.205
continent[T.Rest of the world],-21.3978,4.027,-5.314,0.000,-29.323,-13.473
max_altitude,0.0051,0.001,6.578,0.000,0.004,0.007
min_altitude,0.0005,0.001,0.419,0.675,-0.002,0.003
altitude_diff,0.0046,0.001,3.164,0.002,0.002,0.007
total_slopes,0.0271,0.009,2.969,0.003,0.009,0.045

0,1,2,3
Omnibus:,26.105,Durbin-Watson:,2.333
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62.601
Skew:,0.402,Prob(JB):,2.55e-14
Kurtosis:,5.088,Cond. No.,3870000000000000.0


Rsquared has gone down as expected and min_altitude is not significant at a=0.05 so we'll drop it.

In [13]:
f = 'ski_pass_price ~ ' + ' + '.join(df.columns.drop(['ski_pass_price','season','country_iso', 'min_altitude']))
model = smf.ols(formula = f, data = df).fit()
model.summary()

0,1,2,3
Dep. Variable:,ski_pass_price,R-squared:,0.577
Model:,OLS,Adj. R-squared:,0.572
Method:,Least Squares,F-statistic:,112.2
Date:,"Thu, 06 May 2021",Prob (F-statistic):,6.019999999999999e-89
Time:,18:24:34,Log-Likelihood:,-2017.6
No. Observations:,501,AIC:,4049.0
Df Residuals:,494,BIC:,4079.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,56.4400,7.285,7.748,0.000,42.127,70.753
child_friendly[T.True],4.0362,6.991,0.577,0.564,-9.700,17.772
continent[T.Europe],-34.9398,1.685,-20.733,0.000,-38.251,-31.629
continent[T.Rest of the world],-23.9408,2.966,-8.070,0.000,-29.769,-18.112
max_altitude,0.0042,0.001,3.793,0.000,0.002,0.006
altitude_diff,0.0060,0.002,3.212,0.001,0.002,0.010
total_slopes,0.0309,0.007,4.336,0.000,0.017,0.045

0,1,2,3
Omnibus:,42.871,Durbin-Watson:,1.63
Prob(Omnibus):,0.0,Jarque-Bera (JB):,99.936
Skew:,0.457,Prob(JB):,1.99e-22
Kurtosis:,4.988,Cond. No.,40800.0


Rsquared has gone up! But child_friendly is no longer significant so let's drop it and continue.

In [14]:
f = 'ski_pass_price ~ ' + ' + '.join(df.columns.drop(['ski_pass_price','season','country_iso', 'child_friendly', 'min_altitude']))
model = smf.ols(formula = f, data = train).fit()
model.summary()

0,1,2,3
Dep. Variable:,ski_pass_price,R-squared:,0.557
Model:,OLS,Adj. R-squared:,0.549
Method:,Least Squares,F-statistic:,73.89
Date:,"Thu, 06 May 2021",Prob (F-statistic):,6.25e-50
Time:,18:24:37,Log-Likelihood:,-1221.9
No. Observations:,300,AIC:,2456.0
Df Residuals:,294,BIC:,2478.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,59.8771,3.195,18.742,0.000,53.589,66.165
continent[T.Europe],-34.5430,2.243,-15.399,0.000,-38.958,-30.128
continent[T.Rest of the world],-20.4136,3.910,-5.221,0.000,-28.109,-12.718
max_altitude,0.0057,0.001,3.821,0.000,0.003,0.009
altitude_diff,0.0039,0.003,1.512,0.132,-0.001,0.009
total_slopes,0.0273,0.009,2.993,0.003,0.009,0.045

0,1,2,3
Omnibus:,24.848,Durbin-Watson:,2.323
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60.743
Skew:,0.37,Prob(JB):,6.45e-14
Kurtosis:,5.076,Cond. No.,13800.0


All the variables are significant so let's see how she performs.

In [15]:
def plot_scatter_and_line(x, scatter_y, line_y, scatter_name, line_name, title, x_title, y_title):

    fig = go.Figure()
    
    fig.add_trace(go.Scatter(
        x=x, y=scatter_y, name=scatter_name, mode="markers"))
    fig.add_trace(go.Scatter(
        x=x, y=line_y, name=line_name))
    fig.update_layout(title=title, xaxis_title=x_title,
        yaxis_title=y_title)
    
    return fig

In [16]:
Test['predictions'] = model.predict(Test)
px.scatter(Test, x= 'predictions', y = 'ski_pass_price')
line_y = Test['ski_pass_price']
plot_scatter_and_line(Test['ski_pass_price'], Test['predictions'], line_y, 'Predictions', 'y=x','Predicted vs true price', 'Pass_px', 'Predicted price')


Seems to be some outliers and quite a bit of variance. Let's calculate the RMSE.

In [27]:
RMSE = np.sqrt(mean_squared_error(Test['predictions'],Test['ski_pass_price']))
mean_pass = Test['ski_pass_price'].mean()
def print_RMSE(rmse, mean):
    print('RMSE: ', round(rmse,2), '\n')
    print('Mean ski pass price: ', round(mean, 2), '\n')
    print('Comparison: ', round(RMSE/mean_pass *100, 2), '%\n')

print_RMSE(RMSE, mean_pass)


RMSE:  12.86 

Mean ski pass price:  44.78 

Comparison:  28.71 %



RMSE is about 25% of the mean price... Not a brilliant model. Let's investigate by plotting the residuals of the model.

In [108]:
line_y = [0] * len(train['ski_pass_price'])
plot_scatter_and_line(train['ski_pass_price'], model.resid, line_y, 'Model residuals', 'y=0','Model residual plot', 'Pass_px', 'Residuals')


Seems like the model didn't catch the two distinct distributions and left a lot of predictability by the wayside! The residuals are pretty linear so the model doesn't seem to be capturing the variance of the data very well. Let's see if we can do better by only modelling the european data.

In [21]:
trainEU, TestEU = train_test_split(euro, train_size = 0.6)
TestEU, ValidateEU = train_test_split(TestEU, train_size = 0.5)

euro_mod = smf.ols(formula= f, data = trainEU).fit()
euro_mod.summary()

0,1,2,3
Dep. Variable:,ski_pass_price,R-squared:,0.476
Model:,OLS,Adj. R-squared:,0.469
Method:,Least Squares,F-statistic:,65.51
Date:,"Thu, 06 May 2021",Prob (F-statistic):,3.6499999999999996e-30
Time:,18:27:12,Log-Likelihood:,-766.98
No. Observations:,220,AIC:,1542.0
Df Residuals:,216,BIC:,1556.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,29.4117,1.638,17.954,0.000,26.183,32.641
max_altitude,-0.0005,0.001,-0.341,0.734,-0.003,0.002
altitude_diff,0.0120,0.002,5.876,0.000,0.008,0.016
total_slopes,0.0191,0.005,3.830,0.000,0.009,0.029

0,1,2,3
Omnibus:,0.194,Durbin-Watson:,2.063
Prob(Omnibus):,0.908,Jarque-Bera (JB):,0.155
Skew:,-0.064,Prob(JB):,0.926
Kurtosis:,2.984,Cond. No.,7680.0


In [22]:
TestEU['predictions'] = euro_mod.predict(TestEU)
px.scatter(TestEU, x= 'predictions', y = 'ski_pass_price')
line_y = TestEU['ski_pass_price']
plot_scatter_and_line(TestEU['ski_pass_price'], TestEU['predictions'], line_y, 'Predictions', 'y=x','Predicted vs true price', 'Pass_px', 'Predicted price')

Seems a bit flatter than it should be. Let's see what the residuals look like.

In [23]:
line_y = [0] * len(TestEU['ski_pass_price'])
plot_scatter_and_line(TestEU['ski_pass_price'], euro_mod.resid, line_y, 'Model residuals', 'y=0','Euro Model residual plot', 'Pass_px', 'Residuals')


Residuals are a lot better, looking independent although homoescadasicity not the best, probably because of the small training set.

In [36]:
RMSE = np.sqrt(mean_squared_error(TestEU['predictions'],TestEU['ski_pass_price']))
mean_pass = TestEU['ski_pass_price'].mean()
print_RMSE(RMSE, mean_pass)

RMSE:  9.19 

Mean ski pass price:  40.72 

Comparison:  22.56 %



RMSE is a lot better than the previous model, we've gone down to about 20% of the mean price. Let's compare our two models on the validation data sets.

In [37]:
Validate['predictions'] = model.predict(Validate)
ValidateEU['predictions'] = model.predict(ValidateEU)

RMSE_all = np.sqrt(mean_squared_error(Validate['predictions'], Validate['ski_pass_price']))
mean_passAll = Validate['ski_pass_price'].mean()
RMSE_EU = np.sqrt(mean_squared_error(ValidateEU['predictions'], ValidateEU['ski_pass_price']))
mean_passEU = ValidateEU['ski_pass_price'].mean()

In [38]:
print_RMSE(RMSE_all, mean_passAll)

RMSE:  12.75 

Mean ski pass price:  49.03 

Comparison:  22.56 %



In [39]:
print_RMSE(RMSE_EU, mean_passEU)

RMSE:  8.94 

Mean ski pass price:  41.53 

Comparison:  22.56 %



The general model doesn't seem to generalise very well, RMSE increases for the validation set. For the 'eurocentric' model RMSE remains about the same so seems to be the better of the two.

All in all we don't seem to have enough data to make a model that produces decent predictions. However sometimes good enough is better than not at all. 