In [131]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
from mizani.formatters import percent_format
from plotnine import *
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas/statsmodels warnings (e.g. chained
# assignment, deprecations) -- consider narrowing the filter.
warnings.filterwarnings(action="ignore")

# Directory the kernel is currently running in
current_path = os.getcwd()

# Folder that holds the shared helper module; put it on the import path
func = "ch00-tech-prep/"
sys.path.append(func)
# Bring the prewritten helper functions into the notebook namespace
from py_helper_functions import *

In [132]:
# Load the price table and the features table, in that order, from the
# CSV files sitting next to the notebook
price_data, features_data = (
    pd.read_csv(csv_file)
    for csv_file in ('hotels-europe_price.csv', 'hotels-europe_features.csv')
)

In [133]:
# Join every price row to the feature row of the same hotel.
# An inner join (the pandas default) keeps only hotel_ids present in both tables.
merged_hotel_data = price_data.merge(features_data, how='inner', on='hotel_id')

In [134]:
# Preview the merged frame (pandas abbreviates long frames in the rendered output)
merged_hotel_data

Unnamed: 0,hotel_id,price,offer,offer_cat,year,month,weekend,holiday,nnights,scarce_room,...,country,city_actual,rating_reviewcount,center1label,center2label,neighbourhood,ratingta,ratingta_count,distance_alter,accommodation_type
0,1,172,0,0% no offer,2017,11,1,0,1,0,...,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,Amsterdam,4.0,1115.0,3.6,Hotel
1,1,122,1,15-50% offer,2018,1,1,0,1,0,...,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,Amsterdam,4.0,1115.0,3.6,Hotel
2,1,122,1,15-50% offer,2017,12,0,1,1,0,...,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,Amsterdam,4.0,1115.0,3.6,Hotel
3,1,552,1,1-15% offer,2017,12,0,1,4,0,...,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,Amsterdam,4.0,1115.0,3.6,Hotel
4,1,122,1,15-50% offer,2018,2,1,0,1,0,...,Netherlands,Amsterdam,1030.0,City centre,Montelbaanstoren,Amsterdam,4.0,1115.0,3.6,Hotel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148016,22902,119,1,15-50% offer,2017,11,0,0,1,1,...,Croatia,Zagreb,48.0,City centre,Zagreb City Museum,Zagreb,4.5,86.0,0.6,Hotel
148017,22902,109,1,15-50% offer,2018,4,1,0,1,1,...,Croatia,Zagreb,48.0,City centre,Zagreb City Museum,Zagreb,4.5,86.0,0.6,Hotel
148018,22902,109,1,15-50% offer,2018,3,1,0,1,1,...,Croatia,Zagreb,48.0,City centre,Zagreb City Museum,Zagreb,4.5,86.0,0.6,Hotel
148019,22902,446,1,15-50% offer,2017,12,0,1,4,1,...,Croatia,Zagreb,48.0,City centre,Zagreb City Museum,Zagreb,4.5,86.0,0.6,Hotel


In [135]:
# Filter the merged data down to the analysis sample: rows whose actual city
# is Athens, whose accommodation type is "Hotel", observed in December of
# 2017 or later, and with the weekend flag equal to 0.
# .copy() turns the subset into an independent DataFrame, so the columns
# added to it in later cells do not raise SettingWithCopyWarning against
# merged_hotel_data.
merged_hotel_data_Athens = merged_hotel_data.loc[
    (merged_hotel_data["city_actual"] == "Athens")
    & (merged_hotel_data["accommodation_type"] == "Hotel")
    & (merged_hotel_data["month"] == 12)
    & (merged_hotel_data["year"] >= 2017)
    & (merged_hotel_data["weekend"] == 0)
].copy()

In [136]:
# Count the missing entries in the 'rating' column of the Athens sample
# and report the total
null = merged_hotel_data_Athens['rating'].isna().sum()
print(f"{null} null values in the 'rating' column are seen")

0 null values in the 'rating' column are seen


In [137]:
# Check that the filtered sample has more than 250 observations:
# count() reports the number of non-null values in each column.
merged_hotel_data_Athens.count()

hotel_id              319
price                 319
offer                 319
offer_cat             319
year                  319
month                 319
weekend               319
holiday               319
nnights               319
scarce_room           319
city                  319
distance              319
stars                 319
rating                319
country               319
city_actual           319
rating_reviewcount    319
center1label          319
center2label          319
neighbourhood         319
ratingta              319
ratingta_count        319
distance_alter        319
accommodation_type    319
dtype: int64

In [138]:
# Binary outcome: highly_rated is 1 when 'rating' is greater than or equal
# to 4 and 0 otherwise (a missing rating compares False, so it maps to 0).
# The comparison is vectorized rather than applied row by row, and assign()
# returns a new DataFrame instead of writing a column into the filtered
# slice of merged_hotel_data -- this avoids SettingWithCopyWarning and
# keeps the cell safe to re-run.
merged_hotel_data_Athens = merged_hotel_data_Athens.assign(
    highly_rated=(merged_hotel_data_Athens['rating'] >= 4).astype('int64')
)
merged_hotel_data_Athens.head()

Unnamed: 0,hotel_id,price,offer,offer_cat,year,month,weekend,holiday,nnights,scarce_room,...,city_actual,rating_reviewcount,center1label,center2label,neighbourhood,ratingta,ratingta_count,distance_alter,accommodation_type,highly_rated
2119,436,522,1,50%-75% offer,2017,12,0,1,4,0,...,Athens,1056.0,City centre,Capsis Cultural Exhibition and Conference Centre,Acropolis,4.5,3903.0,6.9,Hotel,1
2125,436,104,1,50%-75% offer,2017,12,0,1,1,1,...,Athens,1056.0,City centre,Capsis Cultural Exhibition and Conference Centre,Acropolis,4.5,3903.0,6.9,Hotel,1
2129,437,305,1,15-50% offer,2017,12,0,1,4,1,...,Athens,54.0,City centre,Capsis Cultural Exhibition and Conference Centre,Acropolis,4.0,608.0,7.1,Hotel,1
2131,437,81,1,15-50% offer,2017,12,0,1,1,1,...,Athens,54.0,City centre,Capsis Cultural Exhibition and Conference Centre,Acropolis,4.0,608.0,7.1,Hotel,1
2145,440,150,1,75%+ offer,2017,12,0,1,1,0,...,Athens,793.0,City centre,Capsis Cultural Exhibition and Conference Centre,Acropolis,4.0,1661.0,7.1,Hotel,1


In [139]:
# Linear probability model on the Athens sample: regress the binary
# 'highly_rated' flag on the 'stars' and 'distance' columns.
y = merged_hotel_data_Athens['highly_rated']  # dependent variable

# add_constant supplies the intercept column ('const') next to the regressors
X = sm.add_constant(merged_hotel_data_Athens[['stars', 'distance']])

# Ordinary least squares fit.
# NOTE(review): the default (nonrobust) covariance is used; with a 0/1
# outcome the errors are heteroskedastic, so robust standard errors
# (e.g. cov_type="HC1") may be more appropriate -- confirm with the author.
reg = sm.OLS(endog=y, exog=X).fit()

# Plain-text regression table
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           highly_rated   R-squared:                       0.213
Model:                            OLS   Adj. R-squared:                  0.208
Method:                 Least Squares   F-statistic:                     42.78
Date:                Wed, 06 Dec 2023   Prob (F-statistic):           3.60e-17
Time:                        23:34:41   Log-Likelihood:                -191.79
No. Observations:                 319   AIC:                             389.6
Df Residuals:                     316   BIC:                             400.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0220      0.083     -0.266      0.7

In [140]:
from patsy import dmatrices

# Build the outcome vector and the design matrix from the formula;
# the design matrix carries its own 'Intercept' column.
y, X = dmatrices(
    "highly_rated ~ stars + distance",
    merged_hotel_data_Athens,
)

# Logit model estimated as a binomial GLM. The Binomial family's default
# link is the logit, so no link is passed explicitly; this avoids the
# lowercase `links.logit` alias, which newer statsmodels releases deprecate.
# Fitting in the same statement means `logit` only ever refers to the
# fitted results, never to the unfitted model.
logit = sm.GLM(y, X, family=sm.families.Binomial()).fit()


In [141]:
# Coefficient table and fit statistics of the fitted GLM logit model
logit.summary()

0,1,2,3
Dep. Variable:,highly_rated,No. Observations:,319.0
Model:,GLM,Df Residuals:,316.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-182.49
Date:,"Wed, 06 Dec 2023",Deviance:,364.98
Time:,23:34:41,Pearson chi2:,319.0
No. Iterations:,3,Pseudo R-squ. (CS):,0.2076
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5197,0.451,-5.584,0.000,-3.404,-1.635
stars,1.0267,0.138,7.441,0.000,0.756,1.297
distance,-0.6091,0.268,-2.271,0.023,-1.135,-0.084


In [142]:
# Store the logit model's predicted probabilities as a new column
# (predict() without arguments uses the data the model was fitted on),
# then summarize their distribution.
merged_hotel_data_Athens["pred_logit"] = logit.predict()
merged_hotel_data_Athens["pred_logit"].describe()

count    319.000000
mean       0.548589
std        0.231435
min        0.108895
25%        0.325782
50%        0.563604
75%        0.750214
max        0.919158
Name: pred_logit, dtype: float64

In [143]:
# Cross-check: estimate the same specification by maximum likelihood with
# sm.Logit and show its summary table
sm.Logit(endog=y, exog=X).fit().summary()

Optimization terminated successfully.
         Current function value: 0.572071
         Iterations 5


0,1,2,3
Dep. Variable:,highly_rated,No. Observations:,319.0
Model:,Logit,Df Residuals:,316.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 06 Dec 2023",Pseudo R-squ.:,0.169
Time:,23:34:41,Log-Likelihood:,-182.49
converged:,True,LL-Null:,-219.61
Covariance Type:,nonrobust,LLR p-value:,7.608e-17

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5197,0.451,-5.584,0.000,-3.404,-1.635
stars,1.0267,0.138,7.441,0.000,0.756,1.297
distance,-0.6091,0.268,-2.271,0.023,-1.135,-0.084


In [144]:
# Marginal effects of the logit model: dy/dx evaluated over the whole
# sample ('overall'), which are the get_margeff() defaults spelled out
logit_margef = sm.Logit(endog=y, exog=X).fit().get_margeff(at="overall", method="dydx")
logit_margef.summary()

Optimization terminated successfully.
         Current function value: 0.572071
         Iterations 5


0,1
Dep. Variable:,highly_rated
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
stars,0.1994,0.017,11.907,0.0,0.167,0.232
distance,-0.1183,0.051,-2.337,0.019,-0.218,-0.019


In [145]:
# Probit model on the same outcome vector and design matrix as the logit;
# `probit` is the unfitted model, `probit_result` the fitted results
probit = sm.Probit(endog=y, exog=X)
probit_result = probit.fit()

Optimization terminated successfully.
         Current function value: 0.572170
         Iterations 5


In [146]:
# Coefficient table and fit statistics of the fitted probit model
probit_result.summary()

0,1,2,3
Dep. Variable:,highly_rated,No. Observations:,319.0
Model:,Probit,Df Residuals:,316.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 06 Dec 2023",Pseudo R-squ.:,0.1689
Time:,23:34:41,Log-Likelihood:,-182.52
converged:,True,LL-Null:,-219.61
Covariance Type:,nonrobust,LLR p-value:,7.854e-17

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.5166,0.263,-5.768,0.000,-2.032,-1.001
stars,0.6177,0.078,7.918,0.000,0.465,0.771
distance,-0.3518,0.155,-2.266,0.023,-0.656,-0.048


In [147]:
# Store the probit model's predicted probabilities as a new column
# (predict() without arguments uses the data the model was fitted on),
# then summarize their distribution.
merged_hotel_data_Athens["pred_probit"] = probit_result.predict()
merged_hotel_data_Athens["pred_probit"].describe()

count    319.000000
mean       0.551709
std        0.230021
min        0.105511
25%        0.335093
50%        0.571453
75%        0.749406
max        0.928704
Name: pred_probit, dtype: float64

In [148]:
# Marginal effects of the probit model: dy/dx evaluated over the whole
# sample ('overall'), which are the get_margeff() defaults spelled out
probit_margef_results = probit_result.get_margeff(at="overall", method="dydx")
probit_margef_results.summary()

0,1
Dep. Variable:,highly_rated
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
stars,0.2006,0.017,11.856,0.0,0.167,0.234
distance,-0.1142,0.049,-2.315,0.021,-0.211,-0.018
