In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv("data/Hotel in India.csv")
df.head()

Unnamed: 0,Hotel Name,Rating,Rating Description,Reviews,Star Rating,Location,Nearest Landmark,Distance to Landmark,Price,Tax
0,"Taj Club House, Chennai",4.3,Excellent,2000.0,5.0,Royapettah,,,6390,767.0
1,The Park Chennai,3.7,Very Good,8366.0,5.0,Near US Consulate,,,7198,2167.0
2,"Taj Fisherman's Cove Resort & Spa, Chennai",4.3,Excellent,1856.0,5.0,Covelong,city centre,36.3 km,12555,2260.0
3,Hotel Savera,4.0,Very Good,7458.0,4.0,Mylapore,,,4612,1091.0
4,Ibis Chennai OMR - An Accor Brand,4.1,Very Good,2545.0,4.0,Sholinganallur,,,3345,


In [3]:
df = df.copy()

In [4]:
df.columns = df.columns.str.replace(' ', '_')
df.head()

Unnamed: 0,Hotel_Name,Rating,Rating_Description,Reviews,Star_Rating,Location,Nearest_Landmark,Distance_to_Landmark,Price,Tax
0,"Taj Club House, Chennai",4.3,Excellent,2000.0,5.0,Royapettah,,,6390,767.0
1,The Park Chennai,3.7,Very Good,8366.0,5.0,Near US Consulate,,,7198,2167.0
2,"Taj Fisherman's Cove Resort & Spa, Chennai",4.3,Excellent,1856.0,5.0,Covelong,city centre,36.3 km,12555,2260.0
3,Hotel Savera,4.0,Very Good,7458.0,4.0,Mylapore,,,4612,1091.0
4,Ibis Chennai OMR - An Accor Brand,4.1,Very Good,2545.0,4.0,Sholinganallur,,,3345,


In [5]:
df.isna().sum()

Hotel_Name                0
Rating                    1
Rating_Description        1
Reviews                   1
Star_Rating              95
Location                  0
Nearest_Landmark        290
Distance_to_Landmark    290
Price                     0
Tax                     176
dtype: int64

In [6]:
df = df.drop(columns=['Hotel_Name', 'Rating_Description'])
df.head()

Unnamed: 0,Rating,Reviews,Star_Rating,Location,Nearest_Landmark,Distance_to_Landmark,Price,Tax
0,4.3,2000.0,5.0,Royapettah,,,6390,767.0
1,3.7,8366.0,5.0,Near US Consulate,,,7198,2167.0
2,4.3,1856.0,5.0,Covelong,city centre,36.3 km,12555,2260.0
3,4.0,7458.0,4.0,Mylapore,,,4612,1091.0
4,4.1,2545.0,4.0,Sholinganallur,,,3345,


In [7]:
df[['Distance_Landmark', 'B']] = df['Distance_to_Landmark'].str.split(' ', 1, expand=True)
df.head()

Unnamed: 0,Rating,Reviews,Star_Rating,Location,Nearest_Landmark,Distance_to_Landmark,Price,Tax,Distance_Landmark,B
0,4.3,2000.0,5.0,Royapettah,,,6390,767.0,,
1,3.7,8366.0,5.0,Near US Consulate,,,7198,2167.0,,
2,4.3,1856.0,5.0,Covelong,city centre,36.3 km,12555,2260.0,36.3,km
3,4.0,7458.0,4.0,Mylapore,,,4612,1091.0,,
4,4.1,2545.0,4.0,Sholinganallur,,,3345,,,


In [8]:
df = df.drop(columns=['Distance_to_Landmark','B'])

In [9]:
df.head()

Unnamed: 0,Rating,Reviews,Star_Rating,Location,Nearest_Landmark,Price,Tax,Distance_Landmark
0,4.3,2000.0,5.0,Royapettah,,6390,767.0,
1,3.7,8366.0,5.0,Near US Consulate,,7198,2167.0,
2,4.3,1856.0,5.0,Covelong,city centre,12555,2260.0,36.3
3,4.0,7458.0,4.0,Mylapore,,4612,1091.0,
4,4.1,2545.0,4.0,Sholinganallur,,3345,,


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp
from xgboost import XGBRegressor

In [11]:
df.Tax = df.Tax.fillna(0)
df.Price = df.Price.fillna(0)
df.Distance_Landmark = df.Distance_Landmark.fillna(0)
df.Nearest_Landmark = df.Nearest_Landmark.fillna('Unknown')

df['Tax'] = df['Tax'].str.replace(',', '.')
df['Price'] = df['Price'].str.replace(',', '.')

df.Tax = df.Tax.astype(float)
df.Distance_Landmark = df.Distance_Landmark.astype(float)
df.Price = df.Price.astype(float)

df.head()

Unnamed: 0,Rating,Reviews,Star_Rating,Location,Nearest_Landmark,Price,Tax,Distance_Landmark
0,4.3,2000.0,5.0,Royapettah,Unknown,6.39,767.0,0.0
1,3.7,8366.0,5.0,Near US Consulate,Unknown,7.198,2.167,0.0
2,4.3,1856.0,5.0,Covelong,city centre,12.555,2.26,36.3
3,4.0,7458.0,4.0,Mylapore,Unknown,4.612,1.091,0.0
4,4.1,2545.0,4.0,Sholinganallur,Unknown,3.345,,0.0


In [12]:
X = df.drop(columns="Price")
y = df.Price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((464, 7), (116, 7), (464,), (116,))

In [13]:
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(poly=2, impute='mean'), ["Rating",'Reviews','Star_Rating','Distance_Landmark','Tax']),
    ('categoric', cat_pipe(encoder='onehot'), ["Location",'Nearest_Landmark']),
])
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo',XGBRegressor(n_jobs=-1, random_state=42))
])


model = RandomizedSearchCV(pipeline, rsp.xgb_poly_params, cv=2, n_iter=150, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 2 folds for each of 150 candidates, totalling 300 fits
{'algo__colsample_bytree': 0.5687508340232413, 'algo__gamma': 8, 'algo__learning_rate': 0.48872853588355797, 'algo__max_depth': 2, 'algo__n_estimators': 152, 'algo__reg_alpha': 0.22233337605920384, 'algo__reg_lambda': 7.2614145160288395, 'algo__subsample': 0.6035171238433423, 'prep__numeric__poly__degree': 1, 'prep__numeric__poly__interaction_only': True}
0.9702025135258289 0.6078709583994836 0.6852232059253309
