In [61]:
import pandas as pd
import numpy as np


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import pickle

from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder

In [135]:
# Load the training data; the path is relative to the notebook's working directory.
train_path = 'train.csv'
train_df = pd.read_csv(train_path)


In [136]:
# Property types that keep their own category; every other value
# (including missing ones) is collapsed into the catch-all "Other" bucket.
property_type_allowed = [
    'Apartment',
    'House',
    'Townhouse',
    'Serviced apartment',
    'Condominium',
    'Loft',
    'Bed and breakfast',
    'Guest suite',
    'Guesthouse',
    'Other',
    'Hostel',
    'Boutique hotel',
    'Hotel',
    'Bungalow',
    'Cottage',
    'Villa',
    'Boat',
    'Aparthotel',
]
is_allowed_type = train_df["property_type"].isin(property_type_allowed)
train_df.loc[~is_allowed_type, "property_type"] = "Other"

# Columns that are one-hot encoded. The order matters: it fixes the column
# layout produced by the encoder.
cat_features = [
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'cancellation_policy',
    'require_guest_phone_verification',
]

# Numeric columns that are passed to the model without encoding.
int_features = [
    'accommodates',
    'bedrooms',
    'beds',
    'security_deposit',
]

# Union of the categorical and numeric input columns.
# NOTE(review): not referenced by the cells visible here; presumably kept
# for code that consumes the saved artifacts - confirm.
features = [
    'accommodates',
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'cancellation_policy',
    'bedrooms',
    'beds',
    'require_guest_phone_verification',
    'security_deposit',
]

In [137]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    The per-sample error is ``abs((y_true - y_pred) / y_true)``. Samples whose
    error is not finite (division by a zero target, or NaN in either input)
    are excluded from the mean.

    Parameters
    ----------
    y_true : pandas.Series, numpy.ndarray or sequence of numbers
        Ground-truth values.
    y_pred : pandas.Series, numpy.ndarray or sequence of numbers
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Mean of the finite per-sample errors multiplied by 100, or NaN when
        no sample yields a finite error.
    """
    # Leave Series untouched so Series-vs-Series arithmetic stays
    # index-aligned; anything else becomes a float array so plain lists and
    # NumPy arrays work too (the previous version relied on Series-only
    # methods and raised AttributeError for them).
    if not isinstance(y_true, pd.Series):
        y_true = np.asarray(y_true, dtype=float)
    if not isinstance(y_pred, pd.Series):
        y_pred = np.asarray(y_pred, dtype=float)

    # Division by zero is expected here and handled below, so silence the
    # NumPy warnings it would otherwise emit.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = np.asarray(np.abs((y_true - y_pred) / y_true), dtype=float)

    finite_ratios = ratios[np.isfinite(ratios)]
    if finite_ratios.size == 0:
        return np.nan
    return finite_ratios.mean() * 100

In [138]:
# Keep only the rows with 0 < price < 1500.
price_in_range = (train_df.price > 0) & (train_df.price < 1500)

# Renumber the surviving rows 0..n-1: the one-hot frame built later has a
# default positional index, and the column-wise concat matches rows on the
# index. drop=True discards the old index instead of inserting it as an
# extra `index` column, which also makes this cell safe to re-run (without
# it, repeated runs add `index`, then `level_0`, and then raise ValueError).
train_df = train_df.loc[price_in_range].reset_index(drop=True)

In [139]:
# Learn the category vocabulary of each categorical column from the
# filtered training rows.
# NOTE(review): with the default handle_unknown='error', transform() raises
# on a category that was not seen here - confirm this is the intended
# behaviour for the pickled encoder.
categorical_inputs = train_df[cat_features]
encoder = OneHotEncoder()
encoder.fit(categorical_inputs)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [142]:
# One-hot encode the categorical columns. The encoder output is sparse, so
# it is densified before being wrapped in a DataFrame; the resulting frame
# has default positional row and column labels.
tmp_df = encoder.transform(train_df[cat_features])
dense_one_hot = tmp_df.toarray()
cat_df = pd.DataFrame(data=dense_one_hot)

In [143]:
# Place the numeric columns and the one-hot columns side by side; rows are
# matched on the index.
numeric_part = train_df[int_features]
result = pd.concat(
    objs=[numeric_part, cat_df],
    axis='columns',
    sort=False,
)

In [144]:
# Treat missing values as 0, then store the listed numeric columns as
# integers (they cannot be cast while they still contain NaN).
result = result.fillna(0).astype({
    'bedrooms': 'int',
    'beds': 'int',
    'security_deposit': 'int',
})

In [145]:
# Feature matrix and target column.
x_train, y_train = result, train_df['price']

# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default proportion applies; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(x_train, y_train, random_state=42)
X_tr, X_tst, y_tr, y_tst = split_parts

In [146]:
# Train on log1p(price) and map predictions back with expm1, so the
# hold-out error is measured in the original price units.
model = LGBMRegressor(n_estimators=150, objective='mae')
log_target = np.log1p(y_tr)
model.fit(X_tr, log_target)

log_preds = model.predict(X_tst)
preds = np.expm1(log_preds)

# Last expression of the cell: displays the hold-out MAPE in percent.
MAPE(y_tst, preds)

29.541465697953846

In [147]:
# Persist the fitted model and the fitted encoder as pickle files in the
# working directory (model first, then encoder).
artifacts = (
    ('price_model.pkl', model),
    ('one_hot_encoder.pkl', encoder),
)
for pkl_path, artifact in artifacts:
    with open(pkl_path, 'wb') as output_file:
        pickle.dump(artifact, output_file)