In [2]:
import pandas as pd
from feature_engine.imputation import AddMissingIndicator, CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OneHotEncoder,RareLabelEncoder,CountFrequencyEncoder
from feature_engine.selection import  DropFeatures
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
import json,re
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import Parallel, delayed
#is host in bsas
#

In [3]:
df = pd.read_csv('listings_clean.csv')
df_test = df.sample(frac=0.2, random_state=42)
df_train = df.drop(df_test.index)
del df

In [4]:
X = df_train.drop('price', axis=1)
y = df_train['price']
del df_train
X_test = df_test.drop('price', axis=1)
y_test = df_test['price']
del df_test

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23477 entries, 0 to 29345
Data columns (total 64 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            23477 non-null  int64  
 1   name                                          23477 non-null  object 
 2   description                                   22995 non-null  object 
 3   neighborhood_overview                         12967 non-null  object 
 4   host_id                                       23477 non-null  int64  
 5   host_since                                    23477 non-null  object 
 6   host_location                                 18157 non-null  object 
 7   host_response_time                            20478 non-null  object 
 8   host_response_rate                            20478 non-null  object 
 9   host_acceptance_rate                          21457 non-null  obje

In [6]:
#print all object columns
for col in X.columns:
    if X[col].dtype == 'object':
        print(col)

name
description
neighborhood_overview
host_since
host_location
host_response_time
host_response_rate
host_acceptance_rate
host_is_superhost
host_neighbourhood
host_verifications
host_identity_verified
neighbourhood_cleansed
property_type
room_type
amenities
has_availability
first_review
last_review
instant_bookable


In [10]:
today = pd.to_datetime('today')

def binarize_genres(X,column):
    df_binarized = X[column].apply(lambda x: pd.Series([1] * len(x), index=x)).fillna(0).infer_objects()
    df_binarized = pd.concat([X,df_binarized],axis=1)#.drop(column,axis=1)
    return df_binarized

pipe = Pipeline([
    
    ('countNameLen',FunctionTransformer(lambda x: x.assign(name_len=x['name'].apply(lambda x: len(eval(x)))))),
    
    ('descLen',FunctionTransformer(lambda x: x.assign(description_len=x['description'].apply(lambda x: len(x)) if type(x) == str else 0))),
    ('transformNeighbourhood',FunctionTransformer(lambda x: x.assign(neighborhood_overview = len(x['neighborhood_overview'].str.split('<br/>')) if type(x['neighborhood_overview']) == str else 0))),
    ('descriptionLen',FunctionTransformer(lambda x: x.assign(description = x['description'].apply(lambda x: len(x)) if type(x['description']) == str else 0))),
    ('responseRate',FunctionTransformer(lambda x: x.assign(host_response_rate = x['host_response_rate'].apply(lambda x: float(x[:-1]) if type(x) == str else -99)))),
    ('acceptanceRate',FunctionTransformer(lambda x: x.assign(host_acceptance_rate = x['host_acceptance_rate'].apply(lambda x: float(x[:-1]) if type(x) == str else -99)))),
    ('hostIsSuperhost',FunctionTransformer(lambda x: x.assign(host_is_superhost = x['host_is_superhost'].apply(lambda x: 1 if x == 't' else 0)))),
    #El host vive en buenos aires?
    ('isHostLocationBsAs',FunctionTransformer(lambda x: x.assign(host_location = x['host_location'].apply(lambda x: 1 if type(x) == str and 'Buenos Aires' in x else 0)))),
    ('daysSinceHost',FunctionTransformer(lambda x: x.assign(host_since = x['host_since'].apply(lambda x: (today - pd.to_datetime(x)).days)))),
    ('fillMissings',CategoricalImputer(imputation_method='missing',variables=['host_response_time','host_neighbourhood'])),
    ('responseTimeFrecuency',CountFrequencyEncoder(encoding_method='frequency',variables=['host_response_time'])),
    ('isHostNeigbourhoodSame',FunctionTransformer(lambda x: x.assign(host_neighbourhood = x.apply(lambda x: 1 if x['host_neighbourhood'] == x['neighbourhood_cleansed'] else 0,axis=1)))),
    
    #('verificationsFill',CategoricalImputer(imputation_method='missing',variables=['host_verifications'])),
    ('verificationsList',FunctionTransformer(lambda x: x.assign(host_verifications = x['host_verifications'].apply(lambda x: eval(x))))),
    
    ('verificationsOneHotList',FunctionTransformer(lambda x: binarize_genres(x,'host_verifications'))),
    ('verificationsLen',FunctionTransformer(lambda x: x.assign(host_verifications = x['host_verifications'].apply(lambda x: len(x))if type(x) == list else 0))),
    ('idVerified',FunctionTransformer(lambda x: x.assign(host_identity_verified = x['host_identity_verified'].apply(lambda x: 1 if x == 't' else 0)))),
    
    
    ('fillMissings2',CategoricalImputer(imputation_method='missing',variables=['property_type','room_type'])),
    ('propertyRoomType',CountFrequencyEncoder(encoding_method='frequency',variables=['property_type','room_type','neighbourhood_cleansed'])),


    ("dropFiller",FunctionTransformer(lambda x: x.drop(['id','name'],axis=1))),

    ('dropCat',FunctionTransformer(lambda x: x.drop(['amenities','has_availability','first_review','last_review','instant_bookable'],axis=1))),
])
# Se asume si el host no tiene superhost f o t, se toma como que no es superhost

x2 = pipe.fit_transform(X)
for col in x2.columns:
    if x2[col].dtype == 'object':
        print(col)

In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

regr = XGBRegressor(n_estimators=1000, learning_rate=0.05)
regr.fit(x2, y)
print(regr.score(pipe.transform(X_test),y_test))


-0.0065556337682983745


