In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import  preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
import re
from sklearn.cluster import KMeans

In [37]:
#try xgboost
#fucntion from SRK
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, \
     seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1,\
     max_depth = 6,cv_dict = None,verbose_eval=True):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = eta
    param['max_depth'] = max_depth
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y,feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y,feature_names=feature_names)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,\
        early_stopping_rounds=early_stop,evals_result = cv_dict,verbose_eval = verbose_eval)
    else:
        xgtest = xgb.DMatrix(test_X,feature_names=feature_names)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

class CVstatistics(object):
    
    """
    self.result : the result dataframe storing the cv results
    self.endpoint : the first ending point for the validations
    self.turns: the turns for each validation
    
    validCurve : plot the validation curve,stop at the first endpoint
    """
    def __init__(self,result_dict,metric,k=5):
        self.metric = metric
        if type(result_dict) == pd.DataFrame:
            self.result = result_dict
        else:
            tempDict = {}
            for phase in ['train','test']:
                for turn in range(k):
                    tempDict[phase+str(turn)]=cv_result[turn][phase][metric]
                    self.result=pd.DataFrame(dict([ (key,pd.Series(v)) for key,v in tempDict.iteritems()]))    
        
        self.endpoint =len(self.result.filter(like = 'train').dropna())
        
        self.turns = self.result.filter(like = 'test').\
            apply(lambda x : ~np.isnan(x)).cumsum(axis=0).iloc[len(self.result)-1,:]

    def validCurve(self,start=0,stop_at_first = True):
        if stop_at_first:
            eout = self.result.filter(like = 'test').dropna().mean(axis=1)
            ein =  self.result.filter(like = 'train').dropna().mean(axis=1)
        else:
            eout = self.result.filter(like = 'test').mean(axis=1)
            ein =  self.result.filter(like = 'train').mean(axis=1)
        plt.plot(range(len(eout)), eout,
        range(len(ein)), ein)
        plt.xlabel("turn")
        plt.ylabel(self.metric)
        plt.title('Validation Curve')
        
        plt.show()
    
    def errorsAt(self,turn):
        eout = self.result.filter(like = 'test').loc[turn].mean()
        ein = self.result.filter(like = 'train').loc[turn].mean()
        return eout,ein
    

def showImportance(model,factor_name):
    factors = model.get_score(importance_type=factor_name)
    factor_list = []
    total = sum(factors.values())
    for key in factors:
        factors[key] = factors[key]*1.0/total
        factor_list.append((key,factors[key]))
    return sorted(factor_list,key=lambda x : x[1],reverse=True)

def showFscore(model,normalize = True):
    factors = model.get_fscore()
    factor_list = []
    total = sum(factors.values())
    for key in factors:
        if normalize:
            factors[key] = factors[key]*1.0/total
        else:
            factors[key] = factors[key]
        factor_list.append((key,factors[key]))
    return sorted(factor_list,key=lambda x : x[1],reverse=True)
    
#feature processing functions
def proecessStreet(address):
    #remove the building number
    pattern = re.compile('^[\d-]*[\s]+')
    street = removePunctuation(pattern.sub('',address))
    
    #sub the st to street
    pattern = re.compile('( st)$')
    street = pattern.sub(' street',street)
    
    #sub the ave to avenue
    pattern = re.compile('( ave)$')
    street = pattern.sub(' avenue',street)
    
    pattern = re.compile('(\d+)((th)|(st)|(rd)|(nd))')
    street = pattern.sub('\g<1>',street)
    
    #deal with the w 14 street => west 14 street
    pattern = re.compile('(w)(\s+)(\d+)')    
    street = pattern.sub('west \g<3>',street)
    
    #deal with the e....
    pattern = re.compile('(e)(\s+)(\d+)')    
    street = pattern.sub('east \g<3>',street)
    
    return street

In [3]:
def processMap(df):
    for i in ['latitude', 'longitude']:
        Q1 = df[i].quantile(0.005)
        Q3 = df[i].quantile(0.995)
        IQR = Q3 - Q1
        upper = Q3
        lower = Q1
        df.ix[(df[i]>upper)|(df[i]<lower),i] = np.nan
        #df.ix[:,i] =  df[i].round(3) 
    return 

In [4]:
def getCluster(train_df,test_df,k):
    cluster = KMeans(k,random_state = 2333)
    cluster.fit(train_df[['latitude', 'longitude']].dropna())
    train_df['cluster_id_'+str(k)]=map(lambda x,y: cluster.predict(np.array([x,y]).reshape(1,-1))[0] \
                           if ~(np.isnan(x)|np.isnan(y)) else -1,\
                           train_df['latitude'],train_df['longitude'])
    test_df['cluster_id_'+str(k)]=map(lambda x,y: cluster.predict(np.array([x,y]).reshape(1,-1))[0] \
                           if ~(np.isnan(x)|np.isnan(y)) else -1,\
                           test_df['latitude'],test_df['longitude'])

In [5]:
def categorical_statistics(train_df,test_df,cf,nf,update_df = None,\
                           get_mean=True,get_std=True,get_median=True,get_min = True,get_max = True,\
                           get_size = True,get_normalized_in_group = True):
    statistics ={}
    if get_mean:
        statistics['mean']='mean'
    if get_max:
        statistics['max']='max'
    if get_min:
        statistics['min']='min'
    if get_std:
        statistics['std']='std'
    if get_median:
        statistics['median']='median'
    if get_size:
        statistics['size']='size'
        
    values = train_df.groupby(cf)[nf].agg(statistics)
    values = values.add_prefix(cf+'_'+nf+'_')
    
    new_feature = list(values.columns)
    
    #consider using -1 for others
    updateM = test_df[[cf]].join(values, on = cf, how="left")[new_feature]#.fillna(-1)
    
    if update_df is None: update_df = test_df
    
    for f in new_feature:
        if f not in update_df.columns: 
            update_df[f] = np.nan
    #update the statistics excluding the normalized value
    update_df.update(updateM)
    
    #update the normalized value 
    if get_normalized_in_group:
        if not (get_mean and get_std):
            print 'Can\' get normailized score without gettting mean and std'
            return
        normal_feature = cf+'_'+nf+'_normalized'
        update_df[normal_feature] = (update_df[nf]-update_df[cf+'_'+nf+'_mean'])/update_df[cf+'_'+nf+'_std']
        update_df[normal_feature] = update_df[normal_feature].fillna(0)
    
    for f in new_feature:
        update_df[f] = update_df[f].fillna(-1)

In [139]:
#the new one not using cv-manner for the statistics
def categorical_statistics(train_df,test_df,cf,nf,\
                           get_median=True,get_min = True,get_max = True,\
                           get_normalized_in_group = True,mini_size = 20):
    statistics ={}
    statistics['mean']='mean'
    statistics['std']='std'
    statistics['size']='size'

    if get_max:
        statistics['max']='max'
    if get_min:
        statistics['min']='min'
    if get_median:
        statistics['median']='median'
        
    values = train_df.groupby(cf)[nf].agg(statistics)
    values = values.add_prefix(cf+'_'+nf+'_')
    
    new_feature = list(values.columns)
    
    #consider using -1 for others
    updateTest = test_df[[cf]].join(values, on = cf, how="left")[new_feature]#.fillna(-1)
    updateTrain = train_df[[cf]].join(values, on = cf, how="left")[new_feature]#.fillna(-1)
        
    for f in new_feature:
        if f not in test_df.columns: 
            test_df[f] = np.nan
        if f not in train_df.columns:
            train_df[f] = np.nan
    #update the statistics excluding the normalized value
    test_df.update(updateTest)
    train_df.update(updateTrain)
    
    #update the normalized value 
    if get_normalized_in_group:
        normal_feature = cf+'_'+nf+'_normalized'
        train_df.ix[train_df[cf+'_'+nf+'_'+'size']>=mini_size,normal_feature] = \
                              (train_df[nf]-train_df[cf+'_'+nf+'_mean'])/train_df[cf+'_'+nf+'_std']
        train_df.ix[train_df[cf+'_'+nf+'_'+'size']< mini_size,normal_feature] =0
        
        test_df.ix[test_df[cf+'_'+nf+'_'+'size']>=mini_size,normal_feature] = \
                              (test_df[nf]-test_df[cf+'_'+nf+'_mean'])/test_df[cf+'_'+nf+'_std']
        test_df.ix[test_df[cf+'_'+nf+'_'+'size']< mini_size,normal_feature] =0
        
        train_df[cf+'_'+nf+'_normalized']=train_df[cf+'_'+nf+'_normalized'].fillna(0)
        test_df[cf+'_'+nf+'_normalized']=test_df[cf+'_'+nf+'_normalized'].fillna(0)
        
    for f in new_feature:
        train_df[f] = train_df[f].fillna(-1)
        test_df[f] = test_df[f].fillna(-1)
        
def categorical_size(train_df,test_df,cf):
    values =train_df.groupby(cf)['interest_level'].agg({'size':'size'})
    values = values.add_prefix(cf+'_')
    new_feature = list(values.columns)
    updateTest = test_df[[cf]].join(values, on = cf, how="left")[new_feature].fillna(-1)
    updateTrain = train_df[[cf]].join(values, on = cf, how="left")[new_feature]#.fillna(-1)
    
    for f in new_feature:
        if f not in test_df.columns: 
            test_df[f] = np.nan
        if f not in train_df.columns:
            train_df[f] = np.nan
    #update the statistics excluding the normalized value
    test_df.update(updateTest)
    train_df.update(updateTrain)

In [6]:
#lodaing data
data_path = "../../kaggleData/2sigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)


(49352, 15)
(74659, 14)


In [7]:
#basic numerical features
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]

In [8]:
#some transfromed features
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour

#some new numerical features related to the price
train_df["price_per_bath"] =  (train_df["price"]*1.0/train_df["bathrooms"]).replace(np.Inf,-1)
train_df["price_per_bed"] = (train_df["price"]*1.0/train_df["bedrooms"]).replace(np.Inf,-1)
train_df["bath_per_bed"] = (train_df["bathrooms"]*1.0/train_df["bedrooms"]).replace(np.Inf,-1)
train_df["price_per_room"] = (train_df["price"]*1.0/(train_df["bedrooms"]+train_df["bathrooms"])).replace(np.Inf,-1)


# adding all these new features to use list # "listing_id",
features_to_use.extend(["num_photos", "num_features", "num_description_words",\
                        "created_year","listing_id", "created_month", "created_day", "created_hour"])
#price new features
features_to_use.extend(["price_per_bed","bath_per_bed","price_per_room"])


In [9]:
#filter the outliers to be nan
processMap(train_df)

In [10]:
#adding the house type
train_df['house_type']=pd.Series(map(lambda x,y:(x,y),train_df['bedrooms'],train_df['bathrooms'])).apply(str)
#features_to_use.append('house_type')

In [11]:
#prepare for training
target_num_map = {'high':0, 'medium':1, 'low':2}

train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

KF=KFold(len(train_df),5,shuffle=True,random_state = 42)

In [146]:
features = list(features_to_use)
categorical = ["display_address", "street_address",'building_id','manager_id','house_type']
#categorical = ["display_address", "street_address",'building_id','manager_id']
features.extend(categorical)
features.extend(['cluster_id_10','cluster_id_30'])

"""
man_st_nf = ["bathrooms", "bedrooms","price_per_bed","bath_per_bed","price_per_room",\
                  "num_photos", "num_features", "num_description_words",'price']

clus_st_nf = ["price_per_bed","price_per_room",'price','num_features','num_photos']

mana_statistics =['mean','max','min','median','size','normalized']
cluster_statistics = ['normalized','mean','median']

for st in main_statistics:
    features.extend(map(lambda x : 'cluster_id_10_'+x+'_'+st,main_st_nf))

for st in mana_statistics:
    features.extend(map(lambda x : 'manager_id_'+x+'_'+st,man_st_nf))
    
"""

#try all the statistics
main_st_nf = ["bathrooms", "bedrooms","price_per_bed","bath_per_bed","price_per_room",\
                  "num_photos", "num_features", "num_description_words",'price']
main_statistics =['mean','max','min','median','size']
#main_statistics =['mean','max','min','median','normalized']
for st in main_statistics:
    features.extend(map(lambda x : 'cluster_id_10_'+x+'_'+st,main_st_nf))
    features.extend(map(lambda x : 'cluster_id_30_'+x+'_'+st,main_st_nf))
    features.extend(map(lambda x : 'manager_id_'+x+'_'+st,main_st_nf))
    features.extend(map(lambda x : 'house_type_'+x+'_'+st,main_st_nf)) 

#features.extend(['manager_id_size','house_type_size','cluster_id_10_size','cluster_id_30_size'])
features=list(set(features))


"""
main_st_nf = ["bathrooms", "bedrooms","price_per_bed","bath_per_bed","price_per_room",\
                  "num_photos", "num_features", "num_description_words",'price']

#mana_statisitics = ['normalized','mean']
mana_statisitics = ['mean']
cluster10_statisitics = ['mean']
cluster30_statisitics = ['normalized']
#cluster_statisitics = ['mean']
housetype_statisitics = ['normalized']

for st in main_st_nf:
    features.extend(map(lambda x : 'cluster_id_10_'+st+'_'+x,cluster10_statisitics))
    features.extend(map(lambda x : 'cluster_id_30_'+st+'_'+x,cluster30_statisitics))
    features.extend(map(lambda x : 'manager_id_'+st+'_'+x,mana_statisitics))
    #features.extend(map(lambda x : 'house_type_'+st+'_'+x,housetype_statisitics))
"""

'\nmain_st_nf = ["bathrooms", "bedrooms","price_per_bed","bath_per_bed","price_per_room",                  "num_photos", "num_features", "num_description_words",\'price\']\n\n#mana_statisitics = [\'normalized\',\'mean\']\nmana_statisitics = [\'mean\']\ncluster10_statisitics = [\'mean\']\ncluster30_statisitics = [\'normalized\']\n#cluster_statisitics = [\'mean\']\nhousetype_statisitics = [\'normalized\']\n\nfor st in main_st_nf:\n    features.extend(map(lambda x : \'cluster_id_10_\'+st+\'_\'+x,cluster10_statisitics))\n    features.extend(map(lambda x : \'cluster_id_30_\'+st+\'_\'+x,cluster30_statisitics))\n    features.extend(map(lambda x : \'manager_id_\'+st+\'_\'+x,mana_statisitics))\n    #features.extend(map(lambda x : \'house_type_\'+st+\'_\'+x,housetype_statisitics))\n'

In [None]:
#running and getting the cv from xgboost
cv_scores = []
cv_result = []

#K-FOLD already defined.If not ,use
#KF=KFold(len(train_X),5,shuffle=True,random_state = 42)
i=0
for dev_index, val_index in KF:
        result_dict = {}
        
        """some preprocessing like feature constructed in cv manners"""
        #split the orginal train set into dev_set and val_set
        dev_set, val_set = train_df.iloc[dev_index,:] , train_df.iloc[val_index,:] 
        
        getCluster(dev_set,val_set,30)
        getCluster(dev_set,val_set,10)
        
        skf=KFold(len(dev_set),5,shuffle=True,random_state = 42)
        """
        categorical_statistics(train_df,test_df,cf,nf,update_df = None)
        """
        
        
        
        
        #statitstic based on cid ,cv-manner statistics
        for f in main_st_nf:
            """
            for train,test in skf:
                categorical_statistics(dev_set.iloc[train,:],dev_set.iloc[test,:],'cluster_id_10',f\
                              ,update_df = dev_set)
                categorical_statistics(dev_set.iloc[train,:],dev_set.iloc[test,:],'manager_id',f\
                              ,update_df = dev_set)
                categorical_statistics(dev_set.iloc[train,:],dev_set.iloc[test,:],'cluster_id_30',f\
                              ,update_df = dev_set)
                categorical_statistics(dev_set.iloc[train,:],dev_set.iloc[test,:],'house_type',f\
                              ,update_df = dev_set)
            """
            categorical_statistics(dev_set,val_set,'cluster_id_10',f)
            categorical_statistics(dev_set,val_set,'manager_id',f)
            categorical_statistics(dev_set,val_set,'cluster_id_30',f)
            categorical_statistics(dev_set,val_set,'house_type',f)
            categorical_size(dev_set,val_set,'cluster_id_10')
            categorical_size(dev_set,val_set,'cluster_id_30')
            categorical_size(dev_set,val_set,'manager_id')
            categorical_size(dev_set,val_set,'house_type')
        

        """ 
         runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, \
         seed_val=0, early_stop = 20,num_rounds=10000, eta = 0.1,\
         max_depth = 6,cv_dict = None):
         """
        for f in categorical:
    
            if dev_set[f].dtype=='object':
            #print(f)
                lbl = preprocessing.LabelEncoder()
                lbl.fit(list(dev_set[f])+list(val_set[f]))
                dev_set[f] = lbl.transform(list(dev_set[f].values))
                val_set[f] = lbl.transform(list(val_set[f].values))
        
        dev_X, val_X = dev_set[features].as_matrix(), val_set[features].as_matrix()
        dev_y, val_y = train_y[dev_index], train_y[val_index]
       
        
        preds, model = runXGB(dev_X, dev_y, val_X, val_y,early_stop  = 20,\
                              feature_names = features,cv_dict = result_dict,verbose_eval=100)
       
        loss = log_loss(val_y, preds)
        cv_scores.append(loss)
        cv_result.append(result_dict)
        i+=1
        print 'loss for the turn '+str(i)+' is '+str(loss)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [97]:
val_set.columns

Index([                     u'bathrooms',                       u'bedrooms',
                          u'building_id',                        u'created',
                          u'description',                u'display_address',
                             u'features',                 u'interest_level',
                             u'latitude',                     u'listing_id',
       ...
             u'cluster_id_30_price_mean',       u'cluster_id_30_price_size',
       u'cluster_id_30_price_normalized',           u'house_type_price_std',
                 u'house_type_price_min',           u'house_type_price_max',
              u'house_type_price_median',          u'house_type_price_mean',
                u'house_type_price_size',    u'house_type_price_normalized'],
      dtype='object', length=281)

In [106]:
result_dict['train']['mlogloss'][240]

0.355509

In [142]:
#cvResult.validCurve(stop=False)
#some errors at certain turn to see the descending
cv_scores
np.mean(cv_scores)

0.5468490651899286

In [144]:
#show the importance of the features
showImportance(model,'gain')

[('building_id', 0.01723441756434148),
 ('price', 0.015783596915611892),
 ('price_per_bed', 0.01578084555607767),
 ('cluster_id_10_price_per_room_mean', 0.01571128600467027),
 ('cluster_id_30_price_median', 0.013541954797159122),
 ('bathrooms', 0.013485720956117766),
 ('cluster_id_10_bath_per_bed_mean', 0.013054920962905033),
 ('price_per_room', 0.013051249038982055),
 ('cluster_id_10_price_per_room_median', 0.013027376443609387),
 ('cluster_id_10_price_min', 0.012718386802418432),
 ('cluster_id_10_bathrooms_max', 0.012659329421988838),
 ('cluster_id_10_price_median', 0.012243149478366428),
 ('cluster_id_10_num_photos_median', 0.012224888388582503),
 ('cluster_id_30_price_per_room_median', 0.012019061745875742),
 ('manager_id_price_per_room_mean', 0.012012280000561966),
 ('cluster_id_10_price_per_bed_max', 0.011392594990925192),
 ('num_photos', 0.010635510683618305),
 ('manager_id_price_per_room_median', 0.01037268933975588),
 ('cluster_id_10_num_description_words_mean', 0.010294260898

In [60]:
title = map(lambda x:x[0],temp)
values = map(lambda x:x[1],temp)
cums = pd.Series(values).cumsum()

In [62]:
np.sum(values)

41841

In [64]:
len(values)

201

In [66]:
cums[100]

37964

In [67]:
title[100]

'cluster_id_10_price_min'

In [65]:
fig=plt.figure()
axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # main axes
axes1.bar(range(len(title)), cums)
#axes1.xticks(range(len(title)), title)
fig.set_size_inches(18.5, 10.5)
plt.show()


In [145]:
showFscore(model,False)

[('price', 1795),
 ('listing_id', 1685),
 ('price_per_room', 1624),
 ('street_address', 1364),
 ('display_address', 1256),
 ('building_id', 1246),
 ('latitude', 1152),
 ('longitude', 1152),
 ('num_description_words', 1106),
 ('price_per_bed', 940),
 ('created_day', 884),
 ('manager_id', 868),
 ('manager_id_num_features_mean', 792),
 ('num_features', 762),
 ('created_hour', 759),
 ('manager_id_price_per_room_min', 629),
 ('manager_id_num_description_words_mean', 608),
 ('manager_id_bath_per_bed_mean', 586),
 ('manager_id_num_description_words_max', 567),
 ('manager_id_num_photos_mean', 562),
 ('num_photos', 561),
 ('manager_id_price_per_room_median', 547),
 ('manager_id_num_description_words_min', 542),
 ('manager_id_bedrooms_mean', 531),
 ('manager_id_price_max', 527),
 ('manager_id_price_per_bed_median', 522),
 ('manager_id_price_per_bed_mean', 508),
 ('manager_id_price_min', 503),
 ('manager_id_price_per_room_max', 492),
 ('manager_id_price_median', 481),
 ('manager_id_bathrooms_mean

In [69]:
temp  = pd.DataFrame(showFscore(model,False))

In [71]:
temp.columns = ['feature','times']

In [74]:
temp = temp.set_index('feature')

In [87]:
temp.filter(like = 'house_type',axis = 0)

Unnamed: 0_level_0,times
feature,Unnamed: 1_level_1
house_type_num_description_words_normalized,516
house_type_price_per_bed_normalized,429
house_type_price_normalized,400
house_type_num_features_normalized,395
house_type_price_per_room_normalized,381
house_type_num_photos_normalized,274
house_type_bath_per_bed_normalized,176
house_type_bedrooms_normalized,166
house_type_bathrooms_normalized,125
house_type_num_features_mean,107
