# Predictive Model

In [1]:
import pandas as pd
import numpy as np

#load data
house_rent = pd.read_csv('house_rent.csv', header = 0, na_values = [' ','NA','NAN','None'])

In [2]:
#check data structure
print(house_rent.shape)
print(house_rent.count())

(221947, 15)
Unnamed: 0              221946
unique_id               221946
bathrooms               221946
bedrooms                221946
city                    221921
list_price              221946
latitude                221946
longitude               221946
property_type           221946
lot_sqft                221946
sqft                    221946
state                   221946
year_built              221946
zip                     221946
rentzestimate_amount    221947
dtype: int64


In [3]:
#check NA
house_rent.loc[house_rent['year_built'].isnull()==True]

Unnamed: 0.1,Unnamed: 0,unique_id,bathrooms,bedrooms,city,list_price,latitude,longitude,property_type,lot_sqft,sqft,state,year_built,zip,rentzestimate_amount
209167,,,,,,,,,,,,,,,2700


In [4]:
#remove NA
house_rent = house_rent.loc[house_rent['year_built'].isnull()== False]
#check data type
print(house_rent.dtypes)

Unnamed: 0              float64
unique_id                object
bathrooms               float64
bedrooms                float64
city                     object
list_price              float64
latitude                float64
longitude               float64
property_type            object
lot_sqft                float64
sqft                    float64
state                    object
year_built              float64
zip                     float64
rentzestimate_amount      int64
dtype: object


In [5]:
#change zip code data type from float to string
house_rent.zip = house_rent.zip.astype(str)
#change year_built data type from float to int
house_rent.year_built = house_rent.year_built.astype(np.int64)
#create two variables: year_from_now and log_rent(the rent is right-skewed, log transformation on rent)
import datetime
house_rent['year_from_now'] = datetime.date.today().year - house_rent['year_built']
house_rent['log_rent'] = np.log(house_rent['rentzestimate_amount'])

In [6]:
#categorical variables: create dummies(state and property type)
#zip and city have too many levels, so I don't include them in the prediction
state = pd.get_dummies(house_rent['state']).add_suffix('_state')
property_type = pd.get_dummies(house_rent['property_type']).add_suffix('_type')
house_rent_dummy = pd.concat([house_rent,state,property_type],axis = 1)

In [7]:
#use a random sample(0.5%) to find the best model first and only select useful attributes
property_price = house_rent_dummy.loc[:,~house_rent_dummy.columns.isin(['Unnamed: 0','unique_id','city','property_type',
                                                                        'state','year_built','zip','rentzestimate_amount'
                                                                       ])].sample(frac = 0.005)
print(property_price.shape)
property_price.head()

(1110, 17)


Unnamed: 0,bathrooms,bedrooms,list_price,latitude,longitude,lot_sqft,sqft,year_from_now,log_rent,CA_state,GA_state,TX_state,APT_type,COND_type,MULT_type,RESI_type,Timeshare_type
193765,3.0,4.0,434900.0,30.192267,-97.893233,9200.0,2918.0,28,7.803843,0,0,1,0,0,0,1,0
217676,3.0,3.0,565000.0,30.197375,-97.832206,9200.0,1800.0,0,7.536364,0,0,1,0,0,0,1,0
215854,3.0,4.0,1300000.0,30.70524,-97.166962,144.0,4086.0,10,7.901007,0,0,1,0,0,0,1,0
138472,3.0,4.0,578000.0,30.465448,-98.033917,9200.0,3493.0,8,7.950855,0,0,1,0,0,0,1,0
186266,3.0,4.0,279500.0,33.818432,-116.462807,8276.0,1428.0,23,7.438384,1,0,0,0,0,0,1,0


In [8]:
#define features dataset(x) and target vector(y)
target = property_price.loc[:,property_price.columns.isin(['log_rent'])]
print('target:%s' %target.head())

features = property_price.loc[:,~property_price.columns.isin(['log_rent'])]
print('features:%s'%features.head())

target:        log_rent
193765  7.803843
217676  7.536364
215854  7.901007
138472  7.950855
186266  7.438384
features:        bathrooms  bedrooms  list_price   latitude   longitude  lot_sqft  \
193765        3.0       4.0    434900.0  30.192267  -97.893233    9200.0   
217676        3.0       3.0    565000.0  30.197375  -97.832206    9200.0   
215854        3.0       4.0   1300000.0  30.705240  -97.166962     144.0   
138472        3.0       4.0    578000.0  30.465448  -98.033917    9200.0   
186266        3.0       4.0    279500.0  33.818432 -116.462807    8276.0   

          sqft  year_from_now  CA_state  GA_state  TX_state  APT_type  \
193765  2918.0             28         0         0         1         0   
217676  1800.0              0         0         0         1         0   
215854  4086.0             10         0         0         1         0   
138472  3493.0              8         0         0         1         0   
186266  1428.0             23         1         0         0 

In [9]:
from sklearn import linear_model,metrics,preprocessing,svm,cross_validation
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.cross_validation import train_test_split,cross_val_score,KFold
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor



In [10]:
#min_max normalization for each feature
min_max_norm = preprocessing.MinMaxScaler()
feature_norm = min_max_norm.fit_transform(features)
target_array = np.squeeze(target.values)

In [11]:
#split training dataset and testing dataset
X_train, X_test, y_train, y_test = train_test_split(
    feature_norm, target_array, test_size=0.7, random_state=0)

In [13]:
#run regression models and calculate mse(mean squared error),rmse(root mean square error)and mae(mean absolute error)
#rmse on cross-validation is used for model selection, and rmse on testing set shows the model generalization performance
for name,met in [
        ('neural network',MLPRegressor(hidden_layer_sizes=(30,30,30), activation='logistic', 
                                       solver='lbfgs', alpha=0.001)),
        ('regression tree',DecisionTreeRegressor(max_depth=20, min_samples_leaf= 5, min_samples_split= 5)),
        ('linear regression', LinearRegression()),
        ('ridge regression', Ridge(fit_intercept=True, alpha=0.3)),
        ('lasso regression', Lasso(fit_intercept=True, alpha=0.3)),
        ('elastic-net regularization', ElasticNet(fit_intercept=True, alpha=0.3)),
        ('KNN', KNeighborsRegressor(n_neighbors=2,metric = 'euclidean')),
        ('random forests regressor', RandomForestRegressor(n_estimators=100, criterion='mse', max_depth= 10, 
                            min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                            max_features='auto', random_state=0)),
        ('extra trees regressor', ExtraTreesRegressor(n_estimators=100, criterion='mse', max_depth=10, 
                                                      min_samples_split=2, min_samples_leaf=1, 
                                                      min_weight_fraction_leaf=0.0, random_state=0)),
        ('gradient boosting regressor', GradientBoostingRegressor(loss='ls', alpha=0.95,
                                n_estimators=100, max_depth=10,
                                learning_rate=.1)),
        ('adaBoost regressor', AdaBoostRegressor(DecisionTreeRegressor(max_depth=10, min_samples_leaf= 5, min_samples_split= 5), 
                                                n_estimators=100, learning_rate=0.1, loss='linear', random_state=0)),
        ('bagging regressor',BaggingRegressor(DecisionTreeRegressor(max_depth=20, min_samples_leaf= 5, min_samples_split= 5),
                                              n_estimators=100,max_samples=1.0, max_features=1.0, random_state=0))
        ]:
    #cross validation to calculate rmse,mse,mae for model selection
    mse_cv = -cross_validation.cross_val_score(met, X_train, y_train, scoring='neg_mean_squared_error', cv=5,).mean()
    mae_cv = -cross_validation.cross_val_score(met, X_train, y_train, scoring='neg_mean_absolute_error', cv=5,).mean()
    rmse_cv = np.sqrt(abs(mse_cv))
    #generalizaton performance of the model on testing set
    met.fit(X_train, y_train)
    y_pred = met.predict(X_test)
    mse_test = metrics.mean_squared_error(y_test, y_pred)
    mae_test = metrics.mean_absolute_error(y_test, y_pred)
    rmse_test = np.sqrt(mse_test)
  
    print('Method: %s' %name)
    print('RMSE on testing: %.4f' %rmse_test)
    print('RMSE on CV: %.4f' %rmse_cv)
    print('MSE on testing: %.4f' %mse_test)
    print('MSE on CV: %.4f' %mse_cv)
    print('MAE on testing: %.4f' %mae_test)
    print('MAE on CV: %.4f' %mae_cv)
    print("\n")

Method: neural network
RMSE on testing: 0.3397
RMSE on CV: 0.3032
MSE on testing: 0.1154
MSE on CV: 0.0919
MAE on testing: 0.2294
MAE on CV: 0.2130


Method: regression tree
RMSE on testing: 0.3306
RMSE on CV: 0.3065
MSE on testing: 0.1093
MSE on CV: 0.0939
MAE on testing: 0.1977
MAE on CV: 0.1999


Method: linear regression
RMSE on testing: 0.4495
RMSE on CV: 0.3325
MSE on testing: 0.2020
MSE on CV: 0.1106
MAE on testing: 0.2652
MAE on CV: 0.2501


Method: ridge regression
RMSE on testing: 0.4105
RMSE on CV: 0.3298
MSE on testing: 0.1685
MSE on CV: 0.1088
MAE on testing: 0.2692
MAE on CV: 0.2537


Method: lasso regression
RMSE on testing: 0.6049
RMSE on CV: 0.6504
MSE on testing: 0.3659
MSE on CV: 0.4230
MAE on testing: 0.4427
MAE on CV: 0.4831


Method: elastic-net regularization
RMSE on testing: 0.6049
RMSE on CV: 0.6504
MSE on testing: 0.3659
MSE on CV: 0.4230
MAE on testing: 0.4427
MAE on CV: 0.4831


Method: KNN
RMSE on testing: 0.3796
RMSE on CV: 0.3663
MSE on testing: 0.1441
MS

With different samples, the performances of random forests and extra trees are actually very similar in terms of RMSE on cross validation. Here we pick random forest regressor and extra trees regressor given their smallest RMSE on cross validation, so now fit the model on the whole dataset to see its generalization performance.

In [14]:
#Random Forest
#select useful attributes
property_price_whole = house_rent_dummy.loc[:,~house_rent_dummy.columns.isin(['Unnamed: 0','unique_id','city','property_type',
                                                                        'state','year_built','zip','rentzestimate_amount'
                                                                       ])]
#define x and y
target = property_price_whole.loc[:,property_price_whole.columns.isin(['log_rent'])]

features = property_price_whole.loc[:,~property_price_whole.columns.isin(['log_rent'])]

#normalization
min_max_norm = preprocessing.MinMaxScaler()
feature_norm = min_max_norm.fit_transform(features)
target_array = np.squeeze(target.values)

#split traing and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    feature_norm, target_array, test_size=0.7, random_state=0)

#fit random forests model on training set
RF = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth= 10, 
                            min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                            max_features='auto', random_state=0)

#use cross-validation to calculate rmse,mse,mae      
mse_cv = -cross_validation.cross_val_score(RF, X_train, y_train, scoring='neg_mean_squared_error', cv=5,).mean()
mae_cv = -cross_validation.cross_val_score(RF, X_train, y_train, scoring='neg_mean_absolute_error', cv=5,).mean()
rmse_cv = np.sqrt(abs(mse_cv))

#generalization performance of the model
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_pred)
mae_test = metrics.mean_absolute_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
  
print('RMSE on testing: %.4f' %rmse_test)
print('RMSE on CV: %.4f' %rmse_cv)
print('MSE on testing: %.4f' %mse_test)
print('MSE on CV: %.4f' %mse_cv)
print('MAE on testing: %.4f' %mae_test)
print('MAE on CV: %.4f' %mae_cv)


RMSE on testing: 0.1121
RMSE on CV: 0.1128
MSE on testing: 0.0126
MSE on CV: 0.0127
MAE on testing: 0.0642
MAE on CV: 0.0645


In [15]:
#Extra Trees
#fit extra trees model on training set
ET = ExtraTreesRegressor(n_estimators=100, criterion='mse', max_depth=10, 
                                                      min_samples_split=2, min_samples_leaf=1, 
                                                      min_weight_fraction_leaf=0.0, random_state=0)

#use cross-validation to calculate rmse,mse,mae      
mse_cv = -cross_validation.cross_val_score(ET, X_train, y_train, scoring='neg_mean_squared_error', cv=5,).mean()
mae_cv = -cross_validation.cross_val_score(ET, X_train, y_train, scoring='neg_mean_absolute_error', cv=5,).mean()
rmse_cv = np.sqrt(abs(mse_cv))

#generalization performance of the model
ET.fit(X_train, y_train)
y_pred = ET.predict(X_test)
mse_test = metrics.mean_squared_error(y_test, y_pred)
mae_test = metrics.mean_absolute_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
  
print('RMSE on testing: %.4f' %rmse_test)
print('RMSE on CV: %.4f' %rmse_cv)
print('MSE on testing: %.4f' %mse_test)
print('MSE on CV: %.4f' %mse_cv)
print('MAE on testing: %.4f' %mae_test)
print('MAE on CV: %.4f' %mae_cv)

RMSE on testing: 0.1663
RMSE on CV: 0.1668
MSE on testing: 0.0277
MSE on CV: 0.0278
MAE on testing: 0.1251
MAE on CV: 0.1253


Random Forest is better when working on the whole dataset. Since mean absolute error on testing set of random forest is 0.0642, which means log(rent_prediction)-log(rent_true) = +/-0.0642, after transformation, rent_prediction/rent_true = $e^{+/-0.0642}$ = 1.066/0.938, so the predicted rent is either 6.7% more than the actual rent or 6.2% less than the actual rent.

In [16]:
#give column names
X_train = pd.DataFrame(X_train)
X_train.columns = features.columns.values

In [17]:
X_train.columns

Index(['bathrooms', 'bedrooms', 'list_price', 'latitude', 'longitude',
       'lot_sqft', 'sqft', 'year_from_now', 'CA_state', 'GA_state', 'TX_state',
       'APT_type', 'COND_type', 'MULT_type', 'RESI_type', 'Timeshare_type'],
      dtype='object')

In [18]:
#check importance of features
importances = RF.feature_importances_
std = np.std([tree.feature_importances_ for tree in RF.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d.  %15s  (%f)" % (f + 1, X_train.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1.       list_price  (0.804951)
2.        longitude  (0.061654)
3.         latitude  (0.040695)
4.             sqft  (0.036624)
5.    year_from_now  (0.021375)
6.        bathrooms  (0.016834)
7.         lot_sqft  (0.008896)
8.         bedrooms  (0.003687)
9.        RESI_type  (0.002418)
10.         APT_type  (0.001947)
11.        MULT_type  (0.000389)
12.         TX_state  (0.000215)
13.         CA_state  (0.000159)
14.        COND_type  (0.000128)
15.         GA_state  (0.000029)
16.   Timeshare_type  (0.000000)
