#GOALS#

model_create
1. Connect to DataBase
2. Query for data
3. Read into DataFrame
    1. Keep only rows with data that will be used
    2. DateTime
    3. As-Is/Scale/Dummy
    4. Fill NaNs
    5. Shuffle
    6. Collect features that will be used to train the model
4. Train model

app
1. Accept address as input
2. Gather address data from DataFrame in step #3 of model_create
3. Adjust year and month to current date
4. model.predict

In [1]:
# Database connect
import psycopg2

# Numpy arrays
import numpy as np

# DataFrames
import pandas as pd
pd.set_option('display.max_columns', 500)

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Models
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Cross validation
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, make_scorer
scorer_ = make_scorer(median_absolute_error)

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Print Function
from pprint import pprint



model_create goal #1

In [90]:
db_info = {
    'user':'app_connect',
    'database':'real_estate_data',
    'host' : 'localhost',
    'port' : 5432,
    'password' : 'flying_horse536'
}
conn = psycopg2.connect(**db_info)
cur = conn.cursor()

model_create goal #2

In [91]:
def execute_query(q):
    ''' Execute an SQL query string (q) on the current connected database.
    '''
    conn.rollback()
    cur.execute(q)
    conn.commit()
    return cur.fetchall(), cur.description

In [92]:
query = '''
    SELECT *
    FROM sales_info s
    LEFT JOIN property_info p
    ON s.major = p.major AND s.minor = p.minor
    LEFT JOIN unit_breakdown u
    ON s.major = u.major and s.minor = u.minor
    ;
'''

model_create goal #3

In [93]:
# Get query result and description from the execute_query function
query_result, description = execute_query(query)
# description is a dict, with 'name' as one of the keys. Grab all names from the columns
column_names = [d.name for d in description]

In [94]:
# Read data from query result into a Pandas DataFrame. Column names are gathered from the description
df = pd.DataFrame(query_result, columns=column_names)

model_create goal #3.B

In [95]:
# Change 'DocumentDate' to DateTime
df['document_date'] = pd.to_datetime(df['document_date'])
# Get SaleYear and SaleMonth columns out of the DateTime object
df['sale_year'] = pd.DatetimeIndex(df['document_date']).year
#df['sale_year'] -= df['sale_year'].min()
df['sale_month'] = pd.DatetimeIndex(df['document_date']).month

model_create goal #3.A

In [96]:
def filter_df(keep_values, col_name, df):
    '''Filters out rows from df[col_name] that are NOT in keep_values
    '''
    mask = df[col_name].isin(keep_values)
    return df[mask]

In [97]:
'''
PrincipalUse value of 6 represents Residential buildings
Keep only sales with PrincipalUse value of 6
'''
principal_use_keep_values = [6]
df = filter_df(keep_values=principal_use_keep_values, col_name='principal_use', df=df)

'''
PropertyType value of 1 represents Land Only
PropertyType value of 2 represents Land with New Building
PropertyType value of 3 represents Land with Previously Used Building
Keep only sales with PropertyType value in [1, 2, 3]
'''
property_type_keep_values = [1, 2, 3]
df = filter_df(property_type_keep_values, 'property_type', df)

'''
SaleInstrument value of 3 represents a Statutory Warranty Deed
By using this deed, the seller promises the buyer 
1. The seller is the owner of the property and has the right to sell it
2. No one else is possessing the property
3. There are no encumbrances against the property
4. No one with a better claim to the property will interfere with the transferee’s rights
5. The seller will defend certain claims regarding title to the property
'''
sale_instrument_keep_values = [3]
df = filter_df(sale_instrument_keep_values, 'sale_instrument', df)

'''
Remove years before 1995
'''
df = df[df['sale_year'] > 1994]

# Testing specific years
#df = df[df['sale_year'] > 2011]

In [80]:
df.sale_year.value_counts().reset_index().sort_values(by='index')

Unnamed: 0,index,sale_year
21,1995,15676
14,1996,27262
10,1997,31499
4,1998,33969
7,1999,32825
9,2000,31893
11,2001,31094
5,2002,33642
3,2003,39869
1,2004,44273


model_create goal #3.D/E

In [98]:
# Shuffle dataframe
df = shuffle(df)
# Fill nans
df.fillna(0, inplace=True)

In [82]:
def extract_features_from_df(df):
    # Setting up features for model:

    # Features:
    sales_feature = [
        'property_type',
        'sale_reason',
        'property_class',
        'sale_year',
        'sale_month'
    ]

    property_features = [
        'nbr_living_units', 
        'stories',
        'bldg_grade',
        'bldg_grade_var', 
        'sq_ft_1st_floor',
        'sq_ft_half_floor',
        'sq_ft_2nd_floor',
        'sq_ft_upper_floor',
        'sq_ft_unfin_full',
        'sq_ft_unfin_half',
        'sq_ft_tot_living',
        'sq_ft_tot_basement',
        'sq_ft_fin_basement',
        'fin_basement_grade',
        'sq_ft_garage_basement',
        'sq_ft_garage_attached'
    ]

    combined_features = sales_feature + property_features

    features_as_is = [
        'sale_year',
        'sale_month'
    ]

    features_to_scale = [
        'sq_ft_1st_floor',
        'sq_ft_half_floor',
        'sq_ft_2nd_floor',
        'sq_ft_upper_floor',
        'sq_ft_unfin_full',
        'sq_ft_unfin_half',
        'sq_ft_tot_living',
        'sq_ft_tot_basement',
        'sq_ft_fin_basement',
        'sq_ft_garage_basement',
        'sq_ft_garage_attached'
    ]

    dummy_features = [
        'property_type',
        'sale_reason',
        'property_class',
        'nbr_living_units', 
        'stories',
        'bldg_grade',
        'bldg_grade_var', 
    ]


    # Standardize the dataframe
    scalar = StandardScaler().fit(df[features_to_scale])
    #df[features_to_scale] = scalar.transform(df[features_to_scale])
    scalar.transform(df[features_to_scale])

    
    final_df = df[features_as_is + features_to_scale + ['sale_price']]

    # Get dummy cols
    dummies = pd.get_dummies(df[dummy_features].applymap(str))
    final_df = pd.concat([final_df, dummies], axis=1)

    final_df = shuffle(final_df)
    
    # Target value: SalePrice
    y = final_df.pop('sale_price')
    X = final_df
    
    return X, y

In [None]:
Major
Minor
UnitTypeItemId
NbrThisType
SqFt
NbrBedrooms
NbrBaths


In [107]:
df.nbr_baths.value_counts()

0.00     653616
1.00       2843
2.00        533
1.50        222
2.50        118
0.75        116
1.75         86
3.00         24
2.25         12
3.50         11
0.25         10
0.50          9
4.00          8
2.75          6
3.75          3
22.00         3
20.00         2
39.00         2
0.10          1
8.00          1
34.00         1
23.00         1
7.00          1
Name: nbr_baths, dtype: int64

model_create goal #3.F

In [116]:
# Setting up features for model:

features_as_is = [
    'sale_year',
    'sale_month',
    'yr_built',
    'yr_renovated'
]

features_to_scale = [
    'sq_ft_1st_floor',
    'sq_ft_half_floor',
    'sq_ft_2nd_floor',
    'sq_ft_upper_floor',
    'sq_ft_unfin_full',
    'sq_ft_unfin_half',
    'sq_ft_tot_living',
    'sq_ft_tot_basement',
    'sq_ft_fin_basement',
    'sq_ft_garage_basement',
    'sq_ft_garage_attached',
    'sq_ft_open_porch',
    'sq_ft_enclosed_porch',
    'sq_ft_deck',
    'brick_stone',
    'pcnt_complete',
    'pcnt_net_condition',
    'addnl_cost',
    'nbr_this_type',
    'sq_ft'
]


dummy_features = [
    'bedrooms',
    'property_type',
    'sale_reason',
    'property_class',
    'nbr_living_units', 
    'stories',
    'bldg_grade',
    'bldg_grade_var',
    'daylight_basement',
    'heat_system',
    'heat_source',
    'bath_half_count',
    'bath_3qtr_count',
    'bath_full_count',
    'view_utilization',
    'fp_single_story',
    'fp_multi_story',
    'fp_freestanding',
    'fp_additional',
    'condition',
    'unit_type_item_id',
    'nbr_bedrooms',
    'nbr_baths'
]


# Standardize the dataframe
scalar = StandardScaler().fit(df[features_to_scale])
# df[features_to_scale] = scalar.transform(df[features_to_scale])
scalar.transform(df[features_to_scale])

final_df = df[features_as_is + features_to_scale + ['sale_price'] + ['address']]

# Get dummy cols
dummies = pd.get_dummies(df[dummy_features].applymap(str))
final_df = pd.concat([final_df, dummies], axis=1)

final_df = shuffle(final_df)

# Target value: SalePrice
y = final_df.pop('sale_price')

X = final_df.copy()
del X['address']

In [28]:
final_df.iloc[0:1]

Unnamed: 0,sale_year,sale_month,sq_ft_1st_floor,sq_ft_half_floor,sq_ft_2nd_floor,sq_ft_upper_floor,sq_ft_unfin_full,sq_ft_unfin_half,sq_ft_tot_living,sq_ft_tot_basement,sq_ft_fin_basement,sq_ft_garage_basement,sq_ft_garage_attached,address,property_type_1,property_type_2,property_type_3,sale_reason_0,sale_reason_1,sale_reason_10,sale_reason_12,sale_reason_13,sale_reason_14,sale_reason_16,sale_reason_17,sale_reason_18,sale_reason_2,sale_reason_4,sale_reason_5,sale_reason_6,sale_reason_7,sale_reason_8,property_class_0,property_class_1,property_class_2,property_class_3,property_class_4,property_class_7,property_class_8,property_class_9,nbr_living_units_0.0,nbr_living_units_1.0,nbr_living_units_2.0,nbr_living_units_3.0,stories_0.0,stories_1.0,stories_1.5,stories_2.0,stories_2.5,stories_3.0,stories_3.5,bldg_grade_0.0,bldg_grade_10.0,bldg_grade_11.0,bldg_grade_12.0,bldg_grade_13.0,bldg_grade_3.0,bldg_grade_4.0,bldg_grade_5.0,bldg_grade_6.0,bldg_grade_7.0,bldg_grade_8.0,bldg_grade_9.0,bldg_grade_var_0.0,bldg_grade_var_99.0
3077,2013,1,0.398709,-0.277461,1.241092,-0.117672,-0.039925,-0.038463,1.508977,1.680274,1.474555,2.221951,-0.936462,191 36TH AVE E,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0


In [42]:
df[pd.isnull(df.address)]

Unnamed: 0,index,excise_tax_nbr,major,minor,document_date,sale_price,recording_nbr,volume,page,plat_nbr,plat_type,plat_lot,plat_block,seller_name,buyer_name,property_type,principal_use,sale_instrument,af_forest_land,af_current_use_land,af_non_profit_use,af_historic_property,sale_reason,property_class,sale_warning,index.1,major.1,minor.1,bldg_nbr,nbr_living_units,address,building_number,fraction,direction_prefix,street_name,street_type,direction_suffix,zip_code,stories,bldg_grade,bldg_grade_var,sq_ft_1st_floor,sq_ft_half_floor,sq_ft_2nd_floor,sq_ft_upper_floor,sq_ft_unfin_full,sq_ft_unfin_half,sq_ft_tot_living,sq_ft_tot_basement,sq_ft_fin_basement,fin_basement_grade,sq_ft_garage_basement,sq_ft_garage_attached,daylight_basement,sq_ft_open_porch,sq_ft_enclosed_porch,sq_ft_deck,heat_system,heat_source,brick_stone,view_utilization,bedrooms,bath_half_count,bath_3qtr_count,bath_full_count,fp_single_story,fp_multi_story,fp_freestanding,fp_additional,yr_built,yr_renovated,pcnt_complete,bbsolescence,pcnt_net_condition,condition,addnl_cost,sale_year,sale_month


In [67]:
X_351

Unnamed: 0,sale_year,sale_month,sq_ft_1st_floor,sq_ft_half_floor,sq_ft_2nd_floor,sq_ft_upper_floor,sq_ft_unfin_full,sq_ft_unfin_half,sq_ft_tot_living,sq_ft_tot_basement,sq_ft_fin_basement,sq_ft_garage_basement,sq_ft_garage_attached,property_type_1,property_type_2,property_type_3,sale_reason_0,sale_reason_1,sale_reason_10,sale_reason_11,sale_reason_12,sale_reason_13,sale_reason_14,sale_reason_15,sale_reason_16,sale_reason_17,sale_reason_18,sale_reason_19,sale_reason_2,sale_reason_3,sale_reason_4,sale_reason_5,sale_reason_6,sale_reason_7,sale_reason_8,sale_reason_9,property_class_0,property_class_1,property_class_2,property_class_3,property_class_4,property_class_5,property_class_6,property_class_7,property_class_8,property_class_9,nbr_living_units_0.0,nbr_living_units_1.0,nbr_living_units_2.0,nbr_living_units_20.0,nbr_living_units_3.0,nbr_living_units_4.0,nbr_living_units_5.0,stories_0.0,stories_1.0,stories_1.5,stories_2.0,stories_2.5,stories_3.0,stories_3.5,stories_4.0,bldg_grade_0.0,bldg_grade_1.0,bldg_grade_10.0,bldg_grade_11.0,bldg_grade_12.0,bldg_grade_13.0,bldg_grade_2.0,bldg_grade_20.0,bldg_grade_3.0,bldg_grade_4.0,bldg_grade_5.0,bldg_grade_6.0,bldg_grade_7.0,bldg_grade_8.0,bldg_grade_9.0,bldg_grade_var_0.0,bldg_grade_var_20.0,bldg_grade_var_25.0,bldg_grade_var_30.0,bldg_grade_var_60.0,bldg_grade_var_99.0
296798,1998,5,-0.140723,-0.263228,-0.78104,-0.125694,-0.030817,-0.030451,-0.04222,1.104625,1.357304,-0.415261,-0.921058,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0


In [121]:
X_351 = final_df[final_df['address'].str.contains('351  N 137', na=False)]
del X_351['address']
# X_351['sale_year'] = 1998
model_XG.predict(X_351)

array([ 236956.3125], dtype=float32)

In [124]:
X_351 = final_df[final_df['address'].str.contains('351  N 137', na=False)]
del X_351['address']
X_351['sale_year'] = 2017
X_351['sale_month'] = 7
model_XG.predict(X_351)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


array([ 647066.3125], dtype=float32)

In [113]:
def print_scores(model, X_test, y_test):
    y_hat = model.predict(X_test)
    print("{}".format(model.__class__.__name__))
    print("R2 score: {:.3f}".format(
            model.score(X_test, y_test)))
    print("Mean Squared Error score: {:.3f}".format(
            np.sqrt(mean_squared_error(y_test, y_hat))))
    print("Mean Absolute Error score: {:.3f}".format(
            mean_absolute_error(y_test, y_hat)))
    print("Median absolute error: {:.3f}".format(
            median_absolute_error(y_hat, y_test)))
    

In [117]:
# Train Test Split Score
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [118]:
# Linear Regression
# All Data
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)
print_scores(model_LR, X_test, y_test)

LinearRegression
R2 score: 0.098
Mean Squared Error score: 1384201.457
Mean Absolute Error score: 377841.823
Median absolute error: 150942.755


In [198]:
# 5 years
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)
print_scores(model_LR, X_test, y_test)

LinearRegression
R2 score: 0.114
Mean Squared Error score: 1286019.202
Mean Absolute Error score: 337399.347
Median absolute error: 144548.799


In [200]:
# Random Forest
# 5 years
model_RF = RandomForestRegressor(n_jobs=-1, n_estimators=250)
model_RF.fit(X_train, y_train)
print_scores(model_RF, X_test, y_test)

RandomForestRegressor
R2 score: 0.665
Mean Squared Error score: 791132.957
Mean Absolute Error score: 227592.219
Median absolute error: 100587.166


In [None]:
# 5 all data
model_RF = RandomForestRegressor(n_jobs=-1, n_estimators=250)
model_RF.fit(X_train, y_train)
print_scores(model_RF, X_test, y_test)

RandomForestRegressor
R2 score: 0.667
Mean Squared Error score: 834058.267
Mean Absolute Error score: 178046.207
Median absolute error: 58373.112


In [None]:
# Gradient Booster
# all data
model_GB = GradientBoostingRegressor(learning_rate=0.1, n_estimators=500, max_depth=15)
model_GB.fit(X_train, y_train)
print_scores(model_GB, X_test, y_test)

In [201]:
# 5 years
model_GB = GradientBoostingRegressor(learning_rate=0.01, n_estimators=500)
model_GB.fit(X_train, y_train)
print_scores(model_GB, X_test, y_test)

GradientBoostingRegressor
R2 score: 0.346
Mean Squared Error score: 1104915.470
Mean Absolute Error score: 312529.687
Median absolute error: 138340.026


In [16]:
# XGBoost
# all data
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.630
Mean Squared Error score: 907677.636
Mean Absolute Error score: 176873.642
Median absolute error: 56380.766


In [220]:
# 5 years
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=500)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.470
Mean Squared Error score: 994781.830
Mean Absolute Error score: 288562.080
Median absolute error: 119413.695


In [228]:
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=750, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.660
Mean Squared Error score: 797269.744
Mean Absolute Error score: 222701.515
Median absolute error: 94690.203


In [229]:
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=2500, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.659
Mean Squared Error score: 797746.371
Mean Absolute Error score: 224082.289
Median absolute error: 96062.859


In [234]:
model_XG = XGBRegressor(learning_rate=0.01, n_estimators=2500, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.662
Mean Squared Error score: 794582.942
Mean Absolute Error score: 220364.935
Median absolute error: 94968.031


In [231]:
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=400, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.661
Mean Squared Error score: 796104.857
Mean Absolute Error score: 221147.556
Median absolute error: 94109.656


In [232]:
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=850, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.660
Mean Squared Error score: 797063.149
Mean Absolute Error score: 222278.364
Median absolute error: 94144.086


In [233]:
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=650, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.660
Mean Squared Error score: 796780.867
Mean Absolute Error score: 221739.404
Median absolute error: 93938.781


In [202]:
#### Cross Val Score
model_cv_RF = RandomForestRegressor(n_jobs=-1)
print(cross_val_score(model_cv_RF, X, y, cv=3, scoring=scorer_).mean())

106063.083333


In [203]:
model_cv_GB = GradientBoostingRegressor(learning_rate=0.01, n_estimators=500)
print(cross_val_score(model_cv_GB, X, y, cv=3, scoring=scorer_).mean())

139275.21097


In [None]:
model_cv_XG = XGBRegressor(learning_rate=0.01)


In [36]:
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)
print_scores(model_LR, X_test, y_test)

LinearRegression
R2 score: 0.091
Mean Squared Error score: 1412014.417
Mean Absolute Error score: 369285.069
Median absolute error: 137303.354


In [16]:
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.648
Mean Squared Error score: 878149.636
Mean Absolute Error score: 174206.849
Median absolute error: 56420.531


In [102]:
# Monday testing of XGBoost
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.643
Mean Squared Error score: 901539.971
Mean Absolute Error score: 164396.411
Median absolute error: 47795.047


In [119]:
# Monday testing of XGBoost
# Adding in latest data
model_XG = XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=15)
model_XG.fit(X_train, y_train)
print_scores(model_XG, X_test, y_test)

XGBRegressor
R2 score: 0.622
Mean Squared Error score: 896446.090
Mean Absolute Error score: 164177.424
Median absolute error: 48143.703


In [103]:
model_XG2 = XGBRegressor(learning_rate=0.1, n_estimators=700, max_depth=8)
model_XG2.fit(X_train, y_train)
print_scores(model_XG2, X_test, y_test)

XGBRegressor
R2 score: 0.627
Mean Squared Error score: 904576.535
Mean Absolute Error score: 214274.547
Median absolute error: 69425.125


In [17]:
model_RF = RandomForestRegressor(n_jobs=-1, n_estimators=250)
model_RF.fit(X_train, y_train)
print_scores(model_RF, X_test, y_test)

RandomForestRegressor
R2 score: 0.652
Mean Squared Error score: 874166.141
Mean Absolute Error score: 181493.184
Median absolute error: 58649.726


In [88]:
# Monday testing of RF
model_RF = RandomForestRegressor(n_jobs=-1, n_estimators=250)
model_RF.fit(X_train, y_train)
print_scores(model_RF, X_test, y_test)

RandomForestRegressor
R2 score: 0.634
Mean Squared Error score: 861916.143
Mean Absolute Error score: 173441.367
Median absolute error: 51954.074


In [120]:
# Monday testing of RF
# Adding in latest data
model_RF = RandomForestRegressor(n_jobs=-1, n_estimators=250)
model_RF.fit(X_train, y_train)
print_scores(model_RF, X_test, y_test)

RandomForestRegressor
R2 score: 0.633
Mean Squared Error score: 882900.931
Mean Absolute Error score: 174574.753
Median absolute error: 51640.634


In [90]:
model_RI = Ridge()
model_RI.fit(X_train, y_train)
print_scores(model_RI, X_test, y_test)

Ridge
R2 score: 0.091
Mean Squared Error score: 1411956.987
Mean Absolute Error score: 369244.183
Median absolute error: 137308.328


In [None]:
model_SR = SVR()
model_SR.fit(X_train, y_train)
print_scores(model_SR, X_test, y_test)

In [109]:
model_LA = Lasso(alpha=35)
model_LA.fit(X_train, y_train)
print_scores(model_LA, X_test, y_test)

Lasso
R2 score: 0.091
Mean Squared Error score: 1411877.903
Mean Absolute Error score: 368937.567
Median absolute error: 136984.537




In [35]:
np.mean([56420.531, 58649.726])

57535.128500000006

In [19]:
ensemble = [model_XG, model_RF]#, model_SR]# model_LA, model_LR, model_RI]
y_hat_ensemble = pd.DataFrame([model.predict(X_test) for model in ensemble]).transpose()
e_X_train, e_X_test, e_y_train, e_y_test = train_test_split(y_hat_ensemble, y_test)

In [59]:
y_hat_ensemble.shape

(163890, 2)

In [60]:
y_hat_ensemble.iloc[0:1]

Unnamed: 0,0,1
0,142651.625,150129.84


In [20]:
combiner = XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=8)
combiner.fit(e_X_train, e_y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [22]:
combiner_RF = RandomForestRegressor(n_jobs=-1, n_estimators=250)
combiner_RF.fit(e_X_train, e_y_train)
print_scores(combiner_RF, e_X_test, e_y_test)

RandomForestRegressor
R2 score: 0.665
Mean Squared Error score: 889854.248
Mean Absolute Error score: 186831.209
Median absolute error: 63886.400


In [None]:
# XGBRegressor(learning_rate=0.1, n_estimators=400, max_depth=8)
print_scores(combiner, e_X_test, e_y_test)

In [21]:
# XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=8)
print_scores(combiner, e_X_test, e_y_test)

XGBRegressor
R2 score: 0.665
Mean Squared Error score: 890843.912
Mean Absolute Error score: 174703.812
Median absolute error: 56252.406


In [72]:
np.mean(y)

533777.9203366898