# Homework 03
## 2. House price Data Challenge

In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.linear_model import LinearRegression
from os import listdir

In [2]:
df_train = pd.read_csv("train.csv")
df_test  = pd.read_csv("test.csv")

def preprocessDataFrame(df):
    df['garage'] = df['garage'].fillna(0)
    df['yrremodeled'] = df['yrremodeled'].fillna(0)

    # replace string label by numberical label
    dic = {'Farm': 1, 'Ranch':2, 'Cottage': 3, 'Town':4, 'Detached':5, 'Bungalow':6}

    df['housetype'] = df['housetype'].map(dic)

preprocessDataFrame(df_train)
preprocessDataFrame(df_test)

print(df_train.head(10))

   id  zipcode  floors  housetype  garage  lotarea  livingarea  Nbathroom  \
0   1    27131       1          6     0.0     1386         692          1   
1   2    27131       2          4   198.0     1950        1963          3   
2   3    27134       2          4     0.0      706        1053          1   
3   4    27119       3          3     0.0     1189        1808          3   
4   5    27119       2          5   245.0     2330        2845          2   
5   6    27137       2          4     0.0      928        1058          2   
6   7    27137       2          3   229.0     2320        2521          2   
7   8    27133       3          3     0.0     1750        2497          3   
8   9    27133       2          3     0.0     1415        2397          2   
9  10    27115       2          1     0.0     1367        1380          1   

   yearbuilt  yrremodeled  yearsale  monthsale   price  
0       1976          0.0      1996          1  162000  
1       1977          0.0      1996   

In [3]:
# cross validated linear predictive model
def LRfitCV(df, cols, tag=''):
    n2 = len(df['price'])//2

    regressor1 = LinearRegression()
    regressor1.fit(df[cols][:n2], df['price'][:n2])

    v1=regressor1.predict(df[cols][n2:])

    regressor2 = LinearRegression()
    regressor2.fit(df[cols][n2:], df['price'][n2:])

    v2=regressor1.predict(df[cols][:n2])

    df['LRprediction'] = list(v2) + list(v1)
    df['LRdiff'] = (df['LRprediction']-df['price'])/df['price']

    val = df['LRdiff'].abs().mean()

    print(tag, "average relative devitation", val)

    return val

LRfitCV(df_train,['zipcode', 'floors', 'housetype', 'garage', 'lotarea', 'livingarea', 'Nbathroom', 'yearbuilt', 'yearsale', 'monthsale'])

 average relative devitation 0.1541210325085161


0.1541210325085161

In [4]:
# create example submission
selected_columns = ['zipcode', 'floors', 'housetype', 'garage', 'lotarea', 
'livingarea', 'Nbathroom', 'yearbuilt', 'yearsale', 'monthsale']

regressor = LinearRegression()
regressor.fit(df_train[selected_columns], df_train['price'])

df_test['price'] = regressor.predict(df_test[selected_columns])

df_test[['id','price']].to_csv('example_submission.csv', index=False)

In [8]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

def fitBDT(df, cols, tag, ntrees= 500, ndepth= 15):
    """
    Fits a boosted decision tree to cols of df to reproduce tag of df
    
    Parameters:
    -----------
    df: Pandas Dataframe containing all data required for fitting

    cols: a list of strings, describing the input columns with which to fir the BDT

    tag: a string describing the column for which to fit 

    ntrees: integer, depth of the AdaBoostRegressor

    ndepth: integer, depth of the DecisionTreeRegressor

    Returns:
    --------
    AdaBoostRegressor
    """
    BDT = AdaBoostRegressor(n_estimators=ntrees, base_estimator=DecisionTreeRegressor(max_depth=ndepth))
    BDT.fit(df[cols], df[tag])
    return BDT


In [11]:
new_train = pd.read_csv("train.csv")
new_test  = pd.read_csv("test.csv")
preprocessDataFrame(new_train)
preprocessDataFrame(new_test) #not strictly needed, but usefull if we change the ones used further up

fit_cols= ['zipcode', 'floors', 'housetype', 'garage', 'lotarea', 'livingarea', 'Nbathroom', 'yearbuilt', 'yearsale', 'monthsale']
fit_goal= 'price'

regressor = fitBDT(new_train, fit_cols, fit_goal)
new_test['price'] = regressor.predict(new_test[fit_cols])
new_test[['id', 'price']].to_csv("markusMuellenmeister_submission.csv", index=False)

In [6]:

def BDT_fit_CV(df, cols, tag, ntrees= 1000, ndepth= 10):
    half = len(df[tag])//2  # determine halfwaypoint of dataframe

    BDT1 = fitBDT(df[:][:half], cols, tag, ntrees, ndepth)
    pred1= BDT1.predict(df[cols][half:])  # train BDT on fist half, use to reproduce second half

    BDT2 = fitBDT(df[:][half:], cols, tag, ntrees, ndepth)
    pred2= BDT2.predict(df[cols][:half])  # vice-versa with other half

    df['pred'] = list(pred2) + list(pred1)  # combine two prediction halves into one extra column
    df['diff'] = (df['pred']-df[tag])/df[tag]
    val = df['diff'].abs().mean()  # get average percentage difference
    print("(ntrees = ",ntrees,", ndepth = ",ndepth,")=> Average percentage difference: ", str(val)[:6])
    return val


In [7]:
  # %%timeit -r1 -n1
new_train = pd.read_csv("train.csv")
preprocessDataFrame(new_train)  # not strictly needed, but usefull if we change the ones used further up
fit_cols= ['zipcode', 'floors', 'housetype', 'garage', 'lotarea', 'livingarea', 'Nbathroom', 'yearbuilt', 'yearsale', 'monthsale']
fit_goal= 'price'
for t in [100, 500, 700, 1000]:
    for d in [10, 13, 15, 17, 20]:
        %time BDT_fit_CV(new_train, fit_cols, fit_goal, ntrees=t, ndepth=d)

(ntrees =  100 , ndepth =  10 )=> Average percentage difference:  0.0890
Wall time: 1.56 s
(ntrees =  100 , ndepth =  13 )=> Average percentage difference:  0.0782
Wall time: 1.68 s
(ntrees =  100 , ndepth =  15 )=> Average percentage difference:  0.0774
Wall time: 1.76 s
(ntrees =  100 , ndepth =  17 )=> Average percentage difference:  0.0776
Wall time: 1.77 s
(ntrees =  100 , ndepth =  20 )=> Average percentage difference:  0.0767
Wall time: 1.84 s
(ntrees =  500 , ndepth =  10 )=> Average percentage difference:  0.0883
Wall time: 7.39 s
(ntrees =  500 , ndepth =  13 )=> Average percentage difference:  0.0777
Wall time: 8.42 s
(ntrees =  500 , ndepth =  15 )=> Average percentage difference:  0.0766
Wall time: 8.69 s
(ntrees =  500 , ndepth =  17 )=> Average percentage difference:  0.0764
Wall time: 9.3 s
(ntrees =  500 , ndepth =  20 )=> Average percentage difference:  0.0765
Wall time: 9.21 s
(ntrees =  700 , ndepth =  10 )=> Average percentage difference:  0.0884
Wall time: 10.7 s


In [12]:
df = pd.read_csv("example_submission.csv")
tmp = pd.read_csv("markusMuellenmeister_submission.csv")
df['pred'] = tmp['price']
df['diff'] = (df['pred']-df['price'])/df['price']
print(df['diff'].abs().mean())
df.head(20)

0.21588114834417702


Unnamed: 0,id,price,pred,diff
0,5001,178840.192056,196000.0,0.095951
1,5002,126237.083519,126400.0,0.001291
2,5003,105365.939791,150500.0,0.428355
3,5004,671534.079901,682000.0,0.015585
4,5005,281313.604443,226600.0,-0.194493
5,5006,107970.907511,133166.666667,0.233357
6,5007,626401.934568,621500.0,-0.007826
7,5008,375247.108644,303948.275862,-0.190005
8,5009,435439.921966,434000.0,-0.003307
9,5010,522274.971811,530000.0,0.014791
