# 0. Prepare environment

In [73]:
import pandas as pd
import numpy as np

def rmsle(y_pred, y_test) : 
    assert len(y_test) == len(y_pred)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_test))**2))

# 1. Load data from disk

In [74]:
df_finalTrain = pd.read_csv('ProcessedTrain.csv')

df_finalTest = pd.read_csv('ProcessedTest.csv')

target_train = pd.read_csv('../data/train.csv').SalePrice

# 2. Reduce number of features using Lasso regression

# 3. Try a decision tree

In [75]:
from sklearn import tree
from sklearn.metrics import mean_squared_error

clf = tree.DecisionTreeRegressor()

clf = clf.fit(df_finalTrain, target_train.apply(np.log))

predictedVals = clf.predict(df_finalTrain)

# What is the average difference between the real and predicted log(Price)?
print(mean_squared_error(y_pred=np.exp(predictedVals), y_true=target_train))
print(rmsle(np.exp(predictedVals), target_train))

37.37169017306895
2.7679039048583198e-05


# 4. Try gradient boosting

In [76]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Cross-validate the results to optimize the max_depth of the trees:
clfs = [0]
fitScores = [0]
for i in range(1,7):
    clfs.append(GradientBoostingRegressor(max_depth=i, random_state=0, warm_start=False, n_estimators=1000, learning_rate=.01))
    fitScores.append(cross_val_score(clfs[i], df_finalTrain, target_train.apply(np.log), cv=5, n_jobs=7).mean())
    print(str(i) + " : " + str(fitScores[i]))

bestDepth = fitScores.index(max(fitScores))
print(bestDepth)

1 : 0.8548146783795809
2 : 0.8924664938056918
3 : 0.8981911684866624
4 : 0.8974180287231409
5 : 0.893971800980785
6 : 0.8901928071597174
3


In [77]:
clf = clfs[bestDepth]
clf.fit(df_finalTrain, target_train.apply(np.log))

predictedTrain = clf.predict(df_finalTrain)
predictedTest = clf.predict(df_finalTest)

In [78]:
# What is the average difference between the real and predicted log(Price)?
print(mean_squared_error(y_pred=np.exp(predictedTrain), y_true=target_train))
print(np.sqrt(mean_squared_error(y_pred=np.exp(predictedTrain), y_true=target_train)))
print(rmsle(predictedTrain, target_train.apply(np.log)))
df_finalTrain.shape

266644159.5643951
16329.242467560922
0.006730527819249037


(1460, 187)

# Write results to CSV in Kaggle format

In [79]:
results = pd.Series(predictedTest)

# test_predict is predicted prices from test set
# zips Id with price

id_price = list(zip(range(1461, 2920), np.exp(predictedTest)))

# Rounds price to 1 decimal place (as per sample submission example)
id_price = list(map(lambda x: [x[0], round(x[1], 1)], id_price))

# Create a dataframe
final_df = pd.DataFrame(id_price, columns=['Id', 'SalePrice'])


# Wrtie to csv
final_df.to_csv('submit_predictions.csv', index=False)