In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [2]:
# Load the training and test sets from the local data directory.
train = pd.read_csv('data/train_2.csv')
test = pd.read_csv('data/test_2.csv')

# Pull 'Id' out of each frame: pop() returns the column and removes it from the
# frame in one step. The ids are not used for prediction but are needed later
# to label the submission rows.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

# NOTE(review): the train.info() output lists an 'Unnamed: 0' column, which looks
# like a row index written out by an earlier to_csv -- confirm whether it should
# be removed from the features as well.

# 0. Warm up

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
train.head()

Unnamed: 0.1,Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,...,SalePrice,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,0,856,854,0,,3,1Fam,TA,No,706.0,...,208500.0,WD,0,Pave,8,856.0,0,2003,2003,2008
1,1,1262,0,0,,3,1Fam,TA,Gd,978.0,...,181500.0,WD,0,Pave,6,1262.0,298,1976,1976,2007
2,2,920,866,0,,3,1Fam,TA,Mn,486.0,...,223500.0,WD,0,Pave,6,920.0,0,2001,2002,2008
3,3,961,756,0,,3,1Fam,Gd,No,216.0,...,140000.0,WD,0,Pave,7,756.0,0,1915,1970,2006
4,4,1145,1053,0,,4,1Fam,TA,Av,655.0,...,250000.0,WD,0,Pave,9,1145.0,192,2000,2000,2008


In [4]:
# List each column with its dtype and non-null count to spot missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Data columns (total 80 columns):
Unnamed: 0       1458 non-null int64
1stFlrSF         1458 non-null int64
2ndFlrSF         1458 non-null int64
3SsnPorch        1458 non-null int64
Alley            1458 non-null object
BedroomAbvGr     1458 non-null int64
BldgType         1458 non-null object
BsmtCond         1458 non-null object
BsmtExposure     1458 non-null object
BsmtFinSF1       1458 non-null float64
BsmtFinSF2       1458 non-null float64
BsmtFinType1     1458 non-null object
BsmtFinType2     1458 non-null object
BsmtFullBath     1458 non-null float64
BsmtHalfBath     1458 non-null float64
BsmtQual         1458 non-null object
BsmtUnfSF        1458 non-null float64
CentralAir       1458 non-null object
Condition1       1458 non-null object
Condition2       1458 non-null object
Electrical       1458 non-null object
EnclosedPorch    1458 non-null int64
ExterCond        1458 non-null object
ExterQual        145

## 1. EDA (Exploratory Data Analysis)

# Feature Engineering

**IMPORTANT** note: When you transform (change/delete/add columns) your train data you need to apply the same transformations to your test data. This is because your model will learn to make predictions with the transformed train data. It doesn't know how to handle untransformed data.

In [6]:
# Remember how many rows belong to each set so the combined frame can be split
# back apart after feature engineering.
ntrain, ntest = len(train), len(test)

# Keep the target separately; it must not be part of the feature matrix.
y_train = train['SalePrice'].values

# Stack train on top of test with a fresh 0..n-1 index, then remove the target
# column so that every transformation is applied to both sets at once.
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(columns=['SalePrice'])
)

## Encoding of categorical variables

**Label encoding**: Each category is assigned one number (the numeric order adds information)  
**One-hot encoding**: Each category is represented as one separate feature (no specific order; blows up the feature space)

In [None]:
# (rows, columns) of the combined train + test frame before any encoding.
all_data.shape

In [None]:
# Which columns should be label encoded?
# The lines below are left commented out; presumably they are a template to be
# completed by listing the chosen column names in `cols`.
from sklearn.preprocessing import LabelEncoder
#cols = ('..', '...', )
#for c in cols:
#    all_data[c] = LabelEncoder().fit_transform(all_data.loc[:,c])

In [7]:
# Shape before one-hot encoding, for comparison with the shape afterwards.
all_data.shape

(2917, 79)

In [None]:
# All other columns will be one-hot encoded: every column that is still
# categorical at this point is replaced by one indicator column per category.
all_data = pd.get_dummies(all_data) # also doable with sklearn.preprocessing.OneHotEncoder but more complicated

In [None]:
# Check the shape again: one-hot encoding adds a column for every category level.
all_data.shape # Feature space became quite big

### You can add new features (example)

In [None]:
# Adding total sqfootage feature
#all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Model building

In [None]:
# Undo the earlier concat by position: the first `ntrain` rows are the training
# set, everything after them is the test set.
train = all_data.iloc[:ntrain]
test = all_data.iloc[ntrain:]

In [None]:
# Sanity check: the row count should equal `ntrain`.
train.shape

## Cross validation

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
import math

In [None]:
# Model the log of the sale price. log1p(y) = log(1 + y); its inverse is expm1.
y_train_log = np.log1p(y_train)

def rmse_cv(model, X=None, y=None, cv=5):
    """Return the per-fold cross-validated RMSE of `model`.

    Parameters
    ----------
    model : estimator
        scikit-learn style estimator, passed straight to cross_val_score.
    X : array-like, optional
        Feature matrix. Defaults to the notebook's `train.values`.
    y : array-like, optional
        Target vector. Defaults to the notebook's `y_train_log`.
    cv : int or cross-validation splitter, default 5
        Forwarded to cross_val_score.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold, on the scale of `y`.
    """
    # Resolve the defaults at call time so the function always sees the
    # current notebook globals.
    if X is None:
        X = train.values
    if y is None:
        y = y_train_log
    # The "neg_mean_squared_error" scorer returns the negated MSE, so flip the
    # sign before taking the square root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)

In [None]:
# Fix the seed so the forest's random choices are the same on every run; this
# keeps the CV score, feature importances and predictions reproducible.
model_rf = RandomForestRegressor(random_state=42)

In [None]:
# Cross-validated RMSE per fold; report the mean across folds.
score = rmse_cv(model_rf)
print("Random Forest score: {:.4f}".format(np.mean(score)))

In [None]:
# Fit on the full training set so that model_rf itself can be used for the
# feature importances and the test predictions below.
model_rf.fit(train,y_train_log)

In [None]:
# Feature importances of the fitted forest (paired with train.columns below).
f_i = model_rf.feature_importances_

In [None]:
# One row per feature, with the importance stored in a single column labelled 0.
feature_importance = pd.DataFrame(data=f_i, index=train.columns)
# nlargest already returns its rows in descending order of the chosen column,
# so sorting the frame beforehand is unnecessary.
feature_importance_top_15 = feature_importance.nlargest(15, 0)

In [None]:
# Bar chart of the 15 most important features. Draw on an explicit Axes so the
# figure can be labelled; without labels the x-axis would just read "0".
fig, ax = plt.subplots()
sns.barplot(y=feature_importance_top_15.index,x=feature_importance_top_15.loc[:, 0], ax=ax)
ax.set(title="Top 15 feature importances (random forest)", xlabel="Importance", ylabel="Feature");

## Hyperparameter optimization

In [None]:
from sklearn.model_selection import GridSearchCV

# 4. Make a Submission

In [None]:
# Predict on the test features. The model was fitted on log1p(SalePrice), so
# these predictions are on that log scale.
prediction = model_rf.predict(test)

In [None]:
# Undo the log1p transform that was applied to the target. expm1 is the exact
# inverse of log1p; plain exp would leave every predicted price too high by 1.
# Run this cell only once per prediction: it overwrites `prediction`.
prediction = np.expm1(prediction)

In [None]:
# Shape the predictions into the submission layout: the saved test ids as the
# index and a single 'SalePrice' column.
prediction = pd.DataFrame({'SalePrice': prediction}, index=test_ID)

In [None]:
# Inspect the first rows of the submission frame before writing it out.
prediction.head()

In [None]:
import os

# Create the output folder on first use; to_csv does not create missing directories.
os.makedirs('submissions', exist_ok=True)
# The frame's index (the saved test ids) is written as the first column.
prediction.to_csv('submissions/submission_starter_2.csv')