# Ames Housing Prices Prediction

In this notebook, I show how to clean, explore, analyze and predict home prices based on permanent house qualities. I used only linear regression, using the total home square-footage ratio together with the overall quality of the house.

This notebook uses [data from Kaggle](https://www.kaggle.com/c/dsi-us-4-project-2-regression-challenge/data).

This is the final [submission to Kaggle](https://www.kaggle.com/c/dsi-us-4-project-2-regression-challenge/leaderboard).

In [None]:
# This file provides a basic exploration of the Ames house price dataset
# Import the necessary libraries
import csv
import os
import sqlite3

import matplotlib.pyplot as plt
import mpl_toolkits
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from patsy import dmatrices
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    # Very old scikit-learn releases only ship these helpers in the legacy module.
    from sklearn.cross_validation import train_test_split, cross_val_score
# Configure visual settings:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
sns.set_style('whitegrid')
# Default figure size used by plots that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (10.0, 8.0) 
plt.style.use('ggplot')
# Seed NumPy's global random number generator.
np.random.seed(2018)
# NOTE(review): three styling calls are layered in this cell (seaborn 'whitegrid',
# matplotlib 'ggplot', then sns.set()); presumably the last call overrides the
# earlier ones -- confirm which look is actually intended.
sns.set()

# Loading the data and EDA

In [None]:
# Load the Kaggle CSV files from ./data:
# `df` holds the training data and `test` holds the test data.
df = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [None]:
# Strip spaces and forward slashes out of every column label,
# then preview the first two rows.
df.columns = [label.replace(" ", "").replace("/", "") for label in df.columns]
df.head(2)

In [None]:
# Summary statistics of the training data.
df.describe()

In [None]:
# Column labels in sorted order (display only; `df` itself is not changed).
df.columns.sort_values()

In [None]:
# Raise the row display limit so the dtype listing below is not truncated.
pd.set_option('display.max_rows', 500)
# One line per column with its dtype, ordered by dtype.
print(df.dtypes.sort_values())

In [None]:
# Histogram of the target variable (30 bins).
df['SalePrice'].hist(bins=30)
plt.title('sale price distribution')
plt.show()

In [None]:
# Independent copy of the target column.
salePrice = df['SalePrice'].copy()
# Displays the values in ascending order; the result is not assigned,
# so `salePrice` keeps its original order.
salePrice.sort_values()

### Numerical Features

In [None]:
# Pairwise correlations between the float64/int64 columns, skipping the first two
# of them (`.iloc[:, 2:]`) -- presumably identifier columns; TODO confirm.
corr = df.select_dtypes(include=['float64','int64']).iloc[:, 2:].corr()
plt.figure(figsize=(12, 12))
# Heatmap of the correlation matrix with the colour scale capped at 1.
sns.heatmap(corr, vmax=1, square=True)
plt.show()

In [None]:
# Show the full correlation matrix computed in the previous cell.
corr

In [None]:
# Scatter plot with a fitted regression line: OverallQual against SalePrice.
sns.regplot(x = 'OverallQual', y = 'SalePrice', data = df, color = 'Orange')
plt.show()

In [None]:
# Scatter six size-related features against SalePrice on a 3x2 grid.
# `price` stays a notebook-level name because the YearBuilt/YearRemodAdd cell reuses it.
price = df.SalePrice.values
area_features = ['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'MasVnrArea']

# plt.subplots creates its own figure; a separate plt.figure() call beforehand
# would only leave an extra, empty figure in the cell output.
f, axarr = plt.subplots(3, 2, figsize=(10, 9))
# ravel() walks the grid row by row, so the features fill left-to-right, top-to-bottom.
for ax, feature in zip(axarr.ravel(), area_features):
    ax.scatter(df[feature].values, price)
    ax.set_title(feature)
# Shared y-axis label placed just left of the grid.
f.text(-0.01, 0.5, 'Sale Price', va='center', rotation='vertical', fontsize = 12)
plt.tight_layout()
plt.show()

In [None]:
# Two stacked scatter plots of construction / remodel year against SalePrice.
# Relies on `price` (the SalePrice values) assigned in the previous cell.
fig = plt.figure(2, figsize=(9, 7))
# Top panel.
plt.subplot(211)
plt.scatter(df.YearBuilt.values, price)
plt.title('YearBuilt')

# Bottom panel.
plt.subplot(212)
plt.scatter(df.YearRemodAdd.values, price)
plt.title('YearRemodAdd')

# Shared y-axis label placed just left of both panels.
fig.text(-0.01, 0.5, 'Sale Price', va = 'center', rotation = 'vertical', fontsize = 12)

plt.tight_layout()

### Categorical Features

In [None]:
# Names of the columns stored with dtype `object`.
print(df.select_dtypes(include=['object']).columns.values)

In [None]:
# Neighborhood: distribution of SalePrice within each neighborhood.
plt.figure(figsize=(12, 6))
sns.boxplot(x='Neighborhood', y='SalePrice', data=df)
# Rotate the tick labels; `xt` only captures the return value and is not used again in this cell.
xt = plt.xticks(rotation=45)
plt.show()

### Data Preprocessing

In [None]:
def list_nulls(df):
    """Print one line per column of ``df``: missing-value count, column name, dtype."""
    line_template = "NaN count: {}, {}, {}"
    for name in df.columns:
        column = df[name]
        n_missing = column.isnull().sum()
        print(line_template.format(n_missing, name, column.dtype))

In [None]:
def list_data(df):
    """Print a short summary of every column in ``df``.

    Columns whose dtype is ``object`` are summarised with ``value_counts()``;
    every other column is summarised with ``describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    None
        Output is printed only.
    """
    for col in df.columns:
        column = df[col]
        if column.dtype == 'object':
            print(column.value_counts())
        else:
            # This branch must pair with the `if`. Attached to the `for` it would run
            # once after the loop and describe only the last column.
            print(column.describe())

In [None]:
def impute_from(df, f_y, f_x, y_upper_limit=None, x_upper_limit=None):
    """Fill missing values of column ``f_y`` using a straight-line fit on ``f_x``.

    A degree-1 polynomial ``f_y ~ f_x`` is fitted on the rows where ``f_y`` is
    present, then evaluated at ``f_x`` for the rows where ``f_y`` is missing.
    A scatter plot of the rows used as fit candidates is shown as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, not modified.
    f_y : str
        Column to impute.
    f_x : str
        Predictor column.
    y_upper_limit, x_upper_limit : number, optional
        When BOTH are given, only rows with ``x < x_upper_limit`` and
        ``y <= y_upper_limit`` are used for the fit. If only one is given it is
        ignored and all rows with a known ``f_y`` are used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing ``f_y`` values filled in.
    """
    df = df.copy()
    missing = df[f_y].isnull()
    df_full = df.loc[~missing]
    x = df_full.loc[:, f_x]
    y = df_full.loc[:, f_y]
    plt.scatter(x, y)
    plt.xlabel(f_x)
    plt.ylabel(f_y)
    plt.show()
    if x_upper_limit is not None and y_upper_limit is not None:
        mask = (x < x_upper_limit) & (y <= y_upper_limit)
    else:
        mask = np.ones_like(x, dtype=bool)
    # Fit and impute for both branches. Nested under `else`, these two lines were
    # skipped whenever limits were supplied and the frame came back unchanged.
    p = np.polyfit(x[mask], y[mask], 1) # degree 1
    df.loc[missing, f_y] = np.polyval(p, df.loc[missing, f_x])
    return df

In [None]:
# Re-read the training CSV from disk, replacing the `df` used in the exploration above.
df = pd.read_csv('./data/train.csv')
# Strip spaces and forward slashes out of the column labels.
df.columns = df.columns.map(lambda x: x.replace(" ", "").replace("/",""))
# Drop the 'Order' and 'PID' columns -- presumably record identifiers; TODO confirm both exist in this CSV.
df = df.drop(['Order','PID'], axis=1)

In [None]:
def format_train_test_data(train, test):
    """Clean, transform and one-hot encode the train and test frames together.

    Steps, in order:

    1. Missing values in the "feature absent" columns are filled with the
       string 'None'; missing values in the area/count columns with 0;
       missing LotFrontage with the mean of the same frame.
    2. Numeric columns whose skew in ``train`` is outside [-2, 2] are replaced
       by ``log10(x + 1)`` in both frames.
    3. ``SalePrice`` in ``train`` is replaced by its ``log10``.
    4. Both frames are concatenated, one-hot encoded with ``pd.get_dummies``
       (so they share the same dummy columns) and split apart again.

    Both input frames are modified by steps 1-3, so calling this twice on the
    same frames applies the log transforms twice.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain ``SalePrice``, ``Id`` and the columns
        filled in step 1.
    test : pandas.DataFrame
        Test data with the same feature columns and no ``SalePrice``.

    Returns
    -------
    (train_dummy, test_dummy) : tuple of pandas.DataFrame
        Encoded frames, each sorted by ``Id``. ``train_dummy`` additionally
        carries the log10 ``SalePrice`` column.
    """
    # A missing value in these columns means the house lacks the feature
    # (alley, basement, fireplace, garage, pool, fence, miscellaneous feature).
    none_columns = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu',
                    'GarageType', 'BsmtFinType1', 'BsmtFinType2', 'GarageYrBlt',
                    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
                    'MiscFeature']
    # Continuous / count features where a missing value is treated as zero.
    zero_columns = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                    'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']

    for frame in (train, test):
        # Assign the filled column back instead of `frame.col.fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary Series and leaves the frame
        # untouched when pandas Copy-on-Write is active.
        for col in none_columns:
            frame[col] = frame[col].fillna('None')
        for col in zero_columns:
            frame[col] = frame[col].fillna(0)
        # Each frame uses its own LotFrontage mean.
        frame['LotFrontage'] = frame['LotFrontage'].fillna(frame['LotFrontage'].mean())

    # Log-transform strongly skewed numeric columns. Skew is measured on train only.
    # SalePrice is excluded: it is absent from test and gets its own transform below.
    skewed = [
        col for col in train._get_numeric_data().columns
        if col != 'SalePrice' and abs(train[col].skew()) > 2
    ]
    for col in skewed:
        # +1 keeps zero-valued entries finite under the logarithm.
        train[col] = np.log10(train[col] + 1)
        test[col] = np.log10(test[col] + 1)

    # apply log10 transformation to Sale Price
    train['SalePrice'] = np.log10(train['SalePrice'])

    # Encode train and test together so both end up with identical dummy columns.
    n_train = len(train)
    houses = pd.concat([train.drop('SalePrice', axis=1), test]).reset_index(drop=True)
    houses_dummy = pd.get_dummies(houses)

    # Split by position: the first n_train rows are the training rows. Hard-coded
    # row numbers dropped a row and only fit one dataset size, and sorting the
    # combined frame by Id first can interleave train and test rows.
    train_dummy = houses_dummy.iloc[:n_train, :].copy()
    test_dummy = houses_dummy.iloc[n_train:, :].copy()

    # Attach the target positionally so it does not depend on train's index labels.
    train_dummy['SalePrice'] = train['SalePrice'].values

    # Keep each output ordered by Id.
    train_dummy.sort_values(by='Id', inplace=True)
    test_dummy.sort_values(by='Id', inplace=True)

    return train_dummy, test_dummy


In [None]:
# Build the cleaned, one-hot encoded train and test frames.
# NOTE(review): `train` is not assigned in any cell visible above (only `df` and `test`
# are loaded) -- confirm where it is meant to come from; on a fresh kernel this call
# would raise NameError.
train_dummy, test_dummy = format_train_test_data(train, test)

In [None]:
# Last two rows of `train` as left by format_train_test_data (which modifies its inputs).
train.tail(2)

In [None]:
# Last two rows of the encoded training frame.
train_dummy.tail(2)

In [None]:
# First two rows of `test` as left by format_train_test_data (which modifies its inputs).
test.head(2)

In [None]:

# First two rows of the encoded test frame.
test_dummy.head(2)

In [None]:
# (rows, columns) of the encoded train and test frames.
train_dummy.shape, test_dummy.shape

In [None]:
# Histogram of the target after the log10 transform applied in format_train_test_data.
train_dummy.SalePrice.hist()

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Target: the SalePrice column. Features: every other column of train_dummy.
y = train_dummy.SalePrice
is_feature = train_dummy.columns != 'SalePrice'
X = train_dummy.loc[:, is_feature]

# Hold out part of the labelled data for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2018)

# Accuracy

In [None]:
from sklearn.metrics import mean_squared_error
def mylog_rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    No logarithm is taken here: both inputs are expected to be on the log scale
    already, which is what makes the result a "log RMSE".

    Input: true values of y, predicted values of y
    Output: square root of their mean squared error
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Checking Outliers

In [None]:
def skewed_columns(train):
    """List the numeric columns of ``train`` whose skew in ``X_train`` is outside [-2, 2].

    ``train`` only supplies the candidate column names; the skew itself is
    measured on the notebook-level ``X_train``, and candidates that are not
    columns of ``X_train`` are skipped.
    """
    candidates = [
        name for name in train._get_numeric_data().columns
        if name in X_train.columns
    ]
    return [name for name in candidates if abs(X_train[name].skew()) > 2]

In [None]:
# Columns that are still strongly skewed in the training split.
skewed_columns(train)

In [None]:
# Histogram of each skewed column on a 3x3 grid.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10,5))

# zip stops at the shorter sequence, so at most nine columns are drawn
# and any unused axes stay empty.
for column, ax in zip(skewed_columns(train), axes.ravel()):
    ax.hist(train_dummy[column])
    ax.set_title('Histogram of \n{}'.format(column), fontsize=8)
fig.tight_layout()
plt.show()

In [None]:
# Scatter of each skewed column against SalePrice on a 3x3 grid.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(10,5))

# zip stops at the shorter sequence, so at most nine columns are drawn
# and any unused axes stay empty.
for column, ax in zip(skewed_columns(train), axes.ravel()):
    ax.scatter(train_dummy[column], train_dummy.SalePrice)
    ax.set_title('Sale Price vs \n{}'.format(column), fontsize=8)
fig.tight_layout()
plt.show()

<br> As can be seen above, most of the extreme values appear in variables describing features that most houses do not possess, hence the many zeroes and the scattered extreme values.<br>

# Outliers of dataset

In [None]:
# Display GrLivArea cast to int. The result is not assigned, so train_dummy is unchanged.
train_dummy.GrLivArea.astype(int)

In [None]:
# Display Id with missing values shown as 0. The result is not assigned, so train_dummy is unchanged.
train_dummy.Id.fillna(0)

In [None]:
# Scatter GrLivArea against the (log10) SalePrice and label the large-area outliers
# with their Id. The cut-off is the same one the removal step uses (GrLivArea < 4000
# is kept), so the labelled points are exactly the ones that get removed.
GRLIVAREA_OUTLIER_CUTOFF = 4000

fig, ax = plt.subplots()
ax.scatter(train_dummy.GrLivArea, train_dummy.SalePrice, alpha=0.5)
ax.set(xlabel='Gr Liv Area', ylabel='Sale Price')
for point_id, x, y in zip(train_dummy.Id, train_dummy.GrLivArea, train_dummy.SalePrice):
    if x >= GRLIVAREA_OUTLIER_CUTOFF:
        # Offset the label from the point and connect the two with an arrow.
        ax.annotate(point_id, xy=(x, y), xytext=(x+1000, y+0.5), arrowprops=dict(arrowstyle = '->'))

plt.show()

In [None]:
# Remove the large-area outliers (keep only rows with GrLivArea below 4000).
train_dummy = train_dummy.loc[train_dummy.GrLivArea < 4000, :].copy()

# X, y and the train/test split were derived from train_dummy before this filter.
# Rebuild them here, otherwise the models below are still fitted on the removed rows.
X = train_dummy.loc[:, train_dummy.columns != 'SalePrice']
y = train_dummy.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2018)

train_dummy.head()

# Ridge Regression

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Ridge with standardized X

In [None]:
def do_grid_search(estimator, param_grid):
    
    grid_search = GridSearchCV(estimator, 
                               param_grid, 
                               n_jobs=-1, 
                               cv=5, 
                               scoring='mse')
    %time grid_search.fit(X_train, y_train)

    print("Test set log adj RMSE: {:.5f}".format(mylog_rmse(y_test, grid_search.predict(X_test))))
    print("Best parameters: {}".format(grid_search.best_params_))
    print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

    results = pd.DataFrame(grid_search.cv_results_)
    
    return grid_search, results

In [None]:
# Stand-alone 5-fold search scored directly with mylog_rmse.
# The estimator and grid are defined here so the cell runs on a fresh kernel;
# they mirror the Ridge pipeline and alpha grid used with do_grid_search.
estimator = make_pipeline(StandardScaler(), Ridge(random_state=0))
param_grid = {'ridge__alpha': [0.1, 1, 10] + list(range(20, 200, 20)) + [1000]}

# A custom metric has to be wrapped with make_scorer; a bare function name is not
# a valid scoring string. greater_is_better=False makes the search minimise the error.
grid_search = GridSearchCV(estimator,
                           param_grid,
                           n_jobs=-1,
                           cv=5,
                           scoring=metrics.make_scorer(mylog_rmse, greater_is_better=False))

In [None]:
# Fit the grid search defined in the previous cell on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Candidate regularisation strengths for the pipeline's Ridge step:
# 0.1, 1, 10, then 20..180 in steps of 20, then 1000.
param_grid={'ridge__alpha': [0.1, 1, 10] + list(range(20,200,20)) + [1000]
           }
# Tune a standardise-then-Ridge pipeline; keeps the fitted search and its results table.
grid_search, results = do_grid_search(estimator = make_pipeline(StandardScaler(), Ridge(random_state=0)), 
                                      param_grid = param_grid)

In [None]:
# def do_grid_search(estimator, param_grid):
#     grid_search = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
#     %time grid_search.fit(X_train, y_train)
#     print("Test set log adj RMSE: {:.5f}".format(mylog_rmse(y_test, grid_search.predict(X_test))))
#     print("Best parameters: {}".format(grid_search.best_params_))
#     print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
#     results = pd.DataFrame(grid_search.cv_results_)
#     return grid_search, results

In [None]:
# Same search as above, but the returned (grid_search, results) tuple is not assigned,
# so it is only shown as the cell output.
do_grid_search(estimator = make_pipeline(StandardScaler(), Ridge(random_state=0)), 
                                      param_grid = param_grid)

In [None]:
# NOTE(review): this cell repeats an earlier Ridge grid-search cell verbatim -- confirm
# whether both are needed.
# Candidate regularisation strengths for the pipeline's Ridge step.
param_grid={'ridge__alpha': [0.1, 1, 10] + list(range(20,200,20)) + [1000]
           }
grid_search, results = do_grid_search(estimator = make_pipeline(StandardScaler(), Ridge(random_state=0)), 
                                      param_grid = param_grid)

In [None]:
# First rows of the cross-validation results table returned by do_grid_search.
results.head()