## Library imports, data imports, and initialisations

In [None]:
# Import required libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
color = sns.color_palette()
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import norm, probplot
from sklearn.preprocessing import StandardScaler

# Load in the training and testing datasets
# Both paths are relative, so they are resolved against the current working directory.
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

## Impute missing values
I've chosen to do this first, so that any effects it has on relationships or other aspects of this exploration/modeling can be accounted for early on.

In [None]:
def getMissingValues(df = None):
    """Summarise the columns of a dataset that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Dataset to inspect. Defaults to the global training dataset
        (df_train), so existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per column with at least one missing value, sorted by the
        number of missing values (descending), with the columns
        '# Missing' (count) and '% Missing' (percentage of all rows).
    """
    if df is None:
        df = df_train

    # Count once, keep only the affected columns, then derive the percentage
    # from the same counts so both columns share a single ordering.
    missingValuesTotal = df.isnull().sum()
    missingValuesTotal = missingValuesTotal[missingValuesTotal > 0].sort_values(ascending = False)
    missingValuesPercent = 100 * missingValuesTotal / len(df)

    return pd.concat([missingValuesTotal, missingValuesPercent], axis = 1, keys = ['# Missing', '% Missing'])

getMissingValues()

In [None]:
# Replace the NaN entries of one column with a fixed value, in both the
# training and the testing dataset
def fillMissingValues(var, fill):
    """Fill NaN values of column `var` with `fill` in df_train and df_test (in place)."""
    for frame in (df_train, df_test):
        frame[var] = frame[var].fillna(fill)
    
# Columns for which data_description.txt states that NaN means the feature
# does not exist; the percentages are the share of missing values
absentFeatureCols = [
    'PoolQC',        # 99.66% missing - no pool
    'MiscFeature',   # 96.29% missing - no misc features
    'Alley',         # 93.75% missing - no alley access
    'Fence',         # 80.76% missing - no fence
    'FireplaceQu',   # 47.42% missing - no fireplace
]

for cVar in absentFeatureCols:
    fillMissingValues(cVar, 'NA')

# LotFrontage w/ missing values: 17.8%
# In data_description.txt, no substitution is given for NaN LotFrontage characteristics; thus I'm going to use the median value of the row's corresponding neighbourhood.
# The medians are computed from the training data only and then applied to BOTH datasets, so the
# testing data is imputed consistently without its own values influencing the fill.
lotFrontageMedians = df_train.groupby('Neighborhood')['LotFrontage'].median()

# Fallback for a neighbourhood with no known LotFrontage in the training data
# (its median is NaN, or it is missing from the training data entirely).
lotFrontageFallback = df_train['LotFrontage'].median()

for frame in (df_train, df_test):
    neighbourhoodFill = frame['Neighborhood'].map(lotFrontageMedians)
    frame['LotFrontage'] = frame['LotFrontage'].fillna(neighbourhoodFill).fillna(lotFrontageFallback)

# In data_description.txt, NaN in the columns below indicates that the house has
# no garage / no basement / no masonry veneer.
#   GarageCond, GarageFinish, GarageType, GarageQual, GarageYrBlt w/ missing values: 5.57%
#   BsmtExposure, BsmtFinType2 w/ missing values: 2.61%
#   BsmtCond, BsmtFinType1, BsmtQual w/ missing values: 2.54%
#   MasVnrArea, MasVnrType w/ missing values: 0.55%

# Categorical columns: mark the absent feature as 'NA'
naFillCols = ['GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
              'BsmtExposure', 'BsmtFinType2', 'BsmtCond', 'BsmtFinType1', 'BsmtQual',
              'MasVnrType']
for cVar in naFillCols:
    fillMissingValues(cVar, 'NA')

# Numerical columns: an absent feature gets the value 0
zeroFillCols = ['GarageYrBlt', 'MasVnrArea']
for cVar in zeroFillCols:
    fillMissingValues(cVar, 0)

# Electrical w/ missing values: 0.07%
# In data_description.txt, no substitution is given for NaN Electrical characteristics;
# the one missing value is assumed to be an error, so its row is dropped from the training data.
df_train = df_train.dropna(subset = ['Electrical'])

# Confirm that all missing values have been fixed
getMissingValues()

## Clean data types
Convert all categorical columns that are stored as numbers to strings.

In [None]:
# Columns that hold numbers but are treated as categories
cols_numToObj = [
    'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold',
]

# Turn every value of those columns into its string form, in both datasets
for frame in (df_train, df_test):
    for col in cols_numToObj:
        frame[col] = frame[col].apply(str)

## Preliminary correlation inspections
Review correlation values between each set of characteristics, to identify if there are any redundant columns that could be dropped.

In [None]:
# Correlation matrix for all numerical characteristics
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on string
# columns in pandas >= 2.0 instead of silently skipping them.
corrMat = df_train.select_dtypes(include = 'number').corr()

# Determine which correlations are significant, and drop the others
# Assuming correlation values >= 0.8 are significant
corrThreshold = 0.8

# The matrix is symmetric, so the strictly lower triangle holds every pair of
# characteristics exactly once and excludes the diagonal (self-correlations).
lowerTriangle = np.tril(np.ones(corrMat.shape, dtype = bool), k = -1)
corrValues = corrMat.abs().where(lowerTriangle).stack()

# The comparison is False for NaN, so masked-out and undefined correlations are removed as well
corrValues = corrValues[corrValues >= corrThreshold]

# Sort and print the correlation values
print('Characteristics with significant correlations:\n{}\n'.format(corrValues.sort_values(ascending = False)))

# Print the figure
sns.set(font_scale = 1.0, rc = {'figure.figsize': (12, 10)})
sns.heatmap(corrMat, vmax = 0.8, square = True)

In [None]:
# SalePrice correlation matrix
#     Only including characteristics whose absolute correlation with SalePrice is at least corrMin

corrMin = 0.5
corrSalePrice = corrMat[corrMat['SalePrice'].abs() >= corrMin]['SalePrice'].sort_values(ascending = False)
print('Characteristics with correlation values over {}:\n{}\n'.format(corrMin, corrSalePrice.drop('SalePrice')))

# Selected characteristics (SalePrice included), ordered by their correlation with SalePrice
cols = corrSalePrice.index
print(cols)

# Pairwise correlations between the selected characteristics only
cm = np.corrcoef(df_train[cols].values.T)

# Number of characteristics shown in the heatmap
k = len(corrSalePrice)

sns.set(font_scale = 1.0, rc = {'figure.figsize': (10, 8)})
sns.heatmap(cm, cbar = True, annot = True, square = True,
            fmt = '0.2f', annot_kws = {'size': 10}, vmax = 0.8,
            xticklabels = cols.values, yticklabels = cols.values)

## Initial data exploration

In [None]:
# Report the (rows, columns) shape of each dataset
print(f'Size of the training dataset: {df_train.shape}')
print(f'Size of the testing dataset: {df_test.shape}\n')

# Summary statistics of SalePrice; the minimum shows whether any zero-values exist
df_train['SalePrice'].describe()

There are no zero-values, which is great for modelling. However, the results above indicate that there will be a right skew.

In [None]:
# Fitting params for SalePrice
print(f"Skewness: {df_train['SalePrice'].skew():f}")
print(f"Kurtosis: {df_train['SalePrice'].kurt():f}\n")

# Sale price histogram with a fitted normal distribution overlaid
sns.distplot(df_train['SalePrice'], fit = norm)
plt.ticklabel_format(style = 'plain', axis = 'y')
# Plotting a fitted density makes distplot normalise the histogram,
# so the y-axis shows a density rather than a count
plt.ylabel('Density')

# Normal probability plot, drawn on its own figure
fig = plt.figure()
res = probplot(df_train['SalePrice'], plot = plt)
plt.show()

Right skew (positive) confirmed. Noted for later, so that I can apply log transformations to normalise SalePrice.

Very strong correlations between the following:
* GarageArea & GarageCars
* GarageYrBlt & YearBuilt
* TotRmsAbvGrd & GrLivArea
* 1stFlrSF & TotalBsmtSF

I'll check which characteristic in each of the above pairs has a stronger correlation with SalePrice, and remove the other column.

For each of the strongly correlated pairs above, the correlation of each characteristic with SalePrice is given in brackets:
* GarageArea (0.623431) & GarageCars (0.640409)
* GarageYrBlt (less than 0.5)  & YearBuilt (0.522897)
* TotRmsAbvGrd (0.533723) & GrLivArea (0.708624)
* 1stFlrSF (0.605852) & TotalBsmtSF (0.613581)

Therefore the following columns will be removed later:
* GarageArea
* GarageYrBlt
* TotRmsAbvGrd
* 1stFlrSF

The following characteristics are worth exploring further:
* Numerical variables
  * GrLivArea
  * GarageCars
  * TotalBsmtSF
  * FullBath

* Categorical variables
  * OverallQual
  * YearBuilt
  * YearRemodAdd

In [None]:
# Fixing Seaborn's styling after it gets reset by the previous correlation matrices
sns.set_style('whitegrid')

def _labelAndShow(var, tickRotation = None):
    """Add the title and axis labels for a `var` vs SalePrice plot, then display it."""
    plt.title(f'Relationship between {var} and SalePrice')
    plt.xlabel(var)
    plt.ylabel('SalePrice')
    if tickRotation is not None:
        plt.xticks(rotation = tickRotation)
    plt.show()

# Numerical variables: one scatter plot each
variables = ['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
for var in variables:
    plt.scatter(x = df_train[var], y = df_train['SalePrice'])
    _labelAndShow(var)

# Categorical variables: one box plot each, with vertical tick labels
variables = ['OverallQual', 'YearBuilt', 'YearRemodAdd']
for var in variables:
    sns.boxplot(x = df_train[var], y = df_train['SalePrice'])
    _labelAndShow(var, tickRotation = 90)

The relationships are as expected, though there are a handful of outliers that will likely need to be dealt with. 

* GrLivArea and TotalBsmtSF have obvious outliers to the right of the plot; these will be excluded
* YearBuilt and YearRemodAdd both have outliers near the top of the plot; these may be excluded

I shall remove these outliers, but ignore outliers in characteristics not mentioned above (because their correlation to SalePrice is much lower, their outliers shouldn't have as much negative impact).

In [None]:
originalLen = len(df_train)

# YearBuilt and YearRemodAdd are converted to strings in the 'Clean data types' step,
# and comparing strings with a number raises a TypeError. Casting to int makes the
# comparisons below work whether the columns are still numeric or already strings.
yearBuilt = df_train['YearBuilt'].astype(int)
yearRemodAdd = df_train['YearRemodAdd'].astype(int)

outlierMask = (
    # Two outliers within the GrLivArea characteristic
    ((df_train['GrLivArea'] > 4500) & (df_train['SalePrice'] < 200000))
    # One outlier within the TotalBsmtSF characteristic
    | ((df_train['TotalBsmtSF'] > 6000) & (df_train['SalePrice'] < 200000))
    # Three outliers within the YearBuilt characteristic
    | ((yearBuilt < 2000) & (df_train['SalePrice'] > 600000))
    # Three outliers within the YearRemodAdd characteristic
    | ((yearRemodAdd < 2000) & (df_train['SalePrice'] > 600000))
)

# A row matching any of the conditions is removed, which is the same result as
# applying the four filters one after another
df_train = df_train.drop(df_train[outlierMask].index)

finalLen = len(df_train)
print('Original training size: {}\nNew training size: {}\n'.format(originalLen, finalLen))

.
.
.
.
.
.
.
.
.
.
