In [None]:
# Import required libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
color = sns.color_palette()
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import norm, probplot

In [None]:
# Load the training split and preview its first rows
train_csv = '../input/house-prices-advanced-regression-techniques/train.csv'
df_train = pd.read_csv(train_csv)
df_train.head()

In [None]:
# Load the testing split and preview its first rows
test_csv = '../input/house-prices-advanced-regression-techniques/test.csv'
df_test = pd.read_csv(test_csv)
df_test.head()

In [None]:
# Report the (rows, columns) shape of each dataset
size_reports = (
    ('Size of the training dataset: {}', df_train),
    ('Size of the testing dataset: {}\n', df_test),
)
for template, frame in size_reports:
    print(template.format(frame.shape))

# Summary statistics of the target; the reported minimum shows whether any
# SalePrice entries are zero
sale_price = df_train['SalePrice']
sale_price.describe()

In [None]:
# Shape statistics of the SalePrice distribution
sale_price = df_train['SalePrice']
print('Skewness: %f' % sale_price.skew())
print('Kurtosis: %f\n' % sale_price.kurt())

# Histogram of SalePrice with a fitted normal curve overlaid
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the
# installed version before upgrading.
hist_ax = sns.distplot(sale_price, fit = norm)
hist_ax.ticklabel_format(style = 'plain', axis = 'y')
hist_ax.set_ylabel('Frequency')

# Normal probability plot, drawn on a separate figure
fig = plt.figure()
res = probplot(sale_price, plot = plt)
plt.show()

In [None]:
# Correlation matrix for all numeric characteristics
#     Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer
#     drops them silently inside DataFrame.corr() and raises on string data.
corrMat = df_train.select_dtypes(include = ['number', 'bool']).corr()

# Determine which correlations are significant, and drop the others
#     unstack() labels every entry as (column, row). Keeping only the strict
#     upper triangle (row position < column position) retains one copy of each
#     off-diagonal pair and leaves out the diagonal of self-correlations.
corrThreshold = 0.8
upperTriangle = np.triu(np.ones(corrMat.shape, dtype = bool), k = 1)
corrValues = corrMat.abs().where(upperTriangle).unstack()

# Masked entries and undefined correlations are NaN; NaN fails the comparison,
# so only genuinely significant pairs remain
corrValues = corrValues[corrValues >= corrThreshold]

# Sort and print the correlation values
print('Characteristics with significant correlations:\n{}\n'.format(corrValues.sort_values(ascending = False)))

# Print the figure
sns.set(font_scale = 1.0, rc = {'figure.figsize': (12, 10)})
sns.heatmap(corrMat, vmax = 0.8, square = True)
plt.show()

In [None]:
# SalePrice correlation matrix
#     Only including characteristics whose absolute correlation with SalePrice
#     is at least corrMin

corrMin = 0.5
corrSalePrice = corrMat[corrMat['SalePrice'].abs() >= corrMin]['SalePrice'].sort_values(ascending = False)
print('Characteristics with correlation values over {}:\n{}'.format(corrMin, corrSalePrice))

# Plot exactly the characteristics printed above. Selecting them with a signed
# nlargest() would replace any strong negative correlate by a weaker positive
# one, because the filter above works on absolute values.
cols = corrSalePrice.index
k = len(cols)

# Reuse the already computed matrix instead of np.corrcoef on the raw values:
# DataFrame.corr() excludes missing values pairwise, whereas np.corrcoef
# propagates a single NaN to the whole row and column of that characteristic.
cm = corrMat.loc[cols, cols].values

sns.set(font_scale = 1.0, rc = {'figure.figsize': (10, 8)})
sns.heatmap(cm, cbar = True, annot = True, square = True,
            fmt = '0.2f', annot_kws = {'size': 10}, vmax = 0.8,
            xticklabels = cols.values, yticklabels = cols.values)
plt.show()