In [1]:
# Imports.
import math
import pandas as pd
import numpy as np
import matplotlib.patches as mpatches # For graph colours
import statsmodels.formula.api as sm # for training a linear regression model.
import seaborn as sns # For making correlation matrices
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix # for making confusion martrices Note: Not needed.
from sklearn.metrics import accuracy_score # Prediction accuracy: (tp + tn) / total
from sklearn.metrics import precision_score # Computing precision: tp / (tp + fp)
from sklearn.metrics import recall_score # Recall Score: tp / (tp + fn)
from sklearn.metrics import f1_score # F1 = 2 * (precision * recall) / (precision + recall)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

#For showing plots directly in the notebook run the command below
%matplotlib inline
%config IPCompleter.greedy=True

# Prevents tables from being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)



In [2]:
# Read csv file into a dataframe.
df = pd.read_csv('train.csv')

In [3]:
df.shape

(1460, 81)

In [4]:
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [5]:
df.tail(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,1Fam,1Story,5,6,1965,1965,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,CBlock,TA,TA,No,BLQ,830,LwQ,290,136,1256,GasA,Gd,Y,SBrkr,1256,0,0,1256,1,0,1,1,3,1,TA,6,Typ,0,,Attchd,1965.0,Fin,1,276,TA,TA,Y,736,68,0,0,0,0,,,,0,6,2008,WD,Normal,147500


In [6]:
df.select_dtypes(['object']).describe().T

Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [7]:
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinSF1         int64
BsmtFinType2      object
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
Heating           object


By preliminary analysis:<br>
 - if features are of data type float or int, put them in a list called "continuous_features". 
 - If they are of data type object, convert them to categorical and put them in a list called "categorical_features".
 - The **Id** column has doesn't contain any useful information. Therefore, we dropped it here.

In [21]:
# Id is the index of the dataset, doesn't contain any useful information. Therefore, we dropped it here.
df = df.drop('Id',axis=1)

In [19]:
# Convert features of type 'object' to type 'category'
for column in df.select_dtypes(['object']).columns:
    df[column] = df[column].astype('category')

# Make list of categorical columns
categorical_features = df.select_dtypes(['category']).columns.tolist()

# Make list of continuous columns
continuous_features = df.select_dtypes(['int', 'float64']).columns.tolist()

In [9]:
# Number of features
print("Total number of features: ", len(df.columns))
print("Number of continuous: ", len(continuous_features))
print("Number of categorical: ", len(categorical_features))

Total number of features:  80
Number of continuous:  37
Number of categorical:  43


In [17]:
# Finiding duplicates:
print("Duplicate columns: ")
print(df.columns.size - df.columns.unique().size)

print("Duplicate rows:")
print(df.duplicated()[df.duplicated() == True].shape[0])

Duplicate columns: 
0
Duplicate rows:
0


In [11]:
# Checking for constant columns

# Print count of unique values for 'continuous_column'
print("\n-Continuous Columns-\n")
print("{0:30}{1:<15}".format("Feature", "UniqueValues"))
for column in continuous_features:
    print("{0:30}{1:<15}".format(column, len(df[column].unique())))
    
print("\n-Categorical Columns-\n")
print("{0:30}{1:<15}".format("Feature", "UniqueValues"))

# Print count of unique values for 'categorical_columns'
for column in categorical_features:
    print("{0:30}{1:<15}".format(column, len(df[column].unique())))


-Continuous Columns-

Feature                       UniqueValues   
MSSubClass                    15             
LotFrontage                   111            
LotArea                       1073           
OverallQual                   10             
OverallCond                   9              
YearBuilt                     112            
YearRemodAdd                  61             
MasVnrArea                    328            
BsmtFinSF1                    637            
BsmtFinSF2                    144            
BsmtUnfSF                     780            
TotalBsmtSF                   721            
1stFlrSF                      753            
2ndFlrSF                      417            
LowQualFinSF                  24             
GrLivArea                     861            
BsmtFullBath                  4              
BsmtHalfBath                  3              
FullBath                      4              
HalfBath                      3              
BedroomAbvG

In [23]:
df[continuous_features].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0
BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.0,0.0,0.0,1474.0


No constant columns in dataset. However, the continuous feature list does contain a number of features with low numbers of unique values.

In [12]:
# Features in the continuous list with low counts
low_count_continuous = ['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
                        'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'PoolArea', 'MiscVal',
                        'MoSold', 'YrSold']

for column in low_count_continuous:
    print("{0:20} {1}".format(column, pd.unique(df[column].ravel())))

MSSubClass           [ 60  20  70  50 190  45  90 120  30  85  80 160  75 180  40]
OverallQual          [ 7  6  8  5  9  4 10  3  1  2]
OverallCond          [5 8 6 7 4 2 3 9 1]
BsmtFullBath         [1 0 2 3]
BsmtHalfBath         [0 1 2]
FullBath             [2 1 3 0]
HalfBath             [1 0 2]
BedroomAbvGr         [3 4 1 2 0 5 6 8]
KitchenAbvGr         [1 2 3 0]
TotRmsAbvGrd         [ 8  6  7  9  5 11  4 10 12  3  2 14]
Fireplaces           [0 1 2 3]
GarageCars           [2 3 1 0 4]
PoolArea             [  0 512 648 576 555 480 519 738]
MiscVal              [    0   700   350   500   400   480   450 15500  1200   800  2000   600
  3500  1300    54   620   560  1400  8300  1150  2500]
MoSold               [ 2  5  9 12 10  8 11  4  1  7  3  6]
YrSold               [2008 2007 2006 2009 2010]


Most of these values are continuous but low value. OverallQual and OverallCond are oridinal categorical. MoSold (month sold) is nominal.

MSSubClass (Nominal): Identifies the type of dwelling involved in the sale.	

```
020	1-STORY 1946 & NEWER ALL STYLES
030	1-STORY 1945 & OLDER
040	1-STORY W/FINISHED ATTIC ALL AGES
045	1-1/2 STORY - UNFINISHED ALL AGES
050	1-1/2 STORY FINISHED ALL AGES
060	2-STORY 1946 & NEWER
070	2-STORY 1945 & OLDER
075	2-1/2 STORY ALL AGES
080	SPLIT OR MULTI-LEVEL
085	SPLIT FOYER
090	DUPLEX - ALL STYLES AND AGES
120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
150	1-1/2 STORY PUD - ALL AGES
160	2-STORY PUD - 1946 & NEWER
180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
190	2 FAMILY CONVERSION - ALL STYLES AND AGES
```

Conclusion: MSSubClass, MoSold, OverallQual and OverallCond should be removed from the continuous features.

In [13]:
# Remove from continuous list
remove_from_continuous = ['MSSubClass', 'MoSold', 'OverallQual', 'OverallCond']
continuous_features = [x for x in continuous_features if x not in remove_from_continuous]

In [14]:
# Add to categorical
categorical_features.extend(remove_from_continuous)

In [15]:
# Remove index from categorical list
categorical_features.remove('Id')

ValueError: list.remove(x): x not in list

# 2 - Initial Findings

## Continuous Features

In [None]:
# Plot a histogram of the continuous features.
df[continuous_features].hist(figsize=(20,20), bins=20)
plt.show()

Several features have a huge count of a single value and a very small number of other values. These features may have to be excluded from the dataset. Features of note in this regard are:

* 3SsnPorch
* BsmtFinSF2
* BsmtHalfBath
* EnclosedPorch
* KitchenAbvGr
* LowQualFinSF
* MiscVal
* PoolArea
* ScreenPorch

In [None]:
# Plot box plots for all the continuous features
# Note: seaborn combined with matplotlib hide fliers (outliers). To avoid this, add sym="k." to the parameters
df[continuous_features].plot(kind='box', figsize=(15,30), subplots=True, layout=(9,4), sym="k.")
plt.show()

A few features appear to have extreme outliers that may have to be examined. These include:

* LotFrontage
* LotArea
* BsmtFinSF1
* TotalBsmtSF
* 1stFlrSF
* BsmtFullBath
* KitchenAbvGr
* EnclosedPorch
* PoolArea
* MiscVal

## Outliers

The following are the biggest ten values of the features identified to have outliers above:

In [None]:
# Searching for outliers - Upper tail
outliers = ['LotFrontage', 'LotArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', 'BsmtFullBath', 'KitchenAbvGr', 
            'EnclosedPorch', 'PoolArea', 'MiscVal']

# Code Source: Stackexchange - Zelazny7
# https://stackoverflow.com/questions/20477190/get-top-biggest-values-from-each-column-of-the-pandas-dataframe
def sorted(s, num):
    tmp = s.sort_values(ascending=False)[:num]  # earlier s.order(..)
    tmp.index = range(num)
    return tmp

df[outliers].apply(lambda x: sorted(x, 10)).T.round(2)

In [None]:
print("\nContinuous Features with outliers > 1.5 * upper 3rd quartile:\n")

print("{0:20} {1}".format('feaure', 'Max Value'))
print("-"*50)

for i in outliers:
    if df[i].max() > (df[i].quantile(.75) * 1.5):
        print("{0:20} {1}".format(i, df[i].max()))

## Categorical Features

In [None]:
# Unique values in categorical_columns
print("\nCategorical columns unique values:")
print("-"*100)

for column in categorical_features:
    print("{0:20} {1}".format(column, pd.unique(df[column].ravel())))

In [None]:
# Plot bar plots for all the categorical features
# Shape of categorical datasets is as follows:
fig, axes = plt.subplots(nrows=16, ncols=3, figsize=(25,70))
fig.subplots_adjust(hspace = 0.2)

axes_list = [item for sublist in axes for item in sublist]

for feature in categorical_features:
    ax = axes_list.pop(0)
    df[feature].value_counts().plot(kind='bar', ax=ax, grid=True, rot=1)
    ax.set_title(feature)

for ax in axes_list:
    ax.remove()

In [None]:
# The X axis labels on three of the above are hard to make out:
bigger_barcharts = ['Neighborhood', 'Exterior1st', 'Exterior2nd']

for feature in bigger_barcharts:
    df[feature].value_counts().plot(kind='bar', grid=True, rot=1, title=feature, figsize=(22,5))
    plt.show()

#### Low levels of variance

The following features had a very large number of a particular value and a very small number of other values:

```
['Street', 'LandContour', 'Utilities', 'LandSlope', 'RoofMatl', 'ExterCond', 'BsmtCond', 
 'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageQual', 'GarageCond', 
 'PavedDrive', 'MiscFeature', 'SaleType', 'SaleCondition']
```
#### Duplicate features?

What is the difference between the following features?
* Street & Ally  
_**Answer:** Street is the type of road access while Alley is the type of alley access._

* Condition1 & Condition2  
_**Answer:** Condition1 is the proximity to main road or railroad Condition2 is proximity to a 2nd main road or railroad (if there is more than one nearby)_

* Exterior1st & Exterior2nd  
_**Answer:** Exterior1st is the exterior covering on house, while Exterior2nd is the 2nd exterior covering on house (if more than one material)_

* BsmtFinType1 & BsmtFinType2  
_**Answer:** BsmtFinType1 is the quality of basement finished area, while BsmtFinType2 is the quality of second finished area (if present)._

* BsmtFinSF, BsmtFinSF1 & BsmtFinSF2  
_**Answer:** BsmtUnfSF is the unfinished square feet of basement area, BsmtFinSF1 is type 1 finished square feet and BsmtFinSF2 is type 2 finished square feet_

#### Nominal Categorical Features

```
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 
 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional', 
 'GarageType', 'GarageFinish', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 
 'MSSubClass']
 ```
 
#### Ordinal Categorical Features

```
['LandSlope', 'Condition1', 'Condition2', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'MoSold', 'OverallQual', 'OverallCond']
```

#### Binary Features

Utilities
Ally
CentralAir
PavedDrive - has unknown 3rd value 'P' in there too though!

## Null Values Counts

In [None]:
# Null values
df.isnull().sum()

#### Features with many null values

```
LotFrontage       259
Alley            1369
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinType2       38
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageQual         81
GarageCond         81
PoolQC           1453
Fence            1179
MiscFeature      1406
```

Where the number of null values is less than a hundred, the issue may be resolved by imputation. However, features missing higher numbers than this may need to be discarded. This will be dealt with on a feature by feature basis in the data cleaning section (Section 3).

## Mode Counts

In [None]:
# Mode - continuous features
print("\nContinuous features Mode Counts:")
print("-"*55)

print("{0:30} {1:<8} {2:<8} {3:<9}".format("Feature", "Mode", "Count", "%"))
print("-"*55)

for feature in continuous_features:
    feature_mode = df[feature].mode().iloc[0]
    mode_count = df[feature].value_counts()[feature_mode]
    print("{0:30} {1:<8.2f} {2:<8d} {3:<8.1f}".format(feature, feature_mode, mode_count, (mode_count/df[column].count())*100))

# Mode - continuous features
print("\nCategorical features Mode Counts:")
print("-"*55)

print("{0:30} {1:<8} {2:<8} {3:<9}".format("Feature", "Mode", "Count", "%"))
print("-"*55)

for feature in categorical_features:
    feature_mode = df[feature].mode().iloc[0]
    mode_count = df[feature].value_counts()[feature_mode]
    print("{0:30} {1:<10} {2:<8d} {3:<8.1f}".format(feature, feature_mode, mode_count, (mode_count/df[column].count())*100))


#### High Mode Count Features

There are a large number of features with a high percentage of a single value:

```
Continuous features

-------------------------------------------------------
Feature                        Mode     Count    %        
-------------------------------------------------------
BsmtFinSF2                     0.00     1293     88.6 
LowQualFinSF                   0.00     1434     98.2 
BsmtHalfBath                   0.00     1378     94.4 
KitchenAbvGr                   1.00     1392     95.3
EnclosedPorch                  0.00     1252     85.8    
3SsnPorch                      0.00     1436     98.4    
ScreenPorch                    0.00     1344     92.1    
PoolArea                       0.00     1453     99.5    
MiscVal                        0.00     1408     96.4 

Categorical features

-------------------------------------------------------
Feature                        Mode     Count    %        
-------------------------------------------------------
MSZoning                       RL         1151     78.8    
Street                         Pave       1454     99.6 
LandContour                    Lvl        1311     89.8    
Utilities                      AllPub     1459     99.9    
LotConfig                      Inside     1052     72.1    
LandSlope                      Gtl        1382     94.7  
Condition1                     Norm       1260     86.3    
Condition2                     Norm       1445     99.0    
BldgType                       1Fam       1220     83.6   
RoofStyle                      Gable      1141     78.2    
RoofMatl                       CompShg    1434     98.2    
ExterCond                      TA         1282     87.8    
BsmtCond                       TA         1311     89.8    
BsmtFinType2                   Unf        1256     86.0    
Heating                        GasA       1428     97.8    
CentralAir                     Y          1365     93.5    
Electrical                     SBrkr      1334     91.4    
Functional                     Typ        1360     93.2    
GarageQual                     TA         1311     89.8    
GarageCond                     TA         1326     90.8    
PavedDrive                     Y          1340     91.8    
SaleType                       WD         1267     86.8    
SaleCondition                  Normal     1198     82.1
```
Many of these features are nearly constant values and therefore likely have little predictive power.


# 3 - Resolving Data Quality Issues

# 4 - Feature Selection

In [None]:
# Correlation matrix using code found on https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
sns.set(style="white")

# Calculate correlation of all pairs of continuous features
corr = df[continuous_features].corr().round(2)

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(40, 40))

# Generate a custom colormap - blue and red
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1,
            square=True, xticklabels=True, yticklabels=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.yticks(rotation = 0)
plt.xticks(rotation = 90)

plt.show()

In [None]:
# Correlation of continuous variables with SalePrice, plotted.

fig, axes = plt.subplots(nrows=13, ncols=3, sharey=True, figsize=(15, 50))
fig.subplots_adjust(hspace = 0.5)

axes_list = [item for sublist in axes for item in sublist]

for feature in continuous_features:
    ax = axes_list.pop(0)
    
    df.plot(kind='scatter', x=feature, y='SalePrice', 
            label="%.3f" % df[[feature, 'SalePrice']].corr().as_matrix()[0,1], title=feature, ax=ax)

for ax in axes_list:
    ax.remove()  
    
plt.show()

# Linear Model

# Random Forest