# Select Features

This notebook selects the features to be used in the analysis.  At the end of the notebook, an output dataset called 'train_select' is created and will be used in the next notebook, Final_Data_Processing.  **You may run this entire notebook all at once.**

The features are categorized as GENERAL, HOUSE INTERIOR, and HOUSE EXTERIOR according to the aspects of the house they describe.  They are further classified into subcategories.

**Note**: CAT = categorical; CONT = continuous

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('./Data/train')
m = df.shape[0]

# GENERAL

## 1. TYPE

### a. MSSubClass (CAT)

In [3]:
def impute_MSSubClass(cols):
    """Collapse MSSubClass into 'PUD' / 'NotPUD'.

    Args:
        cols: Two-item row ``(Id, MSSubClass)``, e.g. the row Series handed
            over by ``DataFrame.apply(axis=1)``.

    Returns:
        'PUD' for codes 120, 150, 160 and 180; 'NotPUD' for any other code
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, ms_subclass = cols
    if pd.isnull(ms_subclass):
        return 'NotPUD'
    return 'PUD' if ms_subclass in (120, 150, 160, 180) else 'NotPUD'
        
df['MSSubClass'] = df[['Id','MSSubClass']].apply(impute_MSSubClass,axis=1)

### b. BldgType (CAT)

In [4]:
def impute_BldgType(cols):
    """Recode BldgType into three groups.

    Args:
        cols: Two-item row ``(Id, BldgType)``.

    Returns:
        'THBldgType' for 'TwnhsE'/'Twnhs', 'OthBldgType' for
        'Duplex'/'2fmCon' and for missing values, '1FamBldgType' otherwise.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bldg_type = cols
    if pd.isnull(bldg_type) or bldg_type in ('Duplex', '2fmCon'):
        return 'OthBldgType'
    if bldg_type in ('TwnhsE', 'Twnhs'):
        return 'THBldgType'
    return '1FamBldgType'
        
df['BldgType'] = df[['Id','BldgType']].apply(impute_BldgType,axis=1)

### c. HouseStyle (CAT)

In [5]:
def impute_HouseStyle(cols):
    """Recode HouseStyle by number of stories.

    Args:
        cols: Two-item row ``(Id, HouseStyle)``.

    Returns:
        '1.0Story', '1.5Story' or '2.0Story' for the matching styles;
        'OthStory' for every other style and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, house_style = cols
    if pd.isnull(house_style):
        return 'OthStory'
    if house_style == '1Story':
        return '1.0Story'
    if house_style in ('1.5Fin', '1.5Unf'):
        return '1.5Story'
    if house_style == '2Story':
        return '2.0Story'
    return 'OthStory'
        
df['HouseStyle'] = df[['Id','HouseStyle']].apply(impute_HouseStyle,axis=1)

## 2. Location/Neighborhood

### a. MSZoning (CAT)

In [6]:
def impute_MSZoning(cols):
    """Recode MSZoning into 'RLZone', 'RMZone' or 'OthZone'.

    Args:
        cols: Two-item row ``(Id, MSZoning)``.

    Returns:
        'RLZone' for 'RL', 'RMZone' for 'RM', and 'OthZone' for anything
        else including missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, zoning = cols
    if pd.isnull(zoning):
        return 'OthZone'
    if zoning == 'RL':
        return 'RLZone'
    return 'RMZone' if zoning == 'RM' else 'OthZone'
        
df['MSZoning'] = df[['Id','MSZoning']].apply(impute_MSZoning,axis=1)

### b. Neighborhood (CAT; recoded into a new feature called 'Quadrant')

In [7]:
def impute_Neighborhood(cols):
    """Map a Neighborhood code to one of four quadrant labels.

    Args:
        cols: Two-item row ``(Id, Neighborhood)``.

    Returns:
        'Quad2', 'Quad3' or 'Quad4' for the neighborhoods listed below;
        'Quad1' for every other neighborhood and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, neighborhood = cols
    if pd.isnull(neighborhood):
        return 'Quad1'
    if neighborhood in ('Blueste', 'CollgCr', 'Crawfor', 'Edwards', 'SWISU', 'Timber'):
        return 'Quad3'
    if neighborhood in ('IDOTRR', 'MeadowV', 'Mitchel'):
        return 'Quad4'
    if neighborhood in ('Sawyer', 'SawyerW', 'Veenker'):
        return 'Quad2'
    # Quad1 is the default bucket. It covers Blmngtn, BrDale, BrkSide,
    # ClearCr, Gilbert, NAmes, NoRidge, NPkVill, NridgHt, NWAmes, OldTown,
    # Somerst, StoneBr and any code not listed above.
    return 'Quad1'
        
df['Quadrant'] = df[['Id','Neighborhood']].apply(impute_Neighborhood,axis=1)

In [8]:
# Make sure to drop feature:
df.drop('Neighborhood',axis=1,inplace=True)

### c. Condition1 (CAT)

In [9]:
def impute_Condition1(cols):
    """Recode Condition1 as normal ('NCond1') or not ('ANCond1').

    Args:
        cols: Two-item row ``(Id, Condition1)``.

    Returns:
        'NCond1' when the value is 'Norm'; 'ANCond1' for any other value
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, condition = cols
    if pd.isnull(condition) or condition != 'Norm':
        return 'ANCond1'
    return 'NCond1'
 
df['Condition1'] = df[['Id','Condition1']].apply(impute_Condition1,axis=1)

### d. Condition2

In [10]:
# Make sure to drop feature:
df.drop('Condition2',axis=1,inplace=True)

## 3. Time

### a. YearBuilt (CONT)

In [11]:
#For continuous YearBuilt:

def impute_YearBuilt(cols):
    """Fill a missing YearBuilt with 1971; pass other values through.

    Args:
        cols: Two-item row ``(Id, YearBuilt)``.

    Returns:
        1971 when YearBuilt is missing, otherwise YearBuilt unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, year_built = cols
    # 1971 was recorded by the original author as the mean build year of
    # the training set; it is not recomputed here.
    return 1971 if pd.isnull(year_built) else year_built
 
df['YearBuilt'] = df[['Id','YearBuilt']].apply(impute_YearBuilt,axis=1)

### b. YearRemodAdd (CAT; recoded into a new feature called 'Remodel')

In [12]:
def check_RemodAdd(cols):
    """Flag whether a house was remodelled after it was built.

    Args:
        cols: Three-item row ``(Id, YearBuilt, YearRemodAdd)``.

    Returns:
        'NRemod' when YearRemodAdd is missing or equals YearBuilt,
        'Remod' otherwise.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, year_built, year_remod = cols
    if pd.isnull(year_remod) or year_built == year_remod:
        return 'NRemod'
    return 'Remod'

df['Remodel'] = df[['Id','YearBuilt','YearRemodAdd']].apply(check_RemodAdd,axis=1)

In [13]:
# Make sure to drop original feature:
df.drop('YearRemodAdd',axis=1,inplace=True)

### c. MoSold

In [14]:
# Make sure to drop feature:
df.drop('MoSold',axis=1,inplace=True)

### d. YrSold (CAT)

In [15]:
def impute_YrSold(cols):
    """Return YrSold as a categorical string, imputing '2008' when missing.

    Args:
        cols: Two-item row ``(Id, YrSold)``.

    Returns:
        '2008' for a missing year, otherwise the year as a string with no
        decimal part (2008 and 2008.0 both give '2008').
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, yr_sold = cols
    if pd.isnull(yr_sold):
        # 2008 was recorded by the original author as the mean sale year
        # of the training set.
        return '2008'
    # A column that contains NaN is float-typed, so a plain str() would give
    # '2008.0' and create a category distinct from the imputed '2008'.
    if isinstance(yr_sold, float) and yr_sold.is_integer():
        yr_sold = int(yr_sold)
    return str(yr_sold)

df['YrSold'] = df[['Id','YrSold']].apply(impute_YrSold,axis=1)

## 4.  Sale

### a. SaleType (CAT)

In [16]:
def impute_SaleType(cols):
    """Recode SaleType as 'ConvSale' or 'UnConvSale'.

    Args:
        cols: Two-item row ``(Id, SaleType)``.

    Returns:
        'ConvSale' when the value is 'WD'; 'UnConvSale' for any other value
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, sale_type = cols
    if not pd.isnull(sale_type) and sale_type == 'WD':
        return 'ConvSale'
    return 'UnConvSale'

df['SaleType'] = df[['Id','SaleType']].apply(impute_SaleType,axis=1)

### b. SaleCondition (CAT)

In [17]:
def impute_SaleCondition(cols):
    """Recode SaleCondition as 'NormSale' or 'ANormSale'.

    Args:
        cols: Two-item row ``(Id, SaleCondition)``.

    Returns:
        'NormSale' when the value is 'Normal'; 'ANormSale' for any other
        value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, sale_condition = cols
    if not pd.isnull(sale_condition) and sale_condition == 'Normal':
        return 'NormSale'
    return 'ANormSale'

df['SaleCondition'] = df[['Id','SaleCondition']].apply(impute_SaleCondition,axis=1)

## 5. Utilities

### a. Utilities

In [18]:
# Make sure to drop feature:
df.drop('Utilities',axis=1,inplace=True)

### b. Heating

In [19]:
# Make sure to drop feature:
df.drop('Heating',axis=1,inplace=True)

### c. HeatingQC (CAT)

In [20]:
def impute_HeatingQC(cols):
    """Recode HeatingQC into excellent / good / average.

    Args:
        cols: Two-item row ``(Id, HeatingQC)``.

    Returns:
        'ExcHeat' for 'Ex', 'GoodHeat' for 'Gd', and 'AveHeat' for every
        other value ('TA', 'Fa', 'Po', unknown codes) and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, heating_qc = cols
    if pd.isnull(heating_qc):
        return 'AveHeat'
    if heating_qc == 'Ex':
        return 'ExcHeat'
    if heating_qc == 'Gd':
        return 'GoodHeat'
    return 'AveHeat'

df['HeatingQC'] = df[['Id','HeatingQC']].apply(impute_HeatingQC,axis=1)

### d. CentralAir (CAT)

In [21]:
def impute_CentralAir(cols):
    """Recode CentralAir as 'CentAir' or 'NCentAir'.

    Args:
        cols: Two-item row ``(Id, CentralAir)``.

    Returns:
        'CentAir' when the value is 'Y'; 'NCentAir' for any other value and
        for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, central_air = cols
    if not pd.isnull(central_air) and central_air == 'Y':
        return 'CentAir'
    return 'NCentAir'

df['CentralAir'] = df[['Id','CentralAir']].apply(impute_CentralAir,axis=1)

### e. Electrical (CAT)

In [22]:
def impute_Electrical(cols):
    """Recode Electrical as 'StdCBrkr' or 'OthElectr'.

    Args:
        cols: Two-item row ``(Id, Electrical)``.

    Returns:
        'StdCBrkr' when the value is 'SBrkr'; 'OthElectr' for any other
        value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, electrical = cols
    if not pd.isnull(electrical) and electrical == 'SBrkr':
        return 'StdCBrkr'
    return 'OthElectr'
        
df['Electrical'] = df[['Id','Electrical']].apply(impute_Electrical,axis=1)

# HOUSE INTERIOR

## 1. Assessment

### a. OverallQual (CAT)

In [23]:
def impute_OverallQual(cols):
    """Bucket the OverallQual rating into three labels.

    Args:
        cols: Two-item row ``(Id, OverallQual)``.

    Returns:
        'AAOAQual' for ratings 6-10, 'AOAQual' for rating 5, and 'BAOAQual'
        for any other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, rating = cols
    if pd.isnull(rating):
        return 'BAOAQual'
    if rating in (6, 7, 8, 9, 10):
        return 'AAOAQual'
    return 'AOAQual' if rating == 5 else 'BAOAQual'
        
df['OverallQual'] = df[['Id','OverallQual']].apply(impute_OverallQual,axis=1)

### b. OverallCond (CAT)

In [24]:
def impute_OverallCond(cols):
    """Bucket the OverallCond rating into three labels.

    Args:
        cols: Two-item row ``(Id, OverallCond)``.

    Returns:
        'AAOACond' for ratings 6-10, 'AOACond' for rating 5, and 'BAOACond'
        for any other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, rating = cols
    if pd.isnull(rating):
        return 'BAOACond'
    if rating in (6, 7, 8, 9, 10):
        return 'AAOACond'
    return 'AOACond' if rating == 5 else 'BAOACond'
        
df['OverallCond'] = df[['Id','OverallCond']].apply(impute_OverallCond,axis=1)

### c. Functional (CAT)

In [25]:
def impute_Functional(cols):
    """Recode Functional as 'TypFunc' or 'NTypFunc'.

    Args:
        cols: Two-item row ``(Id, Functional)``.

    Returns:
        'TypFunc' when the value is 'Typ'; 'NTypFunc' for any other value
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, functional = cols
    if not pd.isnull(functional) and functional == 'Typ':
        return 'TypFunc'
    return 'NTypFunc'
        
df['Functional'] = df[['Id','Functional']].apply(impute_Functional,axis=1)

## 2. Basement

### a. BsmtQual (CAT)

In [26]:
def impute_BsmtQual(cols):
    """Recode BsmtQual as 'GoodBsmtQ' or 'No-AvBsmtQ'.

    Args:
        cols: Two-item row ``(Id, BsmtQual)``.

    Returns:
        'GoodBsmtQ' for 'Ex'/'Gd'; 'No-AvBsmtQ' for every other value
        (including 'TA', 'Fa', 'Po', 'NA') and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bsmt_qual = cols
    if not pd.isnull(bsmt_qual) and bsmt_qual in ('Ex', 'Gd'):
        return 'GoodBsmtQ'
    return 'No-AvBsmtQ'
        
df['BsmtQual'] = df[['Id','BsmtQual']].apply(impute_BsmtQual,axis=1)

### b. BsmtCond (CAT)

In [27]:
def impute_BsmtCond(cols):
    """Recode BsmtCond as at-least-average or below-average.

    Args:
        cols: Two-item row ``(Id, BsmtCond)``.

    Returns:
        '>=AvBsmtCond' for 'Ex'/'Gd'/'TA'; '<AvBsmtCond' for any other value
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bsmt_cond = cols
    if not pd.isnull(bsmt_cond) and bsmt_cond in ('Ex', 'Gd', 'TA'):
        return '>=AvBsmtCond'
    return '<AvBsmtCond'
        
df['BsmtCond'] = df[['Id','BsmtCond']].apply(impute_BsmtCond,axis=1)

### c. BsmtExposure (CAT)

In [28]:
def impute_949(cols):
    """Patch the missing BsmtExposure of the single row with Id 949.

    Args:
        cols: Two-item row ``(Id, BsmtExposure)``.

    Returns:
        'No' when the Id is 949 and BsmtExposure is missing; otherwise
        BsmtExposure unchanged (still missing for other rows).
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    row_id, bsmt_exposure = cols
    if pd.isnull(bsmt_exposure) and row_id == 949:
        return 'No'
    return bsmt_exposure
        
df['BsmtExposure'] = df[['Id','BsmtExposure']].apply(impute_949,axis=1)

In [29]:
def impute_BsmtExposure(cols):
    """Recode BsmtExposure into four exposure labels.

    Args:
        cols: Two-item row ``(Id, BsmtExposure)``.

    Returns:
        'MinBsmtExpo' for 'Mn', 'AvBsmtExpo' for 'Av', 'GoodBsmtExpo' for
        'Gd'; 'NoBsmtExpo' for any other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bsmt_exposure = cols
    # Missing observations are recoded as no basement exposure.
    if pd.isnull(bsmt_exposure):
        return 'NoBsmtExpo'
    if bsmt_exposure == 'Mn':
        return 'MinBsmtExpo'
    if bsmt_exposure == 'Av':
        return 'AvBsmtExpo'
    if bsmt_exposure == 'Gd':
        return 'GoodBsmtExpo'
    return 'NoBsmtExpo'
        
df['BsmtExposure'] = df[['Id','BsmtExposure']].apply(impute_BsmtExposure,axis=1)

### d. BsmtFinType1 (CAT)

In [30]:
def impute_BsmtFinT1(cols):
    """Recode BsmtFinType1 into good / average / bad finish labels.

    Args:
        cols: Two-item row ``(Id, BsmtFinType1)``.

    Returns:
        'GdBsmtFinT1' for 'GLQ', 'AvBsmtFinT1' for 'ALQ'/'Rec', and
        'BdBsmtFinT1' for every other value ('BLQ', 'LwQ', 'Unf', 'NA',
        unknown codes) and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, fin_type = cols
    if pd.isnull(fin_type):
        return 'BdBsmtFinT1'
    # Compare with ==: `in ('GLQ')` is a substring test against the string
    # 'GLQ', so values such as 'G' or 'LQ' would match as well.
    if fin_type == 'GLQ':
        return 'GdBsmtFinT1'
    if fin_type in ('ALQ', 'Rec'):
        return 'AvBsmtFinT1'
    # Explicit fallback so an unlisted code never yields None.
    return 'BdBsmtFinT1'
        
df['BsmtFinType1'] = df[['Id','BsmtFinType1']].apply(impute_BsmtFinT1,axis=1)

### e. BsmtFinSF1

In [31]:
"""
#Exclude for now becaue many zeroes
def impute_BsmtFinSF1(cols):
    Id = cols[0]
    BsmtFinSF1 = cols[1]
    
    if pd.isnull(BsmtFinSF1):
        return 652 # Impute missing observation using the mean of the non-zero observations in training set
    else:
        return BsmtFinSF1

df['BsmtFinSF1'] = df[['Id','BsmtFinSF1']].apply(impute_BsmtFinSF1,axis=1)
"""

"\n#Exclude for now becaue many zeroes\ndef impute_BsmtFinSF1(cols):\n    Id = cols[0]\n    BsmtFinSF1 = cols[1]\n    \n    if pd.isnull(BsmtFinSF1):\n        return 652 # Impute missing observation using the mean of the non-zero observations in training set\n    else:\n        return BsmtFinSF1\n\ndf['BsmtFinSF1'] = df[['Id','BsmtFinSF1']].apply(impute_BsmtFinSF1,axis=1)\n"

In [32]:
# Make sure to delete:
df.drop('BsmtFinSF1',axis=1,inplace=True)

### f. BsmtFinType2 (CAT)

In [33]:
def impute_333(cols):
    """Patch the missing BsmtFinType2 of the single row with Id 333.

    Args:
        cols: Two-item row ``(Id, BsmtFinType2)``.

    Returns:
        'NA' when the Id is 333 and BsmtFinType2 is missing; otherwise
        BsmtFinType2 unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    row_id, fin_type2 = cols
    if pd.isnull(fin_type2) and row_id == 333:
        # Original note: should be 'Unf', but 'Unf' is recoded into the same
        # bucket later, so 'NA' is used.
        return 'NA'
    return fin_type2
        
df['BsmtFinType2'] = df[['Id','BsmtFinType2']].apply(impute_333,axis=1)

In [34]:
def impute_BsmtFinType2(cols):
    """Recode BsmtFinType2 into two finish labels.

    Args:
        cols: Two-item row ``(Id, BsmtFinType2)``.

    Returns:
        '>=AvBsmtFinT2' for 'GLQ'/'ALQ'/'Rec'; 'Unf-BdBsmtFinT2' for any
        other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, fin_type2 = cols
    if not pd.isnull(fin_type2) and fin_type2 in ('GLQ', 'ALQ', 'Rec'):
        return '>=AvBsmtFinT2'
    return 'Unf-BdBsmtFinT2'
        
df['BsmtFinType2'] = df[['Id','BsmtFinType2']].apply(impute_BsmtFinType2,axis=1)

### g. BsmtFinSF2

In [35]:
# Make sure to delete:
df.drop('BsmtFinSF2',axis=1,inplace=True)

**Exclude and use BsmtUnfSF instead because BsmtFinSF2 has many zeroes.**

### h. BsmtUnfSF

In [36]:
df.drop('BsmtUnfSF',axis=1,inplace=True)

### i. TotalBsmtSF (CAT)

In [37]:
def impute_TotalBsmtSF(cols):
    """Bin TotalBsmtSF into four size categories.

    Args:
        cols: Two-item row ``(Id, TotalBsmtSF)``.

    Returns:
        'Bsmt:<800SF' below 800 and for missing values,
        'Bsmt:800-<1000SF' for 800 up to (not including) 1000,
        'Bsmt:1000-<1300SF' for 1000 up to (not including) 1300, and
        'Bsmt:>1300SF' for 1300 and above (exactly 1300 lands here despite
        the '>' in the label).
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bsmt_sf = cols
    if pd.isnull(bsmt_sf) or bsmt_sf < 800:
        return 'Bsmt:<800SF'
    if bsmt_sf < 1000:
        return 'Bsmt:800-<1000SF'
    if bsmt_sf < 1300:
        return 'Bsmt:1000-<1300SF'
    return 'Bsmt:>1300SF'
        
df['TotalBsmtSF'] = df[['Id','TotalBsmtSF']].apply(impute_TotalBsmtSF,axis=1)

## 3. Floor

### a. 1stFlrSF and 2ndFlrSF (CONT; new feature, 'TotFlrSF')

In [38]:
def impute_1stFlrSF(cols):
    """Fill a missing 1stFlrSF with 1163; pass other values through.

    Args:
        cols: Two-item row ``(Id, 1stFlrSF)``.

    Returns:
        1163 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, first_flr_sf = cols
    # 1163 was recorded by the original author as the training-set mean of
    # the non-zero observations.
    return 1163 if pd.isnull(first_flr_sf) else first_flr_sf
        
df['1stFlrSF'] = df[['Id','1stFlrSF']].apply(impute_1stFlrSF,axis=1)

In [39]:
def impute_2ndFlrSF(cols):
    """Fill a missing 2ndFlrSF with 803; pass other values through.

    Args:
        cols: Two-item row ``(Id, 2ndFlrSF)``.

    Returns:
        803 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, second_flr_sf = cols
    # 803 was recorded by the original author as the training-set mean of
    # the non-zero observations.
    return 803 if pd.isnull(second_flr_sf) else second_flr_sf
        
df['2ndFlrSF'] = df[['Id','2ndFlrSF']].apply(impute_2ndFlrSF,axis=1)

In [40]:
def impute_TotFlrSF(cols):
    """Add first- and second-floor areas into a total floor area.

    Args:
        cols: Three-item row ``(Id, 1stFlrSF, 2ndFlrSF)``.

    Returns:
        The sum of the two floor areas.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, first_flr_sf, second_flr_sf = cols
    return first_flr_sf + second_flr_sf
        
df['TotFlrSF'] = df[['Id','1stFlrSF','2ndFlrSF']].apply(impute_TotFlrSF,axis=1)

In [41]:
# Make sure to drop original features:
df.drop(['1stFlrSF','2ndFlrSF'],axis=1,inplace=True)

### b. LowQualFinSF

In [42]:
df.drop('LowQualFinSF',axis=1,inplace=True)

### c. GrLivArea (CONT)

In [43]:
def impute_GrLivArea(cols):
    """Fill a missing GrLivArea with 1515; pass other values through.

    Args:
        cols: Two-item row ``(Id, GrLivArea)``.

    Returns:
        1515 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, gr_liv_area = cols
    # 1515 was recorded by the original author as the training-set mean of
    # the non-zero observations.
    return 1515 if pd.isnull(gr_liv_area) else gr_liv_area
        
df['GrLivArea'] = df[['Id','GrLivArea']].apply(impute_GrLivArea,axis=1)

### d. TotRmsAbvGrd (CONT)

In [44]:
def impute_TotRmsAbvGrd(cols):
    """Fill a missing TotRmsAbvGrd with 6.5; pass other values through.

    Args:
        cols: Two-item row ``(Id, TotRmsAbvGrd)``.

    Returns:
        6.5 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, total_rooms = cols
    # 6.5 was recorded by the original author as the training-set mean of
    # the non-zero observations.
    return 6.5 if pd.isnull(total_rooms) else total_rooms
        
df['TotRmsAbvGrd'] = df[['Id','TotRmsAbvGrd']].apply(impute_TotRmsAbvGrd,axis=1)

## 4. Bathrooms (CAT; new feature, 'NumBaths')

In [45]:
def impute_BsmtFullBath(cols):
    """Fill a missing BsmtFullBath with 0; pass other values through.

    Args:
        cols: Two-item row ``(Id, BsmtFullBath)``.

    Returns:
        0 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bsmt_full_bath = cols
    return 0 if pd.isnull(bsmt_full_bath) else bsmt_full_bath
        
df['BsmtFullBath'] = df[['Id','BsmtFullBath']].apply(impute_BsmtFullBath,axis=1)

In [46]:
def impute_BsmtHalfBath(cols):
    """Fill a missing BsmtHalfBath with 0; pass other values through.

    Args:
        cols: Two-item row ``(Id, BsmtHalfBath)``.

    Returns:
        0 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bsmt_half_bath = cols
    return 0 if pd.isnull(bsmt_half_bath) else bsmt_half_bath
        
df['BsmtHalfBath'] = df[['Id','BsmtHalfBath']].apply(impute_BsmtHalfBath,axis=1)

In [47]:
def impute_FullBath(cols):
    """Fill a missing FullBath with 2; pass other values through.

    Args:
        cols: Two-item row ``(Id, FullBath)``.

    Returns:
        2 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, full_bath = cols
    return 2 if pd.isnull(full_bath) else full_bath
        
df['FullBath'] = df[['Id','FullBath']].apply(impute_FullBath,axis=1)

In [48]:
def impute_HalfBath(cols):
    """Fill a missing HalfBath with 0; pass other values through.

    Args:
        cols: Two-item row ``(Id, HalfBath)``.

    Returns:
        0 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, half_bath = cols
    return 0 if pd.isnull(half_bath) else half_bath
        
df['HalfBath'] = df[['Id','HalfBath']].apply(impute_HalfBath,axis=1)

In [49]:
def impute_Baths(cols):
    """Combine the four bath counts into one categorical bath label.

    Half baths count as 0.5 each; basement and above-grade baths are summed.

    Args:
        cols: Five-item row
            ``(Id, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath)``.

    Returns:
        '1-1.5 Bath' for a total of 1.5 or less, '2 Bath' for exactly 2,
        '2.5 Bath' for exactly 2.5, '3+ Bath' above 2.5. Any other total
        (for example NaN) yields None.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bsmt_full, bsmt_half, full, half = cols
    total = bsmt_full + 0.5 * bsmt_half + full + 0.5 * half

    if total <= 1.5:
        return '1-1.5 Bath'
    if total == 2:
        return '2 Bath'
    if total == 2.5:
        return '2.5 Bath'
    if total > 2.5:
        return '3+ Bath'
    # Reached only when the total matches none of the buckets above.
    return None
        
df['NumBaths'] = df[['Id','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath']].apply(impute_Baths,axis=1)

In [50]:
# Make sure to delete original features:
df.drop(['BsmtFullBath','BsmtHalfBath','FullBath','HalfBath'],axis=1,inplace=True)

## 5. Bedrooms (CAT)

In [51]:
def impute_BedroomAbvGr(cols):
    """Bucket BedroomAbvGr around three bedrooms.

    Args:
        cols: Two-item row ``(Id, BedroomAbvGr)``.

    Returns:
        '<3 Bedr' below 3 and for missing values, '=3 Bedr' for exactly 3,
        '>3 Bedr' otherwise.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, bedrooms = cols
    if pd.isnull(bedrooms) or bedrooms < 3:
        return '<3 Bedr'
    return '=3 Bedr' if bedrooms == 3 else '>3 Bedr'
        
df['BedroomAbvGr'] = df[['Id','BedroomAbvGr']].apply(impute_BedroomAbvGr,axis=1)

## 6. Kitchen

### a. KitchenAbvGr (CAT)

In [52]:
def impute_KitchenAbvGr(cols):
    """Recode KitchenAbvGr as one kitchen or more than one.

    Args:
        cols: Two-item row ``(Id, KitchenAbvGr)``.

    Returns:
        '1Ktchn' for counts 0 and 1 and for missing values; '>1Ktchn'
        for any other value.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, kitchens = cols
    # Missing and zero counts are both folded into the one-kitchen bucket.
    if pd.isnull(kitchens) or kitchens in (0, 1):
        return '1Ktchn'
    return '>1Ktchn'
        
df['KitchenAbvGr'] = df[['Id','KitchenAbvGr']].apply(impute_KitchenAbvGr,axis=1)

### b. KitchenQual (CAT)

In [53]:
def impute_KitchenQual(cols):
    """Recode KitchenQual into average / good / excellent labels.

    Args:
        cols: Two-item row ``(Id, KitchenQual)``.

    Returns:
        'AvKtchnQ' for 'Fa'/'Po'/'TA' and for missing values, 'GdKtchnQ'
        for 'Gd', and 'ExKtchnQ' for every other value.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, kitchen_qual = cols
    # Missing observations are coded as typical/average.
    if pd.isnull(kitchen_qual) or kitchen_qual in ('Fa', 'Po', 'TA'):
        return 'AvKtchnQ'
    return 'GdKtchnQ' if kitchen_qual == 'Gd' else 'ExKtchnQ'
        
df['KitchenQual'] = df[['Id','KitchenQual']].apply(impute_KitchenQual,axis=1)

## 7. Fireplace

### a. Fireplaces (CAT)

In [54]:
def impute_Fireplaces(cols):
    """Recode the Fireplaces count into none / one / more than one.

    Args:
        cols: Two-item row ``(Id, Fireplaces)``.

    Returns:
        'NoFirePl' for 0 and for missing values, '1FirePl' for 1,
        '>1FirePl' for any other value.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, fireplaces = cols
    if pd.isnull(fireplaces) or fireplaces == 0:
        return 'NoFirePl'
    return '1FirePl' if fireplaces == 1 else '>1FirePl'
        
df['Fireplaces'] = df[['Id','Fireplaces']].apply(impute_Fireplaces,axis=1)

### b. FireplaceQu (CAT)

In [55]:
def impute_FireplaceQu(cols):
    """Recode FireplaceQu into good / average / none-or-bad labels.

    Args:
        cols: Two-item row ``(Id, FireplaceQu)``.

    Returns:
        'GdFirePlQ' for 'Ex'/'Gd', 'No-BdFirePlQ' for 'Fa'/'Po'/'NA' and
        for missing values, 'AvFirePlQ' for every other value.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, fireplace_qu = cols
    if pd.isnull(fireplace_qu) or fireplace_qu in ('Fa', 'Po', 'NA'):
        return 'No-BdFirePlQ'
    if fireplace_qu in ('Ex', 'Gd'):
        return 'GdFirePlQ'
    return 'AvFirePlQ'
        
df['FireplaceQu'] = df[['Id','FireplaceQu']].apply(impute_FireplaceQu,axis=1)

## 8. Garage

### a. GarageArea

In [56]:
df.drop('GarageArea',axis=1,inplace=True)

### b. GarageType (CAT)

In [57]:
def impute_GarageType(cols):
    """Recode GarageType into attached / detached / none-or-other.

    Args:
        cols: Two-item row ``(Id, GarageType)``.

    Returns:
        'AttGrgT' for 'Attchd', 'DetGrgT' for 'Detchd', and 'No-OthGrgT'
        for any other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, garage_type = cols
    if pd.isnull(garage_type):
        return 'No-OthGrgT'
    if garage_type == 'Attchd':
        return 'AttGrgT'
    return 'DetGrgT' if garage_type == 'Detchd' else 'No-OthGrgT'
        
df['GarageType'] = df[['Id','GarageType']].apply(impute_GarageType,axis=1)

### c. GarageYrBlt

In [58]:
df.drop('GarageYrBlt',axis=1,inplace=True)

### d. GarageFinish (CAT)

In [59]:
def impute_GarageFinish(cols):
    """Recode GarageFinish into finished / rough / none-or-unfinished.

    Args:
        cols: Two-item row ``(Id, GarageFinish)``.

    Returns:
        'FinGrgF' for 'Fin', 'RghGrgF' for 'RFn', and 'No-UnfGrgF' for any
        other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, garage_finish = cols
    if pd.isnull(garage_finish):
        return 'No-UnfGrgF'
    if garage_finish == 'Fin':
        return 'FinGrgF'
    return 'RghGrgF' if garage_finish == 'RFn' else 'No-UnfGrgF'
        
df['GarageFinish'] = df[['Id','GarageFinish']].apply(impute_GarageFinish,axis=1)

### e. GarageCars (CAT)

In [60]:
def impute_GarageCars(cols):
    """Recode the GarageCars count into four capacity labels.

    Args:
        cols: Two-item row ``(Id, GarageCars)``.

    Returns:
        'NoCar' for 0 and for missing values, '1Car' for 1, '2Car' for 2,
        '>2Car' for any other value.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, garage_cars = cols
    if pd.isnull(garage_cars) or garage_cars == 0:
        return 'NoCar'
    if garage_cars == 1:
        return '1Car'
    return '2Car' if garage_cars == 2 else '>2Car'
        
df['GarageCars'] = df[['Id','GarageCars']].apply(impute_GarageCars,axis=1)

### f. GarageQual (CAT)

In [61]:
def impute_GarageQual(cols):
    """Recode GarageQual as at-least-average or none/below-average.

    Args:
        cols: Two-item row ``(Id, GarageQual)``.

    Returns:
        '>=AvGrgQ' for 'TA'/'Gd'/'Ex'; 'No-<AveGrgQ' for any other value
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, garage_qual = cols
    if not pd.isnull(garage_qual) and garage_qual in ('TA', 'Gd', 'Ex'):
        return '>=AvGrgQ'
    return 'No-<AveGrgQ'
        
df['GarageQual'] = df[['Id','GarageQual']].apply(impute_GarageQual,axis=1)

### g. GarageCond (CAT)

In [62]:
def impute_GarageCond(cols):
    """Recode GarageCond as at-least-average or none/below-average.

    Args:
        cols: Two-item row ``(Id, GarageCond)``.

    Returns:
        '>=AvGrgC' for 'TA'/'Gd'/'Ex'; 'No-<AvGrgC' for any other value
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, garage_cond = cols
    if not pd.isnull(garage_cond) and garage_cond in ('TA', 'Gd', 'Ex'):
        return '>=AvGrgC'
    return 'No-<AvGrgC'
        
df['GarageCond'] = df[['Id','GarageCond']].apply(impute_GarageCond,axis=1)

### h. PavedDrive (CAT)

In [63]:
def impute_PavedDrive(cols):
    """Recode PavedDrive as 'PaveDr' or 'NPaveDr'.

    Args:
        cols: Two-item row ``(Id, PavedDrive)``.

    Returns:
        'PaveDr' when the value is 'Y'; 'NPaveDr' for any other value and
        for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, paved_drive = cols
    if not pd.isnull(paved_drive) and paved_drive == 'Y':
        return 'PaveDr'
    return 'NPaveDr'
        
df['PavedDrive'] = df[['Id','PavedDrive']].apply(impute_PavedDrive,axis=1)

# HOUSE EXTERIOR

## 1. General

### a. RoofStyle (CAT)

In [64]:
def impute_RoofStyle(cols):
    """Recode RoofStyle as gable or other.

    Args:
        cols: Two-item row ``(Id, RoofStyle)``.

    Returns:
        'GblRoofS' when the value is 'Gable'; 'OthRoofS' for any other
        value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, roof_style = cols
    if not pd.isnull(roof_style) and roof_style == 'Gable':
        return 'GblRoofS'
    return 'OthRoofS'
        
df['RoofStyle'] = df[['Id','RoofStyle']].apply(impute_RoofStyle,axis=1)

### b. RoofMatl

In [65]:
# Make sure to exclude feature:
df.drop('RoofMatl',axis=1,inplace=True)

### c. Exterior1st (CAT) and Exterior2nd

In [66]:
def impute_Exterior1st(cols):
    """Keep the four listed Exterior1st materials; pool the rest.

    Args:
        cols: Two-item row ``(Id, Exterior1st)``.

    Returns:
        The value itself for 'VinylSd', 'MetalSd', 'HdBoard' and 'Wd Sdng';
        'OthExt1' for every other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, exterior = cols
    # Missing values go into the same catch-all bucket as unlisted
    # materials. Returning a separate 'Other' label created a stray
    # category that only rows with missing data would carry.
    if pd.isnull(exterior):
        return 'OthExt1'
    if exterior in ('VinylSd', 'MetalSd', 'HdBoard', 'Wd Sdng'):
        return exterior
    return 'OthExt1'
        
df['Exterior1st'] = df[['Id','Exterior1st']].apply(impute_Exterior1st,axis=1)

In [67]:
df.drop('Exterior2nd',axis=1,inplace=True)

### c. MasVnrType (CAT) and MasVnrArea

In [68]:
def fix_MasVnrType1(cols):
    """Force MasVnrType to 'None' when the veneer area is zero.

    Args:
        cols: Three-item row ``(Id, MasVnrType, MasVnrArea)``.

    Returns:
        'None' when MasVnrArea equals 0 and MasVnrType is anything other
        than 'None' (a missing type included); otherwise MasVnrType
        unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, vnr_type, vnr_area = cols
    if vnr_type != 'None' and vnr_area == 0:
        return 'None'
    return vnr_type
        
df['MasVnrType'] = df[['Id','MasVnrType','MasVnrArea']].apply(fix_MasVnrType1,axis=1)

In [69]:
def fix_MasVnrType2(cols):
    """Zero out MasVnrArea when MasVnrType says there is no veneer.

    Args:
        cols: Three-item row ``(Id, MasVnrType, MasVnrArea)``.

    Returns:
        0 when MasVnrType is 'None' and MasVnrArea is positive; otherwise
        MasVnrArea unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, vnr_type, vnr_area = cols
    if vnr_type == 'None' and vnr_area > 0:
        return 0
    return vnr_area
        
df['MasVnrArea'] = df[['Id','MasVnrType','MasVnrArea']].apply(fix_MasVnrType2,axis=1)

In [70]:
def impute_MasVnrType(cols):
    """Recode MasVnrType into brick / stone / none labels.

    Args:
        cols: Two-item row ``(Id, MasVnrType)``.

    Returns:
        'BrkCFMVT' for 'BrkFace'/'BrkCmn', 'StoneMVT' for 'Stone', and
        'NoneMVT' for any other value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, vnr_type = cols
    if pd.isnull(vnr_type):
        return 'NoneMVT'
    if vnr_type in ('BrkFace', 'BrkCmn'):
        return 'BrkCFMVT'
    return 'StoneMVT' if vnr_type == 'Stone' else 'NoneMVT'
        
df['MasVnrType'] = df[['Id','MasVnrType']].apply(impute_MasVnrType,axis=1)

In [71]:
"""
#Exclude for now because of too many zeroes.
def impute_MasVnrArea(cols):
    Id = cols[0]
    MasVnrArea = cols[1]
    
    if pd.isnull(MasVnrArea):
        return 255 # Impute missing as mean of non-zero observations in training set
    else:
        return MasVnrArea
        
df['MasVnrArea'] = df[['Id','MasVnrArea']].apply(impute_MasVnrArea,axis=1)
"""

"\n#Exclude for now because of too many zeroes.\ndef impute_MasVnrArea(cols):\n    Id = cols[0]\n    MasVnrArea = cols[1]\n    \n    if pd.isnull(MasVnrArea):\n        return 255 # Impute missing as mean of non-zero observations in training set\n    else:\n        return MasVnrArea\n        \ndf['MasVnrArea'] = df[['Id','MasVnrArea']].apply(impute_MasVnrArea,axis=1)\n"

In [72]:
df.drop('MasVnrArea',axis=1,inplace=True)

### d. ExterQual (CAT)

In [73]:
def impute_ExterQual(cols):
    """Recode ExterQual as good or average.

    Args:
        cols: Two-item row ``(Id, ExterQual)``.

    Returns:
        'GdExtQual' for 'Ex'/'Gd'; 'AvExtQual' for any other value and for
        missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, exter_qual = cols
    if not pd.isnull(exter_qual) and exter_qual in ('Ex', 'Gd'):
        return 'GdExtQual'
    return 'AvExtQual'
        
df['ExterQual'] = df[['Id','ExterQual']].apply(impute_ExterQual,axis=1)

### e. ExterCond (CAT)

In [74]:
def impute_ExterCond(cols):
    """Recode ExterCond as good or average.

    Args:
        cols: Two-item row ``(Id, ExterCond)``.

    Returns:
        'GdExtCond' for 'Ex'/'Gd'; 'AvExtCond' for any other value and for
        missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, exter_cond = cols
    if not pd.isnull(exter_cond) and exter_cond in ('Ex', 'Gd'):
        return 'GdExtCond'
    return 'AvExtCond'
        
df['ExterCond'] = df[['Id','ExterCond']].apply(impute_ExterCond,axis=1)

### f. Foundation (CAT)

In [75]:
def impute_Foundation(cols):
    """Pool the listed Foundation types into 'OthFndtn'.

    Args:
        cols: Two-item row ``(Id, Foundation)``.

    Returns:
        'OthFndtn' for 'BrkTil'/'Slab'/'Stone'/'Wood' and for missing
        values; any other value is returned unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, foundation = cols
    if pd.isnull(foundation) or foundation in ('BrkTil', 'Slab', 'Stone', 'Wood'):
        return 'OthFndtn'
    return foundation
        
df['Foundation'] = df[['Id','Foundation']].apply(impute_Foundation,axis=1)

## 2. Lot

### a. LotFrontage

In [76]:
df.drop('LotFrontage',axis=1,inplace=True)

### b. LotArea (CONT)

In [77]:
def impute_LotArea(cols):
    """Fill a missing LotArea with 10157; pass other values through.

    Args:
        cols: Two-item row ``(Id, LotArea)``.

    Returns:
        10157 when the value is missing, otherwise the value unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, lot_area = cols
    # 10157 was recorded by the original author as the training-set mean of
    # the non-zero observations.
    return 10157 if pd.isnull(lot_area) else lot_area
        
df['LotArea'] = df[['Id','LotArea']].apply(impute_LotArea,axis=1)

### c. LotConfig (CAT)

In [78]:
def impute_LotConfig(cols):
    """Pool the listed LotConfig values into 'OthLotCnfg'.

    Args:
        cols: Two-item row ``(Id, LotConfig)``.

    Returns:
        'OthLotCnfg' for 'CulDSac'/'FR2'/'FR3' and for missing values; any
        other value is returned unchanged.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, lot_config = cols
    if pd.isnull(lot_config) or lot_config in ('CulDSac', 'FR2', 'FR3'):
        return 'OthLotCnfg'
    return lot_config
        
df['LotConfig'] = df[['Id','LotConfig']].apply(impute_LotConfig,axis=1)

### d. LandSlope (CAT)

In [79]:
def impute_LandSlope(cols):
    """Recode LandSlope as gentle or other.

    Args:
        cols: Two-item row ``(Id, LandSlope)``.

    Returns:
        'GntlSlope' when the value is 'Gtl'; 'OthSlope' for any other value
        and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, land_slope = cols
    if not pd.isnull(land_slope) and land_slope == 'Gtl':
        return 'GntlSlope'
    return 'OthSlope'
        
df['LandSlope'] = df[['Id','LandSlope']].apply(impute_LandSlope,axis=1)

### e. Street

In [80]:
df.drop('Street',axis=1,inplace=True)

### f. Alley

In [81]:
df.drop('Alley',axis=1,inplace=True)

### g. LotShape (CAT)

In [82]:
def impute_LotShape(cols):
    """Recode LotShape as regular or not regular.

    Args:
        cols: Two-item row ``(Id, LotShape)``.

    Returns:
        'RegShape' when the value is 'Reg'; 'NRegShape' for any other value
        and for missing values (missing is treated as irregular).
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, lot_shape = cols
    if not pd.isnull(lot_shape) and lot_shape == 'Reg':
        return 'RegShape'
    return 'NRegShape'
        
df['LotShape'] = df[['Id','LotShape']].apply(impute_LotShape,axis=1)

### h. LandContour (CAT)

In [83]:
def impute_LandContour(cols):
    """Recode LandContour as level or not level.

    Args:
        cols: Two-item row ``(Id, LandContour)``.

    Returns:
        'Level' when the value is 'Lvl'; 'NLevel' for any other value and
        for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, land_contour = cols
    if not pd.isnull(land_contour) and land_contour == 'Lvl':
        return 'Level'
    return 'NLevel'
        
df['LandContour'] = df[['Id','LandContour']].apply(impute_LandContour,axis=1)

### i. Fence (CAT)

In [84]:
def impute_Fence(cols):
    """Recode Fence as fenced or not fenced.

    Args:
        cols: Two-item row ``(Id, Fence)``.

    Returns:
        'Fence' for 'MnPrv'/'GdPrv'/'GdWo'/'MnWw'; 'NFence' for any other
        value and for missing values.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, fence = cols
    if not pd.isnull(fence) and fence in ('MnPrv', 'GdPrv', 'GdWo', 'MnWw'):
        return 'Fence'
    return 'NFence'
        
df['Fence'] = df[['Id','Fence']].apply(impute_Fence,axis=1)

## 3. Deck

### a. WoodDeckSF (CAT; continuous to categorical)

In [85]:
def impute_WoodDeck(cols):
    """Recode WoodDeckSF as deck present or absent.

    Args:
        cols: Two-item row ``(Id, WoodDeckSF)``.

    Returns:
        'NWoodDeck' when the area is 0 or missing; 'WoodDeck' otherwise.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, deck_sf = cols
    # NaN == 0 is False, so without the null check a missing area would be
    # labelled as having a deck.
    if pd.isnull(deck_sf) or deck_sf == 0:
        return 'NWoodDeck'
    return 'WoodDeck'
        
df['WoodDeckSF'] = df[['Id','WoodDeckSF']].apply(impute_WoodDeck,axis=1)

### b. OpenPorchSF (CAT; continuous to categorical)

In [86]:
def impute_OpenPorch(cols):
    """Recode OpenPorchSF as open porch present or absent.

    Args:
        cols: Two-item row ``(Id, OpenPorchSF)``.

    Returns:
        'NOpenPorch' when the area is 0 or missing; 'OpenPorch' otherwise.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, porch_sf = cols
    # NaN == 0 is False, so without the null check a missing area would be
    # labelled as having a porch.
    if pd.isnull(porch_sf) or porch_sf == 0:
        return 'NOpenPorch'
    return 'OpenPorch'
        
df['OpenPorchSF'] = df[['Id','OpenPorchSF']].apply(impute_OpenPorch,axis=1)

### c. EnclosedPorch (CAT; continuous to categorical)

In [87]:
def impute_EnclosedPorch(cols):
    """Recode EnclosedPorch as enclosed porch present or absent.

    Args:
        cols: Two-item row ``(Id, EnclosedPorch)``.

    Returns:
        'NEnclsdPorch' when the area is 0 or missing; 'EnclsdPorch'
        otherwise.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, porch_sf = cols
    # NaN == 0 is False, so without the null check a missing area would be
    # labelled as having a porch.
    if pd.isnull(porch_sf) or porch_sf == 0:
        return 'NEnclsdPorch'
    return 'EnclsdPorch'
        
df['EnclosedPorch'] = df[['Id','EnclosedPorch']].apply(impute_EnclosedPorch,axis=1)

### d. 3SsnPorch

In [88]:
df.drop('3SsnPorch',axis=1,inplace=True)

### e. ScreenPorch (CAT; continuous to categorical)

In [89]:
def impute_ScreenPorch(cols):
    """Recode ScreenPorch as screen porch present or absent.

    Args:
        cols: Two-item row ``(Id, ScreenPorch)``.

    Returns:
        'NScrnPorch' when the area is 0 or missing; 'ScrnPorch' otherwise.
    """
    # Unpack by iteration: integer keys on a label-indexed row Series are a
    # deprecated positional lookup in pandas >= 2.1.
    _, porch_sf = cols
    # NaN == 0 is False, so without the null check a missing area would be
    # labelled as having a porch.
    if pd.isnull(porch_sf) or porch_sf == 0:
        return 'NScrnPorch'
    return 'ScrnPorch'
        
df['ScreenPorch'] = df[['Id','ScreenPorch']].apply(impute_ScreenPorch,axis=1)

## 4. Pool

### a. PoolArea

In [90]:
df.drop('PoolArea',axis=1,inplace=True)

### b. PoolQC

In [91]:
df.drop('PoolQC',axis=1,inplace=True)

## 5. Other

### a. MiscFeature

### b. MiscVal

In [92]:
df.drop(['MiscFeature','MiscVal'],axis=1,inplace=True)

# SAVE DATA TO DIRECTORY

In [93]:
#df.to_pickle('train_select')
df.to_pickle('./Data/train_select')