In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
% matplotlib inline

In [3]:
# Read the test CSV (path is relative to the notebook's working directory)
# into test_df, then preview the first rows.
test_df = pd.read_csv('../data/test.csv')
test_df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [4]:
# (rows, columns) of the frame as loaded, before any cleaning.
test_df.shape

(879, 80)

In [5]:
# Normalise every column label to snake_case: lower-case it and replace
# spaces with underscores (e.g. 'Lot Frontage' -> 'lot_frontage').
test_df.columns = [label.lower().replace(' ', '_') for label in test_df.columns]

In [6]:
def series_of_null_features(df):
    """Return the null count of every column that has at least one null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.Series
        Indexed by column name, holding the number of missing values in that
        column. Columns without missing values are left out, so the result is
        empty when the frame has no nulls.
    """
    # Count once and filter the counts, instead of building the null mask a
    # second time on a column subset. This also keeps the integer dtype when
    # no column qualifies.
    null_counts = df.isnull().sum()
    return null_counts[null_counts > 0]

___
## Cleaning


### What to do with null values
 * Drop mas\_vnr\_*
 * Drop garage\_yr\_blt
 * use median for lot\_frontage
 * All other floats use 0 for na
 * Ignore all other cases


In [7]:
# Null counts per column, taken before any imputation in this notebook.
series_of_null_features(test_df)

lot_frontage      160
alley             821
mas_vnr_type        1
mas_vnr_area        1
bsmt_qual          25
bsmt_cond          25
bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_yr_blt      45
garage_finish      45
garage_qual        45
garage_cond        45
pool_qc           875
fence             707
misc_feature      838
dtype: int64

In [8]:
# Sanity check: list rows where bsmt_exposure is missing although bsmt_cond
# is present, together with the other basement columns.
test_df.loc[
    test_df.bsmt_exposure.isnull() & test_df.bsmt_cond.notnull(),
    ['bsmt_exposure', 'bsmt_cond', 'bsmt_qual', 'bsmtfin_type_1', 'bsmtfin_sf_1',
     'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
     'bsmt_full_bath', 'bsmt_half_bath']
]

Unnamed: 0,bsmt_exposure,bsmt_cond,bsmt_qual,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,bsmt_full_bath,bsmt_half_bath


In [9]:
# List rows where bsmtfin_sf_1 is missing, together with the other basement
# columns. Missing basement areas are imputed with 0 further down.
# Note: bsmt_unf_sf and total_bsmt_sf are requested twice, so each of them
# shows up twice in the displayed frame.
test_df.loc[
    test_df.bsmtfin_sf_1.isnull(),
    ['bsmt_exposure', 'bsmt_cond', 'bsmt_qual', 'bsmtfin_type_1', 'bsmtfin_sf_1',
     'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
     'bsmt_full_bath', 'bsmt_half_bath', 'bsmt_unf_sf', 'total_bsmt_sf']
]

Unnamed: 0,bsmt_exposure,bsmt_cond,bsmt_qual,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,bsmt_full_bath,bsmt_half_bath,bsmt_unf_sf.1,total_bsmt_sf.1


In [10]:
# List rows where bsmt_full_bath is missing, together with the other basement
# columns. Missing basement bath counts are imputed with 0 further down.
test_df.loc[
    test_df.bsmt_full_bath.isnull(),
    ['bsmt_exposure', 'bsmt_cond', 'bsmt_qual', 'bsmtfin_type_1', 'bsmtfin_sf_1',
     'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
     'bsmt_full_bath', 'bsmt_half_bath']
]

Unnamed: 0,bsmt_exposure,bsmt_cond,bsmt_qual,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,bsmt_full_bath,bsmt_half_bath


In [11]:
# Basement imputation in a single step instead of one cell per column.
# Numeric basement measurements get 0; categorical basement descriptors get
# the literal string 'NA' (presumably the "no basement" level — confirm
# against the data dictionary).
#
# The filled frame is assigned back to test_df. The per-column form
# `test_df.<col>.fillna(..., inplace=True)` mutates an intermediate Series,
# which pandas warns about and which no longer updates the parent frame under
# copy-on-write.
test_df = test_df.fillna(value={
    'total_bsmt_sf': 0,
    'bsmt_unf_sf': 0,
    'bsmt_full_bath': 0,
    'bsmt_half_bath': 0,
    'bsmt_qual': 'NA',
    'bsmt_cond': 'NA',
    'bsmt_exposure': 'NA',
    'bsmtfin_sf_1': 0,
    'bsmtfin_type_1': 'NA',
    'bsmtfin_sf_2': 0,
    'bsmtfin_type_2': 'NA',
})

In [22]:
# List rows that have a garage_type but no garage_finish, together with the
# other garage columns.
test_df.loc[
    test_df.garage_finish.isnull() & test_df.garage_type.notnull(),
    ['garage_cars', 'garage_area', 'garage_type', 'garage_finish',
     'garage_qual', 'garage_cond']
]

Unnamed: 0,garage_cars,garage_area,garage_type,garage_finish,garage_qual,garage_cond
765,1.0,360.0,Detchd,,,


I think I'm going to impute NaN for garage_type and then do the transformations on it
___

In [23]:
# Garage and alley imputation in a single step instead of one cell per column.
# Numeric garage measurements get 0; the categorical garage descriptors and
# alley get the literal string 'NA' (presumably the "none" level — confirm
# against the data dictionary).
#
# The filled frame is assigned back to test_df. The per-column form
# `test_df.<col>.fillna(..., inplace=True)` mutates an intermediate Series,
# which pandas warns about and which no longer updates the parent frame under
# copy-on-write.
test_df = test_df.fillna(value={
    'garage_area': 0,
    'garage_cars': 0,
    'garage_cond': 'NA',
    'garage_finish': 'NA',
    'garage_qual': 'NA',
    'garage_type': 'NA',
    'alley': 'NA',
})

In [30]:
# Fill missing lot_frontage with the median of the observed values in this
# same frame. The column is assigned back explicitly: calling
# fillna(..., inplace=True) on `test_df.lot_frontage` mutates an intermediate
# Series and is not guaranteed to update test_df (it does nothing under
# copy-on-write).
test_df['lot_frontage'] = test_df['lot_frontage'].fillna(test_df['lot_frontage'].median())

In [31]:
# Remove the masonry-veneer columns and the garage build year from the frame
# instead of imputing them.
test_df.drop(columns=['mas_vnr_type', 'mas_vnr_area', 'garage_yr_blt'], inplace=True)

In [32]:
# Remaining categorical imputations in a single step instead of one cell per
# column. fireplace_qu, pool_qc, fence and misc_feature get the literal
# string 'NA'; electrical gets 'SBrkr' (presumably its most common value —
# TODO confirm).
#
# The filled frame is assigned back to test_df. The per-column form
# `test_df.<col>.fillna(..., inplace=True)` mutates an intermediate Series,
# which pandas warns about and which no longer updates the parent frame under
# copy-on-write.
test_df = test_df.fillna(value={
    'fireplace_qu': 'NA',
    'pool_qc': 'NA',
    'fence': 'NA',
    'misc_feature': 'NA',
    'electrical': 'SBrkr',
})

In [40]:
# Verify the imputation: every column should be free of nulls at this point.
# The assertion makes a Restart & Run All stop here instead of silently
# exporting a frame that still contains missing values.
remaining_nulls = series_of_null_features(test_df)
assert remaining_nulls.empty, (
    "Columns still containing nulls: {}".format(list(remaining_nulls.index))
)
remaining_nulls

Series([], dtype: float64)

# Exporting clean data

In [41]:
# Persist the cleaned frame and preview it.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, and pandas' read_csv treats the string 'NA' as missing by default.
# Whoever reloads this file probably needs index_col=0 and
# keep_default_na=False (or similar) to get the 'NA' category back; confirm
# against the notebook that reads it.
test_df.to_csv('../data/test_CLEAN.csv')
test_df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,68.0,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,68.0,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


## So now there should be no null values

### Feature selection
 * Make dummy variables 
   * Remember that MSSubClass will have a value in the train data that won't be in the test data. Making a dummy variable for it will give our train data a different shape than our test data.
 * Use LassoCV to determine which coefs should be dropped (see kobe lab) 
 * drop those features manually before making the final model
 * We may want to use `np.log1p()` to remove the skew from the target columns. Don't forget to undo this before turning it in though with `np.expm1()`

In [42]:
# Store ms_subclass with object dtype so it is handled as a categorical code
# rather than as a number.
test_df['ms_subclass'] = test_df['ms_subclass'].astype(object)

In [43]:
# Use the id column as the row index from here on.
test_df = test_df.set_index('id')

In [44]:
# Drop the pid column. The axis is given by keyword (`columns=`): passing it
# positionally, as in drop('pid', 1), was deprecated in pandas 1.3 and raises
# a TypeError from pandas 2.0 on.
test_df.drop(columns='pid', inplace=True)

In [45]:
# One-hot encode only ms_subclass (first level dropped) into a separate frame,
# then count how many rows fall into the ms_subclass_80 level.
dum_test_ = pd.get_dummies(data=test_df, columns=['ms_subclass'], drop_first=True)
dum_test_[['ms_subclass_80']].sum()

ms_subclass_80    32
dtype: int64

In [59]:
# Print the name of every object-dtype column, one per line.
for column_name in test_df.select_dtypes(include=['object']).columns:
    print(column_name)

ms_subclass
ms_zoning
street
alley
lot_shape
land_contour
utilities
lot_config
land_slope
neighborhood
condition_1
condition_2
bldg_type
house_style
roof_style
roof_matl
exterior_1st
exterior_2nd
exter_qual
exter_cond
foundation
bsmt_qual
bsmt_cond
bsmt_exposure
bsmtfin_type_1
bsmtfin_type_2
heating
heating_qc
central_air
electrical
kitchen_qual
functional
fireplace_qu
garage_type
garage_finish
garage_qual
garage_cond
paved_drive
pool_qc
fence
misc_feature
sale_type


# Now Export the engineered data to use in the model making notebook

In [None]:
# Export the test frame under a test-specific file name so it cannot
# overwrite a train_ENGINEERED.csv written for the training data.
# NOTE(review): confirm the modelling notebook reads '../data/test_ENGINEERED.csv'.
# NOTE(review): the 'NA' strings filled in above are parsed as missing by
# pandas' read_csv defaults; the reader likely needs keep_default_na=False.
test_df.to_csv('../data/test_ENGINEERED.csv')
test_df.head()

## Now go make a model