In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import GradientBoostingRegressor

# Input files, given relative to the notebook's working directory.
train_data_file = '../input/train.csv'
test_data_file = '../input/test.csv'
type_dictionary = '../input/data_types.csv'

# Cap how many rows / columns of a DataFrame are rendered in cell output.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)

In [2]:
# Build the column-name -> dtype mapping that is passed to pd.read_csv when the
# train and test files are loaded. Every non-empty row of the dictionary file
# supplies the column name in its first field and the dtype in its second.
#
# newline='' is what the csv module asks for when opening a file for a reader,
# so that line breaks embedded in quoted fields are parsed correctly. Blank
# lines (which csv.reader yields as empty lists) are skipped instead of
# raising IndexError on row[0].
with open(type_dictionary, newline='') as f:
    type_dict = {row[0]: row[1] for row in csv.reader(f) if row}

In [3]:
# `train_raw` is the frame exactly as loaded; `train_id` is its Id column and
# `train` is the working frame with that identifier column removed.
train_raw = pd.read_csv(train_data_file, dtype=type_dict)
train_id = train_raw['Id']
train = train_raw.drop(columns='Id')

In [4]:
# `test_raw` is the frame exactly as loaded; `test_id` is its Id column and
# `test` is the working frame with that identifier column removed.
test_raw = pd.read_csv(test_data_file, dtype=type_dict)
test_id = test_raw['Id']
test = test_raw.drop(columns='Id')

In [5]:
def _fill_na_with(series, value):
    """Return ``series`` with its NaNs replaced by ``value``.

    For a categorical column, ``value`` is first registered as a category when
    it is not one already, because filling a categorical with an unknown label
    raises an error.
    """
    if series.dtype.name == 'category' and value not in series.cat.categories:
        series = series.cat.add_categories([value])
    return series.fillna(value)


def fill_missing_data(dfin):
    """Return a copy of ``dfin`` with the missing values of known columns filled.

    Every column is reassigned explicitly (``df[col] = ...``) rather than
    filled with ``inplace=True`` on an attribute or ``.loc`` selection: the
    in-place form acts on a temporary object in current pandas, and the
    ``inplace`` argument of ``cat.add_categories`` no longer exists there.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Frame containing all the columns named below; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and rows.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    df = dfin.copy()

    # Columns whose missing entries get the explicit label 'None'.
    none_label_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                       'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
                       'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond',
                       'BsmtQual', 'MasVnrType']
    for column in none_label_cols:
        df[column] = _fill_na_with(df[column], 'None')

    # Numeric columns whose missing entries are set to zero.
    zero_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
                 'GarageArea', 'GarageCars', 'TotalBsmtSF', 'MasVnrArea']
    for column in zero_cols:
        df[column] = df[column].fillna(0)

    # Columns filled with one fixed label each.
    # NOTE(review): presumably these are the most frequent levels - confirm.
    default_labels = {
        'Exterior1st': 'VinylSd',
        'Exterior2nd': 'VinylSd',
        'Functional': 'Typ',
        'KitchenQual': 'TA',
        'MSZoning': 'RL',
        'SaleType': 'WD',
        'Utilities': 'AllPub',
        'Electrical': 'SBrkr',
    }
    for column, label in default_labels.items():
        df[column] = _fill_na_with(df[column], label)

    # The median is taken from the frame being filled, so it differs between
    # calls on different frames.
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())
    # NOTE(review): 9999 acts as a sentinel year for rows without a value;
    # confirm that feeding it to the later numeric transform is intended.
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(9999)

    return df

In [6]:
def clean_categoricals(dfin):
    """Return a copy of ``dfin`` whose categorical columns use fixed level sets.

    Ordinal columns receive an explicit, ordered list of levels. Nominal
    columns are restricted to a short list of kept levels; any value outside
    that list becomes NaN and, where the list contains 'Other', is then folded
    into 'Other'.

    Columns are reassigned from the result of ``cat.set_categories`` because
    its ``inplace`` argument no longer exists in current pandas.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Frame whose listed columns all have categorical dtype; not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and rows.

    Raises
    ------
    KeyError
        If a listed column is absent.
    AttributeError
        If a listed column is not categorical (it has no ``.cat`` accessor).
    """
    quality = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
    quality_or_none = ['None'] + quality
    finish_type = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
    one_to_ten = [str(score) for score in range(1, 11)]
    exterior = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd',
                'HdBoard', 'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco',
                'VinylSd', 'Wd Sdng', 'WdShing']

    # Ordinal columns: levels listed from lowest to highest.
    ordinal_levels = {
        'BsmtCond': quality_or_none,
        'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
        'BsmtFinType1': finish_type,
        'BsmtFinType2': finish_type,
        'BsmtQual': quality_or_none,
        'Electrical': ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],
        'ExterCond': quality,
        'ExterQual': quality,
        'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
        'FireplaceQu': quality_or_none,
        'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
        'GarageCond': quality_or_none,
        'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
        'GarageQual': quality_or_none,
        'HeatingQC': quality,
        'KitchenQual': quality,
        'LandSlope': ['Sev', 'Mod', 'Gtl'],
        'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
        'OverallCond': one_to_ten,
        'OverallQual': one_to_ten,
        'PavedDrive': ['N', 'P', 'Y'],
        'PoolQC': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
        'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
    }

    # Nominal columns: the levels that are kept.
    # NOTE(review): Exterior1st / Exterior2nd have no 'Other' level, so any raw
    # label not spelled exactly as listed stays NaN - confirm the spellings
    # against the data.
    nominal_levels = {
        'MSSubClass': ['20', '50', '60', 'Other'],
        'MSZoning': ['RL', 'RM', 'Other'],
        'LandContour': ['Lvl', 'Other'],
        'LotConfig': ['Inside', 'Other'],
        'Condition1': ['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'Other'],
        'Condition2': ['Norm', 'Other'],
        'BldgType': ['1Fam', 'Other'],
        'HouseStyle': ['1.5Fin', '1Story', '2Story', 'SLvl', 'Other'],
        'RoofStyle': ['Gable', 'Flat', 'Hip', 'Other'],
        'RoofMatl': ['CompShg', 'Other'],
        'Exterior1st': exterior,
        'Exterior2nd': exterior,
        'Heating': ['Floor', 'GasA', 'Other'],
        'GarageType': ['Attchd', 'Detchd', 'None', 'Other'],
        'SaleCondition': ['Normal', 'Other'],
    }

    df = dfin.copy()

    for column, levels in ordinal_levels.items():
        df[column] = df[column].cat.set_categories(levels, ordered=True)

    for column, levels in nominal_levels.items():
        # `ordered` is deliberately not passed, which leaves the column's
        # existing ordered flag unchanged.
        recoded = df[column].cat.set_categories(levels)
        if 'Other' in levels:
            # Values dropped by set_categories (and pre-existing NaNs) are
            # collected under 'Other'.
            recoded = recoded.fillna('Other')
        df[column] = recoded

    return df

In [7]:
def drop_features(dfin):
    """Return a new frame equal to ``dfin`` minus five hard-coded columns.

    ``dfin`` itself is left untouched. ``DataFrame.drop`` raises ``KeyError``
    if any of the named columns is not present.
    """
    unwanted = ('MiscFeature', 'MiscVal', 'SaleType', 'RoofMatl', 'YearRemodAdd')
    return dfin.drop(columns=list(unwanted))

In [8]:
def apply_power_transform(dfin):
    """Return a copy of ``dfin`` with its float64 columns power-transformed.

    A new ``PowerTransformer`` (default settings) is fitted on the float64
    columns of the frame passed in, and those columns are replaced by the
    transformed values. All other columns are returned unchanged.

    The transformed array is taken from the return value of ``fit_transform``
    and written back by column name, so the result does not depend on the
    transformer mutating its input or on ``dfin`` having a default RangeIndex.

    NOTE(review): the transformer is fitted afresh on every call, so two
    frames (e.g. train and test) are transformed with separately estimated
    parameters - confirm this is intended.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Input frame; not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, index and row order.
    """
    df = dfin.copy()
    float_cols = list(df.columns[(df.dtypes == 'float64').values])

    # Nothing to fit on: return the untouched copy instead of letting the
    # transformer raise on an array with zero features or zero samples.
    if not float_cols or df.empty:
        return df

    pt = PowerTransformer()
    df[float_cols] = pt.fit_transform(df[float_cols].values)
    return df

In [9]:
def all_dummies(df):
    """Encode ``df`` for modelling and return the result as a new frame.

    The listed ordinal columns are replaced by their integer category codes
    (a missing value has code -1). The frame is then passed through
    ``pd.get_dummies`` with ``drop_first=True``, which one-hot encodes the
    remaining non-numeric columns and omits the first level of each.
    ``df`` itself is not modified.
    """
    ordinal_cols = (
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual',
        'Electrical', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu',
        'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'HeatingQC',
        'KitchenQual', 'LandSlope', 'LotShape', 'OverallCond', 'OverallQual',
        'PavedDrive', 'PoolQC', 'Utilities',
    )
    # assign() returns a new frame and keeps each replaced column in place.
    coded = df.assign(**{name: df[name].cat.codes for name in ordinal_cols})
    return pd.get_dummies(coded, drop_first=True)

In [10]:
def transform_data(dfin):
    """Run the whole preprocessing pipeline on a copy of ``dfin``.

    The stages are applied in this order, each receiving the previous stage's
    output: fill_missing_data, clean_categoricals, drop_features,
    apply_power_transform, all_dummies. The final frame is returned.
    """
    stages = (
        fill_missing_data,
        clean_categoricals,
        drop_features,
        apply_power_transform,
        all_dummies,
    )
    result = dfin.copy()
    for stage in stages:
        result = stage(result)
    return result

In [11]:
def fit_predict(regressor, X_train, Y_train, X_test, rname, ids=None, output_dir="../output/"):
    """Fit ``regressor``, predict on ``X_test`` and write a submission CSV.

    The file ``<output_dir>/<rname>.csv`` is written with the columns ``Id``
    and ``SalePrice`` and no index column. ``regressor`` is fitted in place.

    Parameters
    ----------
    regressor : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, Y_train
        Training features and target passed to ``regressor.fit``.
    X_test
        Features passed to ``regressor.predict``.
    rname : str
        Base name (without extension) of the CSV file to write.
    ids : sequence or pandas.Series, optional
        Identifiers for the rows of ``X_test``, cast to int. When omitted,
        the notebook-level ``test_id`` is used, as before.
    output_dir : str, optional
        Directory receiving the CSV; it is created if it does not exist.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not provide os

    if ids is None:
        # Fall back to the global set when the test file was loaded.
        ids = test_id

    regressor.fit(X_train, Y_train)
    Y_pred = regressor.predict(X_test)
    submission = pd.DataFrame({'Id': pd.Series(ids).astype(int), 'SalePrice': Y_pred})

    # to_csv does not create missing directories, so make sure it is there.
    os.makedirs(output_dir, exist_ok=True)
    submission.to_csv(os.path.join(output_dir, rname + ".csv"), index=False)

In [12]:
# Split the training frame by position: the last column is the target and
# every column before it is a predictor.
# NOTE(review): presumably the last column of the training file is SalePrice -
# confirm against the CSV.
Y_train = train.iloc[:, -1]
X_train = train.iloc[:, :-1]

# Both feature frames go through the same preprocessing function.
X_train_clean = transform_data(X_train)
test_clean = transform_data(test)

In [13]:
# Gradient-boosted trees with 500 boosting stages and the least-squares loss.
# The loss is left at the estimator's default instead of being spelled out:
# newer scikit-learn releases reject 'ls' and older ones reject
# 'squared_error', while the default selects that same loss on both.
# random_state pins the fit so that Restart & Run All reproduces the same
# submission file and feature importances.
grad_boost_ls = GradientBoostingRegressor(n_estimators=500, random_state=0)
fit_predict(grad_boost_ls, X_train_clean, Y_train, test_clean, "f_grad_boost_ls_500")

In [14]:
# One row per model input column, indexed and sorted by feature name, holding
# the importance reported by the fitted regressor.
features = (
    pd.DataFrame({
        'Feature': X_train_clean.columns,
        'Importance': grad_boost_ls.feature_importances_,
    })
    .set_index('Feature')
    .sort_index()
)

In [15]:
# Features the model actually used, most important first.
features[features['Importance'] > 0].sort_values('Importance', ascending=False)

Feature,Importance
OverallQual,4.828121e-01
GrLivArea,1.219140e-01
TotalBsmtSF,5.681090e-02
GarageCars,4.577087e-02
ExterQual,3.974931e-02
2ndFlrSF,3.715006e-02
BsmtFinSF1,3.610363e-02
KitchenQual,2.437392e-02
1stFlrSF,1.853468e-02
LotArea,1.726691e-02


In [16]:
# Names of the features whose importance is exactly zero.
features[features['Importance'].eq(0)].index

Index(['Alley_None', 'Exterior1st_CBlock', 'Exterior1st_ImStucc',
       'Exterior2nd_BrkComm', 'Exterior2nd_CBlock', 'Exterior2nd_CemntBd',
       'Exterior2nd_ImStucc', 'Exterior2nd_Stone', 'Exterior2nd_WdShing',
       'Foundation_Slab', 'Foundation_Stone', 'GarageType_None',
       'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_NPkVill'],
      dtype='object', name='Feature')