In [1]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge


# Display limits for rendered frames: at most 20 rows and 100 columns.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)

# Input files, given relative to the notebook's working directory.
train_data_file = '../input/train.csv'
test_data_file = '../input/test.csv'
type_dictionary = '../input/data_types.csv'

In [2]:
# Build the column-name -> dtype mapping that is passed to pd.read_csv as `dtype`.
# Each row of the dictionary file holds the column name in its first field and
# the dtype in its second.
# newline='' is how the csv module expects files to be opened, and blank lines
# (which csv.reader yields as empty lists) are skipped instead of raising
# IndexError. Rows that are present but too short still fail loudly.
with open(type_dictionary, newline='') as f:
    type_dict = {row[0]: row[1] for row in csv.reader(f) if row}

In [3]:
# Read the training file once. `train_raw` keeps every column (Id included),
# `train_id` holds the Id column, and `train` is the Id-free working frame.
train_raw = pd.read_csv(train_data_file, dtype=type_dict)
train_id = train_raw['Id']
train = train_raw.drop(columns='Id')

In [4]:
# Read the test file once. `test_raw` keeps every column (Id included),
# `test_id` holds the Id column (used when writing submissions), and `test`
# is the Id-free working frame.
test_raw = pd.read_csv(test_data_file, dtype=type_dict)
test_id = test_raw['Id']
test = test_raw.drop(columns='Id')

In [5]:
def fill_missing_data(dfin):
    """Return a copy of ``dfin`` with the known missing-value columns imputed.

    Imputation rules:

    * a fixed list of categorical columns, plus ``MasVnrType``, get the
      level ``'None'``;
    * a fixed list of numeric columns get ``0``;
    * eight categorical columns get a hard-coded level each;
    * ``LotFrontage`` gets the median of the frame that was passed in, so
      frames transformed separately (e.g. train and test) are filled with
      different values;
    * ``GarageYrBlt`` gets the sentinel ``9999``.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Frame containing every column named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns filled.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    df = dfin.copy()

    def _fill(column, value):
        # A categorical column rejects fill values that are not registered
        # categories, so register the value first when needed. The result is
        # assigned back to the frame: the former `inplace=True` calls on
        # `df.loc[:, column]` act on an intermediate object, and the `inplace`
        # option of the `.cat` methods no longer exists in pandas 2.
        series = df[column]
        if (isinstance(series.dtype, pd.CategoricalDtype)
                and value not in series.cat.categories):
            series = series.cat.add_categories([value])
        df[column] = series.fillna(value)

    categoricals = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                    'GarageCond', 'GarageType', 'GarageFinish', 'GarageQual',
                    'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond',
                    'BsmtQual']
    for column in categoricals:
        _fill(column, 'None')
    _fill('MasVnrType', 'None')

    floats = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath',
              'BsmtUnfSF', 'GarageArea', 'GarageCars', 'TotalBsmtSF',
              'MasVnrArea']
    for column in floats:
        _fill(column, 0)

    # NOTE(review): these look like the most frequent level of each column,
    # but that cannot be confirmed from this notebook -- verify against the data.
    fixed_levels = {
        'Exterior1st': 'VinylSd',
        'Exterior2nd': 'VinylSd',
        'Functional': 'Typ',
        'KitchenQual': 'TA',
        'MSZoning': 'RL',
        'SaleType': 'WD',
        'Utilities': 'AllPub',
        'Electrical': 'SBrkr',
    }
    for column, level in fixed_levels.items():
        _fill(column, level)

    _fill('LotFrontage', df['LotFrontage'].median())
    # 9999 is a sentinel for a missing garage year; it enters the models as an
    # ordinary numeric value.
    _fill('GarageYrBlt', 9999)

    return df

In [6]:
def clean_categoricals(dfin):
    """Return a copy of ``dfin`` whose categorical columns use fixed category sets.

    Two groups of columns are handled:

    * ordinal columns get an explicit worst-to-best category order and are
      marked ``ordered=True``;
    * nominal columns get an explicit category list only, so the set of
      levels no longer depends on which values happen to occur in the frame.

    Any value that is not in the list given for its column becomes NaN (this
    is how ``set_categories`` treats removed categories).

    Parameters
    ----------
    dfin : pandas.DataFrame
        Frame in which every column named below has categorical dtype.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the category sets applied.
    """
    df = dfin.copy()

    quality = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
    quality_or_none = ['None'] + quality
    basement_finish = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
    one_to_ten = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    exterior = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd',
                'HdBoard', 'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco',
                'VinylSd', 'Wd Sdng', 'WdShing']

    ordinal_levels = {
        'BsmtCond': quality_or_none,
        'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],
        'BsmtFinType1': basement_finish,
        'BsmtFinType2': basement_finish,
        'BsmtQual': quality_or_none,
        'Electrical': ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],
        'ExterCond': quality,
        'ExterQual': quality,
        'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
        'FireplaceQu': quality_or_none,
        'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
        'GarageCond': quality_or_none,
        'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],
        'GarageQual': quality_or_none,
        'HeatingQC': quality,
        'KitchenQual': quality,
        'LandSlope': ['Sev', 'Mod', 'Gtl'],
        'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
        'OverallCond': one_to_ten,
        'OverallQual': one_to_ten,
        'PavedDrive': ['N', 'P', 'Y'],
        'PoolQC': ['None', 'Fa', 'TA', 'Gd', 'Ex'],
        'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
    }

    # NOTE(review): MiscFeature is filled with 'None' by fill_missing_data, but
    # 'None' is not in its list here, so those rows revert to NaN. Confirm this
    # is intended. Also verify the Exterior1st/Exterior2nd spellings against the
    # data: any unlisted spelling silently becomes NaN.
    nominal_levels = {
        'Condition2': ['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn',
                       'RRNn'],
        'Exterior1st': exterior,
        'Exterior2nd': exterior,
        'Heating': ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'],
        'HouseStyle': ['1.5Fin', '1.5Unf', '1Story', '2.5Fin', '2.5Unf', '2Story',
                       'SFoyer', 'SLvl'],
        'MiscFeature': ['Gar2', 'Othr', 'Shed', 'TenC'],
        'RoofMatl': ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv',
                     'WdShake', 'WdShngl'],
        'MSSubClass': ['120', '150', '160', '180', '190', '20', '30', '40', '45',
                       '50', '60', '70', '75', '80', '85', '90'],
    }

    # `set_categories` returns a new Series; its `inplace` option was removed
    # in pandas 2.0, so the result is assigned back explicitly.
    for column, levels in ordinal_levels.items():
        df[column] = df[column].cat.set_categories(levels, ordered=True)
    for column, levels in nominal_levels.items():
        # No `ordered` argument: the column keeps whatever orderedness it had.
        df[column] = df[column].cat.set_categories(levels)

    return df

In [7]:
def all_dummies(df):
    """Encode every categorical column of ``df`` numerically.

    The listed ordinal columns are replaced by their integer category codes
    (position of the value within the column's categories). Every remaining
    categorical/object column is one-hot encoded by ``pd.get_dummies`` with
    the first level dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which each ordinal column listed below has categorical
        dtype. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, fully encoded frame.
    """
    dummies = df.copy()
    ordinals = ['BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual',
                'Electrical', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu',
                'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'HeatingQC',
                'KitchenQual', 'LandSlope', 'LotShape', 'OverallCond', 'OverallQual',
                'PavedDrive', 'PoolQC', 'Utilities']
    for col in ordinals:
        # Whole-column assignment swaps the categorical column for its integer
        # codes. `dummies.loc[:, col] = ...` would instead try to write the
        # codes into the existing categorical column.
        # Missing values are encoded as -1 by `.cat.codes`.
        dummies[col] = dummies[col].cat.codes
    return pd.get_dummies(dummies, drop_first=True)

In [8]:
def transform_data(dfin):
    """Run the full preprocessing pipeline on a copy of ``dfin``.

    The stages are applied in order: ``fill_missing_data``, then
    ``clean_categoricals``, then ``all_dummies``. The result of the last
    stage is returned; ``dfin`` itself is not modified.
    """
    pipeline = (fill_missing_data, clean_categoricals, all_dummies)
    frame = dfin.copy()
    for stage in pipeline:
        frame = stage(frame)
    return frame

In [9]:
def fit_predict(regressor, X_train, Y_train, X_test, rname, ids=None, output_dir="../output"):
    """Fit ``regressor``, predict on ``X_test`` and write a submission CSV.

    The CSV has two columns, ``Id`` and ``SalePrice``, and is written to
    ``<output_dir>/<rname>.csv`` without an index column.

    Parameters
    ----------
    regressor : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    X_train, Y_train :
        Training features and target passed to ``regressor.fit``.
    X_test :
        Features passed to ``regressor.predict``.
    rname : str
        File name (without extension) of the submission.
    ids : array-like, optional
        Row identifiers for the ``Id`` column, one per row of ``X_test``.
        Defaults to the notebook-level ``test_id``.
    output_dir : str, optional
        Directory the file is written to; created if it does not exist.

    Returns
    -------
    None
    """
    regressor.fit(X_train, Y_train)
    Y_pred = regressor.predict(X_test)

    if ids is None:
        # Fall back to the Id column captured when the test file was loaded.
        ids = test_id
    # Plain array of ints so the frame gets a default index regardless of the
    # index carried by `ids`.
    id_values = pd.Series(ids).astype(int).to_numpy()
    submission = pd.DataFrame({'Id': id_values, 'SalePrice': Y_pred})

    # Writing into a missing directory would otherwise fail after the model
    # has already been trained.
    os.makedirs(output_dir, exist_ok=True)
    submission.to_csv(os.path.join(output_dir, rname + ".csv"), index=False)

In [10]:
# Split the training frame: the last column is used as the target and every
# column before it as a feature.
X_train = train.iloc[:, :-1]
Y_train = train.iloc[:, -1]

X_train_clean = transform_data(X_train)

# Train and test are encoded independently, so nothing guarantees that they end
# up with the same dummy columns in the same order. Align the test matrix to
# the training columns: columns missing from test are added as all-zero, and
# columns unknown to the training data are dropped.
test_clean = transform_data(test).reindex(columns=X_train_clean.columns, fill_value=0)

In [11]:
# Ordinary least squares on every encoded feature.
lin_reg_all = LinearRegression()
fit_predict(
    regressor=lin_reg_all,
    X_train=X_train_clean,
    Y_train=Y_train,
    X_test=test_clean,
    rname="lin_reg_all",
)

In [12]:
# Ordinary least squares restricted to four hand-picked columns.
top_predictors = ["GrLivArea", "GarageArea", "FullBath", "YearBuilt"]
lin_reg_top = LinearRegression()
fit_predict(
    lin_reg_top,
    X_train_clean.loc[:, top_predictors],
    Y_train,
    test_clean.loc[:, top_predictors],
    "lin_reg_top",
)

In [13]:
# Ridge regression with the estimator's default settings.
ridge_reg = Ridge()
fit_predict(
    regressor=ridge_reg,
    X_train=X_train_clean,
    Y_train=Y_train,
    X_test=test_clean,
    rname="ridge_reg",
)

In [14]:
# Lasso with a raised iteration cap. `max_iter` must be an integer: the float
# literal 1e5 is rejected by scikit-learn's parameter validation.
lasso_reg = Lasso(max_iter=100000)
fit_predict(lasso_reg, X_train_clean, Y_train, test_clean, "lasso_reg")

In [15]:
# Elastic net with a raised iteration cap. `max_iter` must be an integer: the
# float literal 1e6 is rejected by scikit-learn's parameter validation.
elastic_net = ElasticNet(max_iter=1000000)
# NOTE(review): the output name is spelled "elsatic_net". It is kept as-is so
# the written file name does not change; rename deliberately if nothing
# downstream relies on it.
fit_predict(elastic_net, X_train_clean, Y_train, test_clean, "elsatic_net")

In [16]:
# Random forest of 100 trees using all cores. The fixed seed makes the
# bootstrap samples and feature draws, and hence the submission, reproducible
# across kernel restarts.
random_forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
fit_predict(random_forest, X_train_clean, Y_train, test_clean, "random_forest")

In [17]:
# Gradient boosting with the least-squares loss. The loss is left at its
# default on purpose: the default is the squared-error loss in every
# scikit-learn release, whereas its explicit name changed from 'ls' to
# 'squared_error' (and 'ls' was later removed). The seed keeps reruns
# reproducible.
grad_boost_ls = GradientBoostingRegressor(n_estimators=200, random_state=42)
fit_predict(grad_boost_ls, X_train_clean, Y_train, test_clean, "grad_boost_ls")

In [18]:
# Pair each training column with the importance the least-squares gradient
# boosting model assigned to it.
feature_names = X_train_clean.columns
importances = grad_boost_ls.feature_importances_
features = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances,
})

In [19]:
# Show only the features the model actually used, most important first.
has_importance = features['Importance'] > 0
features[has_importance].sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
26,GrLivArea,0.078189
5,OverallQual,0.062102
19,BsmtUnfSF,0.053337
1,LotArea,0.046231
41,GarageArea,0.044488
20,TotalBsmtSF,0.043027
16,BsmtFinSF1,0.042382
6,OverallCond,0.034319
7,YearBuilt,0.032905
23,1stFlrSF,0.029153


In [20]:
# Gradient boosting with the least-absolute-deviation loss. scikit-learn 1.0
# renamed this loss from 'lad' to 'absolute_error' and later removed the old
# name, so pick the spelling that matches the installed version.
lad_loss = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'lad'
grad_boost_lad = GradientBoostingRegressor(loss=lad_loss, n_estimators=200, random_state=42)
fit_predict(grad_boost_lad, X_train_clean, Y_train, test_clean, "grad_boost_lad")

In [21]:
# AdaBoost with 200 estimators. The fixed seed makes the resampling done at
# each boosting round, and hence the submission, reproducible.
ada_boost = AdaBoostRegressor(n_estimators=200, random_state=42)
fit_predict(ada_boost, X_train_clean, Y_train, test_clean, "ada_boost")