## Import Data

In [1]:
# Setup Packages
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

%matplotlib inline

# Display all rows and columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
house_data = pd.read_csv('train.csv')
# house_data = pd.read_csv('test.csv')

remove_outlier = 1  # change to 0 for test data

<IPython.core.display.Javascript object>

## Data Manipulation

### Response Variable - Log Transform
- Transformation validated in DataExploration Notebook

In [33]:
house_data['SalePrice_log'] = np.log(house_data['SalePrice'])

<IPython.core.display.Javascript object>

### Impute Continuous Variables
- Imputation with mean validated in DataExploration Notebook

In [35]:
house_data["LotFrontage"].mean()

70.04995836802642

<IPython.core.display.Javascript object>

In [36]:
# LotFrontage
avg_LotFrontage = house_data['LotFrontage'].mean()
house_data['LotFrontage'].fillna(avg_LotFrontage, inplace=True)

# MasVnrArea
avg_LotFrontage = house_data['MasVnrArea'].mean()
house_data['MasVnrArea'].fillna(avg_LotFrontage, inplace=True)


<IPython.core.display.Javascript object>

### Impute Categorical Variables

In [5]:
#Impute NA values with 'None'
house_data['BsmtQual'].fillna('None', inplace=True)
house_data['BsmtCond'].fillna('None', inplace=True)
house_data['BsmtExposure'].fillna('None', inplace=True)
house_data['BsmtFinType1'].fillna('None', inplace=True)
train_data_cat_var.at[948,'BsmtExposure']='No'

<IPython.core.display.Javascript object>

In [6]:
#Electrical Fill with Mode
house_data['Electrical'].fillna(house_data['Electrical'].mode()[0], inplace=True)

<IPython.core.display.Javascript object>

In [7]:
#Fire Place Quality
house_data['FireplaceQu'].fillna('None', inplace=True)

<IPython.core.display.Javascript object>

In [9]:
#Garage Related Variables
house_data['GarageType'].fillna('None', inplace=True)
house_data['GarageFinish'].fillna('None', inplace=True)
house_data['GarageQual'].fillna('None', inplace=True)

<IPython.core.display.Javascript object>

In [None]:
house_data['PoolQC'].fillna("None", inplace=True)

In [None]:
house_data['Fence'].fillna("None", inplace=True)

In [None]:
house_data['MiscFeature'].fillna("None", inplace=True)

## Feature Engineering

In [10]:
#Total area of house = Above ground living area + basement living area
house_data['TotalSF'] = house_data['GrLivArea'] + house_data['TotalBsmtSF']

<IPython.core.display.Javascript object>

In [11]:
#Age = Year sold - year built
house_data['Age'] = house_data['YrSold'] + house_data['YearBuilt']

<IPython.core.display.Javascript object>

In [12]:
#AgeRemod = Year sold - year remodeled
house_data['Age'] = house_data['YrSold'] + house_data['YearRemodAdd']

<IPython.core.display.Javascript object>

In [13]:
#Total porch SF = OpenPorchSF + EnclosedSF + 3SsnPorch + ScreenPorch
house_data['TotPorchSF'] = house_data['OpenPorchSF'] + house_data['EnclosedPorch'] + house_data['3SsnPorch'] + house_data['ScreenPorch']

<IPython.core.display.Javascript object>

In [14]:
#Total bathrooms = Full bath + 0.5*Halfbath (for both house and basement)
house_data['TotBaths'] = house_data['FullBath'] + house_data['BsmtFullBath'] + 0.5*house_data['HalfBath'] + 0.5*house_data['BsmtHalfBath']

<IPython.core.display.Javascript object>

In [15]:
#Total number of car garage = 1, 2, 3+ cars
house_data['TotCarGarage'] = pd.Series(len(house_data['GarageCars']), index=house_data.index)
house_data['TotCarGarage'] = 0 
house_data.loc[house_data['GarageCars'] >= 3,'TotCarGarage'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


<IPython.core.display.Javascript object>

In [16]:
#Has Second Floor = 0 (no) or 1 (yes)
house_data['Has2ndFl'] = pd.Series(len(house_data['2ndFlrSF']), index=house_data.index)
house_data['Has2ndFl'] = 0 
house_data.loc[house_data['2ndFlrSF'] > 0,'Has2ndFl'] = 1


<IPython.core.display.Javascript object>

In [18]:
#Has Porch = 0 (no) or 1 (yes)
house_data['HasPorch'] = pd.Series(len(house_data['TotPorchSF']), index=house_data.index)
house_data['HasPorch'] = 0 
house_data.loc[house_data['TotPorchSF'] > 0,'HasPorch'] = 1


<IPython.core.display.Javascript object>

In [19]:
#Has Deck = 0 (no) or 1 (yes)
house_data['HasDeck'] = pd.Series(len(house_data['WoodDeckSF']), index=house_data.index)
house_data['HasDeck'] = 0 
house_data.loc[house_data['WoodDeckSF'] > 0,'HasDeck'] = 1

<IPython.core.display.Javascript object>

In [20]:
#Has Pool = 0 (no) or 1 (yes)
house_data['HasPool'] = pd.Series(len(house_data['PoolArea']), index=house_data.index)
house_data['HasPool'] = 0 
house_data.loc[house_data['PoolArea'] > 0,'HasPool'] = 1

<IPython.core.display.Javascript object>

In [21]:
#Has Fireplace = 0 (no), 1 (yes, not excellent quality), or 2 (excellent quality)
house_data['HasFirePlace'] = pd.Series(len(house_data['FireplaceQu']), index=house_data.index)
house_data['HasFirePlace'] = 1 #default all houses have a fireplace 
house_data.loc[house_data['FireplaceQu'].isna(),'HasFirePlace'] = 0 #replace 1 with 0 for 'No Fireplace'
house_data.loc[house_data['FireplaceQu']== 'Ex','HasFirePlace'] = 2 #replace 1 with 2 for 'Excellent Fireplace'

<IPython.core.display.Javascript object>

In [22]:
#Has Gas Heating = 0 (no) or 1 (yes, GasA or GasW)
house_data['HasGasHeating'] = pd.Series(len(house_data['Heating']), index=house_data.index)
house_data['HasGasHeating'] = 0 
house_data.loc[house_data['Heating']== 'GasA','HasGasHeating'] = 1
house_data.loc[house_data['Heating']== 'GasW','HasGasHeating'] = 1

<IPython.core.display.Javascript object>

In [24]:
#Has Modern Circuitbreaker = 0 (no) or 1 (yes)
house_data['HasSBrkr'] = pd.Series(len(house_data['Electrical']), index=house_data.index)
house_data['HasSBrkr'] = 0 
house_data.loc[house_data['Electrical']== 'SBrkr','HasSBrkr'] = 1

<IPython.core.display.Javascript object>

In [25]:
#Has excellent basement quality = 0 (no) or 1 (yes)
house_data['HasExBsmtQual'] = pd.Series(len(house_data['BsmtQual']), index=house_data.index)
house_data['HasExBsmtQual'] = 0 
house_data.loc[house_data['BsmtQual']== 'Ex','HasExBsmtQual'] = 1

<IPython.core.display.Javascript object>

In [26]:
#Has Basement Exposure = 0 (no) or 1 (yes)
house_data['HasGd'] = pd.Series(len(house_data['BsmtExposure']), index=house_data.index)
house_data['HasGd'] = 0 
house_data.loc[house_data['BsmtExposure']== 'Gd','HasGd'] = 1

<IPython.core.display.Javascript object>

In [27]:
#Has Basement with good living quarters = 0 (no) or 1 (yes)
house_data['HasGLQ'] = pd.Series(len(house_data['BsmtFinType1']), index=house_data.index)
house_data['HasGLQ'] = 0 
house_data.loc[house_data['BsmtFinType1']== 'GLQ','HasGLQ'] = 1

<IPython.core.display.Javascript object>

In [28]:
#Has paved driveway = 0 (no) or 1 (yes)
house_data['HasPavedDrive'] = pd.Series(len(house_data['PavedDrive']), index=house_data.index)
house_data['HasPavedDrive'] = 0 
house_data.loc[house_data['PavedDrive']== 'Y','HasPavedDrive'] = 1

<IPython.core.display.Javascript object>

In [None]:
house_data['HasTA'] = pd.Series(len(house_data['GarageQual']), index=house_data.index)
house_data['HasTA'] = 0 
house_data.loc[house_data['GarageQual']== 'TA','HasTA'] = 1

## Take stock of columns 

In [None]:
# house_data has all of the columns
# cat_var_all is subset 
# drop_cat_var_all

In [2]:
cat_var_all= ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
              'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 
              'PoolQC', 'Fence', 'MiscFeature','SaleType', 'SaleCondition', 'SalePrice_log', 'SalePrice']

drop_cat_var_all= ['Heating','Electrical', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'PavedDrive', 'GarageQual']
cat_only_drop=['Fence', 'MiscFeature', 'SaleType']
ordinal_drop=['BsmtFinType2','Functional', 'PoolQC', 'GarageCond', 'SaleCondition']




## One hot encode 

In [None]:
'''
def Diff(li1, li2): 
    li_dif = [i for i in li1 + li2 if i not in li1 or i not in li2] 
    return li_dif 

train_data_cat_var_final= []
cols= Diff(list(train_data_cat_var_final.columns),['SalePrice_log', 'SalePrice'])

house_data_subset = pd.get_dummies(house_data[cols], prefix=cols, columns=cols, drop_first=True)

## Remove Outliers

In [30]:
# # Separate predictors by numerical and categorical features
# from functions_file import categorize

# cat_features, num_cont_features, num_disc_features = categorize(house_data)

# # Determine Outliers of Continuous Data
# def outlier_idx(data, thresh):

#     avg = data.mean()  # calc average
#     stdev = data.std()  # calc standard deviation
#     z_score = (data - avg) / stdev  # calc z_score
#     ol = z_score > thresh  # boolean (True=outlier)
#     outlier_idx = data.index[ol]  # indexes of outliers

#     return list(outlier_idx)


# # Plot Outliers
# plt.figure(figsize=(15, 20))
# for idx, col in enumerate(num_cont_features):
#     outliers = outlier_idx(house_data[col], 6)  # outlier indices
#     plt.subplot(math.ceil(len(num_cont_features) / ncol), ncol, idx + 1)
#     plt.scatter(x=house_data[col], y=house_data["SalePrice_log"], c="blue")
#     plt.scatter(
#         x=house_data[col][outliers], y=house_data["SalePrice_log"][outliers], c="red"
#     )
#     plt.ylabel("SalePrice_log")
#     plt.xlabel(col)

NameError: name 'plt' is not defined

<IPython.core.display.Javascript object>

In [None]:
# vars_outliers = 'LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'GrLivArea'

# outlier_store = {}
# thresh = 6  # outlier > +6std away from mean

# for idx, col in enumerate(num_cont_features):
#     outliers = outlier_idx(house_data[col], thresh)  # outlier indices

#     if len(outliers) > 0:
#         outlier_store[col] = outliers  # only store features with outliers

## Remove Variables

## Consolidate Data Feature Engineered: Train_test

In [42]:
house_data = pd.read_csv('csv')
house_data.shape
house_data2= pd.read_csv('train1_kc.csv')
house_data2.shap

(1460, 23)

In [136]:
jon_rich_test_feat= pd.read_csv('jon_rich_test_feature.csv')
kailun_test_feat= pd.read_csv('test1_kc.csv')

In [138]:
kailun_test_feat.head()

Unnamed: 0.1,Unnamed: 0,HasGasHeating,HasSBrkr,HasEx,HasGd,HasGLQ,HasPavedDrive,HasTA,BsmtCond_Gd,BsmtCond_None,BsmtCond_Po,BsmtCond_TA,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageFinish_None,GarageFinish_RFn,GarageFinish_Unf
0,0,1,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,2,1,1,0,0,1,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0
3,3,1,1,0,0,1,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
4,4,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0


In [111]:
jon_rich_test_feat= jon_rich_test_feat.drop('Unnamed: 0', axis=1)
kailun_test_feat= kailun_test_feat.drop('Unnamed: 0', axis=1)

In [46]:
# This is only for training data 
Id= np.array(house_data['Id'])-1
house_data['Id']= Id
house_data= house_data.drop(['SalePrice', 'SalePrice_log'], axis=1)

In [53]:
train_all=pd.merge(house_data, house_data2, how= 'left', left_on= 'Id', right_on= house_data2.index)

In [139]:
test_feature=pd.merge(jon_rich_test_feat, kailun_test_feat, how= 'left', left_on= jon_rich_test_feat.index, right_on= kailun_test_feat.index)

In [43]:
house_data.columns

Index(['Unnamed: 0', 'Id', 'MSZoning', 'Street', 'Alley', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation'],
      dtype='object')

In [58]:
train_all=train_all.drop(['Unnamed: 0_y', 'Unnamed: 0_x'], axis=1)

In [62]:
test_feature.to_csv('train_all.csv')

In [134]:
test_feature= test_feature.drop('key_0', axis=1)

In [142]:
test_feature=test_feature.drop('key_0', axis=1)
test_feature.head()

Unnamed: 0,Unnamed: 0_x,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,ExterQual,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,GarageYrBlt,MoSold,WoodDeckSF,MiscVal,TotalSF,Age,TotPorchSF,TotBaths,TotCarGarage,Has2ndFl,HasPorch,HasDeck,HasPool,HasIrregularLotShape,HasLandSlope,FoundationType,HasStoneMas,NeighborhoodType,Unnamed: 0_y,HasGasHeating,HasSBrkr,HasEx,HasGd,HasGLQ,HasPavedDrive,HasTA,BsmtCond_Gd,BsmtCond_None,BsmtCond_Po,BsmtCond_TA,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageFinish_None,GarageFinish_RFn,GarageFinish_Unf
0,0,1461,20,80.0,11622,5,6,0.0,3,468,270,882,896,896,2,1,5,0,730,1961,6,140,0,1778,49,120,1.0,0,0,1,1,0,0,0,1,0,1,0,1,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
1,1,1462,20,81.0,14267,6,6,108.0,3,923,406,1329,1329,1329,3,1,6,0,312,1958,6,393,12500,2658,52,36,1.5,0,0,1,1,0,1,0,1,0,1,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,2,1463,60,74.0,13830,5,5,0.0,3,791,137,928,928,1629,3,1,6,1,482,1997,3,212,0,2557,13,34,2.5,0,1,1,1,0,1,0,2,0,1,2,1,1,0,0,1,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0
3,3,1464,60,78.0,9978,6,6,20.0,3,602,324,926,926,1604,3,1,7,1,470,1998,6,360,0,2530,12,36,2.5,0,1,1,1,0,1,0,2,0,1,3,1,1,0,0,1,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
4,4,1465,120,43.0,5005,8,5,0.0,4,263,1017,1280,1280,1280,2,1,5,0,506,1992,1,0,0,2560,18,226,2.0,0,0,1,0,0,1,0,2,0,2,4,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0


In [144]:
test_feature=test_feature.drop('Unnamed: 0_x', axis=1)
test_feature.head()
test_feature=test_feature.drop('Unnamed: 0_y', axis=1)
test_feature.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,ExterQual,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,GarageYrBlt,MoSold,WoodDeckSF,MiscVal,TotalSF,Age,TotPorchSF,TotBaths,TotCarGarage,Has2ndFl,HasPorch,HasDeck,HasPool,HasIrregularLotShape,HasLandSlope,FoundationType,HasStoneMas,NeighborhoodType,HasGasHeating,HasSBrkr,HasEx,HasGd,HasGLQ,HasPavedDrive,HasTA,BsmtCond_Gd,BsmtCond_None,BsmtCond_Po,BsmtCond_TA,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageFinish_None,GarageFinish_RFn,GarageFinish_Unf
0,1461,20,80.0,11622,5,6,0.0,3,468,270,882,896,896,2,1,5,0,730,1961,6,140,0,1778,49,120,1.0,0,0,1,1,0,0,0,1,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
1,1462,20,81.0,14267,6,6,108.0,3,923,406,1329,1329,1329,3,1,6,0,312,1958,6,393,12500,2658,52,36,1.5,0,0,1,1,0,1,0,1,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,1463,60,74.0,13830,5,5,0.0,3,791,137,928,928,1629,3,1,6,1,482,1997,3,212,0,2557,13,34,2.5,0,1,1,1,0,1,0,2,0,1,1,1,0,0,1,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0
3,1464,60,78.0,9978,6,6,20.0,3,602,324,926,926,1604,3,1,7,1,470,1998,6,360,0,2530,12,36,2.5,0,1,1,1,0,1,0,2,0,1,1,1,0,0,1,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
4,1465,120,43.0,5005,8,5,0.0,4,263,1017,1280,1280,1280,2,1,5,0,506,1992,1,0,0,2560,18,226,2.0,0,0,1,0,0,1,0,2,0,2,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0


In [None]:
cont_var_JH= ['Id','LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
'1stFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF',
'MiscVal','TotalSF', 'Age', 'TotPorchSF', 'TotBaths', 'MoSold', 'TotCarGarage']

cat_var_JH= ['Has2ndFl','NeighborhoodType','HasPorch', 'HasDeck', 'HasPool'] 

In [145]:
test_feature.to_csv('test_feature.csv')

## Consolidated DATA  Train_ Test 

In [45]:
jon= pd.read_csv('jon_original_HouseData.csv')
rich = pd.read_csv('rk_hotencode.csv')
kailun= pd.read_csv('train_all0.csv')

In [90]:
jon_test= pd.read_csv('jon_test_HouseData.csv')
rich_test = pd.read_csv('rk_hotencode_test.csv')
kailun_test= pd.read_csv('test_all0.csv')

In [122]:
# Examine Data 
kailun_test

Unnamed: 0,Id,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,KitchenQual,Functional,FireplaceQu,GarageFinish,GarageCond,Fence,HasGasHeating,Electrical_combined,GarageType_combined,GarageQual_combined,Garage_combined,SaleType_combined,HasPavedDrive,HasPavedDrive_combined,SaleCond_combined,HasCentralAir
0,0,4,4,2,4,3,3,3,8,1,2,4,4,1,1,1,2,3.0,1,0,1.0,1,1
1,1,4,4,2,6,2,3,4,8,1,2,4,1,1,1,1,2,3.0,1,0,1.0,1,1
2,2,5,4,2,7,2,4,3,8,4,4,4,4,1,1,1,2,3.0,1,0,1.0,1,1
3,3,4,4,2,7,2,5,4,8,5,4,4,1,1,1,1,2,3.0,1,0,1.0,1,1
4,4,5,4,2,6,2,5,4,8,1,3,4,1,1,1,1,2,3.0,1,0,1.0,1,1
5,5,5,4,2,2,2,4,3,8,4,4,4,1,1,1,1,2,3.0,1,0,1.0,1,1
6,6,5,4,2,6,2,5,3,8,1,4,4,5,1,1,1,2,3.0,1,0,1.0,1,1
7,7,5,4,2,2,2,4,3,8,5,4,4,1,1,1,1,2,3.0,1,0,1.0,1,1
8,8,5,4,5,7,2,4,4,8,2,2,4,1,1,1,1,2,3.0,1,0,1.0,1,1
9,9,4,4,2,6,4,3,3,8,1,4,4,4,1,1,1,2,3.0,1,0,1.0,1,1


In [98]:
rich_test= rich_test.drop('Unnamed: 0', axis=1)
jon_test= jon_test.drop('Unnamed: 0', axis=1)

In [77]:
# Note this is not needed for Test data set since there are no outliers 
Id= np.array(jon['Id'])-1
jon['Id']= Id
#house_data= house_data.drop(['SalePrice', 'SalePrice_log'], axis=1)
jon= jon.drop(['Unnamed: 0'], axis=1)

In [63]:
train_all_cat=pd.merge(kailun, rich, how= 'left', left_on= 'Id', right_on= rich.index)

In [101]:
test_all_cat=pd.merge(jon_test, rich_test, how= 'left', left_on='Id', right_on= 'Id' )

In [65]:
drop=['Id_x', 'Id_y', 'Unnamed: 0', ]
train_all_cat= train_all_cat.drop('Unnamed: 0', axis=1)

In [79]:
train_all_final= pd.merge(jon, train_all_cat, how='left', left_on= 'Id', right_on= 'Id')

In [127]:
test_all_final=pd.merge(test_all_cat, kailun_test, how= 'left', left_on= test_all_cat.index, right_on= kailun_test.index)

In [86]:
drop=['SalePrice_x','SalePrice_log_x']
train_all_final=train_all_final.drop(drop, axis=1)

In [128]:
#test_all_final.head()
drop=[ 'Id_y', 'key_0' ]
test_all_final= test_all_final.drop(drop, axis=1)

In [None]:
cat_var_RK= ['Alley','BldgType','Exterior1st', 'Condition1', 'Condition2', 'Exterior2nd', 'Foundation', 'HouseStyle', 
             'LandContour','LotConfig', 'MasVnrType', 'MSSubClass', 'MSZoning', 'RoofMatl', 'Street', 'Utilities']
ord_var_RK= ['ExterCond', 'ExterQual', 'LandSlope', 'LotShape']

In [None]:
cat_var_KC= ['HasGasHeating', 'Electrical_combined', 'CentralAir_Y', 'GarageType_Combined', 
             'GarageQual_combined','HasPavedDrive', 'SaleType_combined', 'SalesCond_Combined']
ord_var_KC= ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
            'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageCond', 'Fence']

In [None]:
cat_var_JH= ['MoSold','YrSold', 'GarageYrBlt']

In [88]:
train_all_final.to_csv('train_all_final.csv')

In [130]:
test_all_final.to_csv('test_all_final.csv')

In [129]:
test_all_final

Unnamed: 0,Id_x,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,KitchenQual,Functional,FireplaceQu,GarageFinish,GarageCond,Fence,HasGasHeating,Electrical_combined,GarageType_combined,GarageQual_combined,Garage_combined,SaleType_combined,HasPavedDrive,HasPavedDrive_combined,SaleCond_combined,HasCentralAir
0,1461,80.0,11622,5,6,1961,1961,0.0,468,144,270,882,896,0,0,896,0,0,1,0,2,1,5,0,1961,1,730,140,0,0,0,120,0,0,6,2010,3.0,2,3,1,4,1.0,5,1,13,2,3,1,3,2,1,11.0,14.0,3,3,3,2,4,4,2,4,3,3,3,8,1,2,4,4,1,1,1,2,3.0,1,0,1.0,1,1
1,1462,81.0,14267,6,6,1958,1958,108.0,923,0,406,1329,1329,0,0,1329,0,0,1,1,3,1,6,0,1958,1,312,393,36,0,0,0,0,12500,6,2010,4.0,2,3,2,4,1.0,1,1,13,3,3,1,3,4,1,12.0,15.0,2,3,3,2,4,4,2,6,2,3,4,8,1,2,4,1,1,1,1,2,3.0,1,0,1.0,1,1
2,1463,74.0,13830,5,5,1997,1998,0.0,791,0,137,928,928,701,0,1629,0,0,2,1,3,1,6,1,1997,2,482,212,34,0,0,0,0,0,3,2010,4.0,2,3,2,4,1.0,5,1,9,3,3,1,5,2,1,11.0,14.0,3,3,3,3,5,4,2,7,2,4,3,8,4,4,4,4,1,1,1,2,3.0,1,0,1.0,1,1
3,1464,78.0,9978,6,6,1998,1998,20.0,602,0,324,926,926,678,0,1604,0,0,2,1,3,1,7,1,1998,2,470,360,36,0,0,0,0,0,6,2010,4.0,2,3,2,4,1.0,5,1,9,3,3,1,5,2,1,11.0,14.0,2,3,3,3,4,4,2,7,2,5,4,8,5,4,4,1,1,1,1,2,3.0,1,0,1.0,1,1
4,1465,43.0,5005,8,5,1992,1992,0.0,263,0,1017,1280,1280,0,0,1280,0,0,2,0,2,1,5,0,1992,2,506,0,82,0,0,144,0,0,1,2010,4.0,2,3,2,2,1.0,5,1,22,3,3,5,3,2,1,7.0,8.0,3,4,3,3,5,4,2,6,2,5,4,8,1,3,4,1,1,1,1,2,3.0,1,0,1.0,1,1
5,1466,75.0,10000,6,5,1993,1994,0.0,0,0,763,763,763,892,0,1655,0,0,2,1,3,1,7,1,1993,2,440,157,84,0,0,0,0,0,4,2010,4.0,2,3,2,4,1.0,1,1,9,3,3,1,5,2,1,7.0,8.0,3,3,3,3,5,4,2,2,2,4,3,8,4,4,4,1,1,1,1,2,3.0,1,0,1.0,1,1
6,1467,68.580357,7980,6,7,1992,2007,0.0,935,0,233,1168,1187,0,0,1187,1,0,2,0,3,1,6,0,1992,2,420,483,21,0,0,0,0,500,3,2010,4.0,2,3,2,4,1.0,5,1,9,3,3,1,3,2,1,7.0,8.0,3,3,4,3,5,4,2,6,2,5,3,8,1,4,4,5,1,1,1,2,3.0,1,0,1.0,1,1
7,1468,63.0,8402,6,5,1998,1998,0.0,0,0,789,789,789,676,0,1465,0,0,2,1,3,1,7,1,1998,2,393,0,75,0,0,0,0,0,5,2010,4.0,2,3,2,4,1.0,5,1,9,3,3,1,5,2,1,11.0,14.0,3,3,3,3,5,4,2,2,2,4,3,8,5,4,4,1,1,1,1,2,3.0,1,0,1.0,1,1
8,1469,85.0,10176,7,5,1990,1990,0.0,637,0,663,1300,1341,0,0,1341,1,0,1,1,2,1,5,1,1990,2,506,192,0,0,0,0,0,0,2,2010,4.0,2,3,1,4,1.0,5,1,9,3,3,1,3,2,1,7.0,8.0,3,3,3,3,5,4,5,7,2,4,4,8,2,2,4,1,1,1,1,2,3.0,1,0,1.0,1,1
9,1470,70.0,8400,4,5,1970,1970,0.0,804,78,0,882,882,0,0,882,1,0,1,0,2,1,4,0,1970,2,525,240,0,0,0,0,0,0,4,2010,4.0,2,3,1,4,1.0,1,1,13,3,3,1,3,2,1,9.0,11.0,3,3,3,2,4,4,2,6,4,3,3,8,1,4,4,4,1,1,1,2,3.0,1,0,1.0,1,1
