# Machine Learning Housing Competition
#### By: MaryGrace Kane

### Ingesting Data

In [97]:
import pandas as pd
import numpy as np
# Read the training set first, then the test set, from CSV files located in
# the notebook's working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)
# Bare trailing expression: the notebook renders the training frame.
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


### Identifying Columns to be used in Dataset

In [127]:
# Columns of the training frame whose dtype is anything other than the
# generic `object` dtype (i.e. the columns pandas parsed as numbers).
numeric_columns = [
    col for col, dtype in train_data.dtypes.items() if dtype != object
]

# Columns that contain no missing values, computed separately per dataset.
full_columns_train = train_data.columns[train_data.notna().all()].tolist()
full_columns_test = test_data.columns[test_data.notna().all()].tolist()

# Usable features: complete in BOTH datasets and numeric. The order follows
# the training frame. The boolean `and` replaces the bitwise `&` that was
# previously applied to two plain bools.
clean_columns = [
    col for col in full_columns_train
    if col in full_columns_test and col in numeric_columns
]

# NOTE(review): 'Id' passes these filters, so it is fed to the models as a
# feature even though it looks like a row identifier. The submission cell
# reads `X_test_data.Id`, so dropping it here would also require changing
# that cell; confirm whether it should be excluded from the features.

# Display the selected columns as the cell output.
clean_columns

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

### Creating X and y & Cleaning Data

In [132]:
# Feature matrix: every training row, restricted to the selected columns.
X = train_data.loc[:, clean_columns]
# Target vector: the sale price column of the training data.
y = train_data['SalePrice']

#### Citation for finding numeric columns in pandas (e.g. `_get_numeric_data()`; see the numeric-column filter above): https://stackoverflow.com/questions/25039626/how-do-i-find-numeric-columns-in-pandas

In [133]:
# Test-set features: the same columns, in the same order, as the training
# feature matrix.
X_test_data = test_data.loc[:, clean_columns]

### Splitting X and y

In [134]:
from sklearn.model_selection import train_test_split

# Hold out a validation set. test_size=0.25 is scikit-learn's default when
# neither size is given; it is spelled out here so the split ratio is visible.
# The fixed random_state makes the shuffle reproducible.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

### DecisionTreeRegressor

In [135]:
from sklearn.tree import DecisionTreeRegressor

# Build the decision tree and fit it on the training split in one step
# (`fit` returns the estimator itself). The seed keeps the tree reproducible.
housing_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)

# Predictions for the held-out validation rows, scored in the next cell.
val_predictions = housing_model.predict(val_X)

In [136]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the decision tree on the validation split.
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
print(val_mae)

25198.854794520546


### RandomForestRegressor

In [137]:
from sklearn.ensemble import RandomForestRegressor

# Build the random forest and fit it on the training split in one step
# (`fit` returns the estimator itself). The seed keeps the forest reproducible.
rf_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predictions for the held-out validation rows, scored in the next cell.
rf_val_predictions = rf_model.predict(val_X)

In [138]:
# Mean absolute error of the random forest on the validation split, for
# comparison with the decision tree's score.
rf_val_mae = mean_absolute_error(y_true=val_y, y_pred=rf_val_predictions)
print(rf_val_mae)

17707.70087671233


### Fitting to Full Data with Best Model

In [139]:
# Refit the same random forest object on the full training data (X, y)
# instead of only the train split. This overwrites the earlier fit in place,
# so from here on `rf_model` has seen the validation rows; re-running the
# validation-score cell after this one would no longer be a held-out estimate.
rf_model.fit(X, y)

RandomForestRegressor(random_state=1)

### Predicting using test_data

In [140]:
# Predict sale prices for the test-set feature frame using the random forest
# (refit on the full training data in the preceding cell).
test_predictions = rf_model.predict(X_test_data)

### Create Output Dataframe

In [141]:
# Two-column submission frame: the test-set Id plus the predicted price.
output = X_test_data[['Id']].assign(SalePrice=test_predictions)
# Write without the index so the file contains only Id and SalePrice.
output.to_csv('submission2.csv', index=False)