# Set Up

In [1]:
# Import helpful libraries
import os
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being walked with the file name to get the full path
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


# Exploratory Data Analysis

In [3]:
# Load the training data (train.csv) into a DataFrame
iowa_file_path = '../input/house-prices-advanced-regression-techniques/train.csv'
home_data = pd.read_csv(iowa_file_path)
# Display the first five rows of home_data
home_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Summarizing data: mean and standard deviation of every numeric column
# Keep only the numeric columns so the aggregations below are well defined
home_data_numeric = home_data.select_dtypes(include=['number'])
result = home_data_numeric.agg(["mean", "std"])
print(result)

              Id  MSSubClass  LotFrontage       LotArea  OverallQual  \
mean  730.500000   56.897260    70.049958  10516.828082     6.099315   
std   421.610009   42.300571    24.284752   9981.264932     1.382997   

      OverallCond    YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  \
mean     5.575342  1971.267808   1984.865753  103.685262  443.639726  ...   
std      1.112799    30.202904     20.645407  181.066207  456.098091  ...   

      WoodDeckSF  OpenPorchSF  EnclosedPorch  3SsnPorch  ScreenPorch  \
mean   94.244521    46.660274      21.954110   3.409589    15.060959   
std   125.338794    66.256028      61.119149  29.317331    55.757415   

       PoolArea     MiscVal    MoSold       YrSold      SalePrice  
mean   2.758904   43.489041  6.321918  2007.815753  180921.195890  
std   40.177307  496.123024  2.703626     1.328095   79442.502883  

[2 rows x 38 columns]


## Data Cleaning and Imputation

In [5]:
# Count the number of missing values in each column
def count_missing_values(df):
    """Return the per-column count of missing values, largest first.

    Columns that have no missing values are left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.Series
        Missing-value counts indexed by column name, sorted in descending order.
    """
    # Sort first, then filter, so the ordering of the surviving columns is
    # exactly the ordering produced by the sort over all columns
    ordered_counts = df.isna().sum().sort_values(ascending=False)
    return ordered_counts.loc[ordered_counts.gt(0)]

# Show which columns of the freshly loaded training data contain missing values
count_missing_values(home_data)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

In [6]:
def deal_missing_values(df, threshold=0):
    """Remove or impute the missing values of ``df`` in place.

    The cleaning happens in two steps:

    1. Rows are dropped when they have a missing value in a column whose
       number of missing values is at most ``threshold * len(df)``.
    2. Whatever is still missing is imputed: numeric columns with their
       median, object (categorical) columns with their most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    threshold : float, default 0
        Fraction of the row count (e.g. 0.05 for five percent). Columns with
        at most this share of missing values have the affected rows dropped
        instead of imputed. With the default of 0 no rows are dropped.

    Returns
    -------
    None
    """
    # Turn the fraction into an absolute number of rows
    max_missing_rows = len(df) * threshold

    # Columns that are missing in few enough rows that dropping those rows is acceptable
    sparse_missing_cols = df.columns[df.isna().sum() <= max_missing_rows]

    # Drop the rows that have a missing value in any of those columns
    df.dropna(subset=sparse_missing_cols, inplace=True)

    # Impute numeric columns with the median of the remaining rows
    for col in df.select_dtypes(include=['number']).columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    # Impute categorical columns with their most frequent value
    for col in df.select_dtypes(include=['object']).columns:
        if not df[col].isna().any():
            continue
        modes = df[col].mode()
        # A column with no observed values has no mode; leave it untouched
        # rather than failing on the empty result
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])

In [7]:
# Deal with missing values: rows are dropped for columns missing in at most 5% of rows,
# everything else is imputed (home_data is modified in place)
deal_missing_values(home_data, threshold=0.05)
# Check what is still missing after cleaning
count_missing_values(home_data)

Series([], dtype: int64)

# Create y

In [8]:
# Target variable: the SalePrice column of the cleaned training data
y = home_data.SalePrice

# Create X

In [9]:
# Use every numeric column except the target (SalePrice) as a feature
# NOTE(review): this keeps the Id column as a feature; it looks like a plain row
# identifier — confirm whether it should be excluded
features = home_data.select_dtypes(include=['number']).drop('SalePrice', axis=1).columns

# Select columns corresponding to features, and preview the data
X = home_data[features]
X.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


# Split X and y into validation and training data

In [10]:
# Hold out part of the rows for validation; the fixed random_state keeps the split reproducible
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Validation Model

In [11]:
# Random forest fitted on the training split only, so the validation split stays unseen;
# the fixed random_state keeps the fitted model reproducible
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)

## Validate Model

In [12]:
# Predict on the held-out validation split and report the mean absolute error
rf_val_predictions = rf_model.predict(val_X)
# scikit-learn metrics take the true values first, then the predictions
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)
print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

Validation MAE for Random Forest Model: 18,628


# Competition Model

In [13]:
# To improve accuracy, create a new Random Forest model trained on all of the
# training data (training and validation splits together)
rf_model_on_full_data = RandomForestRegressor(random_state=1)

# Fit rf_model_on_full_data on the full X and y
rf_model_on_full_data.fit(X, y)

## Test Model

In [14]:
# Path to the file used for predictions
test_data_path = '../input/house-prices-advanced-regression-techniques/test.csv'

# Read the test data file using pandas
test_data = pd.read_csv(test_data_path)

# Deal with missing values. The default threshold=0 is used, so no rows are
# dropped and every missing value is imputed; the imputation statistics are
# computed from the test data itself
deal_missing_values(test_data)

# Create test_X, which comes from test_data but includes only the columns used for prediction.
# The list of columns is stored in a variable called features
test_X = test_data[features]

# Make the predictions that will be submitted
test_preds = rf_model_on_full_data.predict(test_X)

## Generate a submission

In [15]:
# Run the code to save predictions in the format used for competition scoring

# Pair each test Id with its predicted SalePrice
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
# Write the submission file without the DataFrame index column
output.to_csv('submission.csv', index=False)