In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)


In [None]:
# Load the training CSV; its 'Id' column is used as the row index.
house = pd.read_csv(
    filepath_or_buffer='/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)

In [None]:
# Preview the first five rows of the loaded frame.
house.head(n=5)

In [None]:
# Names of the columns that describe() summarises (the numeric ones by default);
# despite the name, this is not restricted to integer dtypes.
int_cols = house.describe().columns.tolist()
int_cols

In [None]:
# Count the missing values in each of the columns listed in int_cols.
house.loc[:, int_cols].isna().sum()

# Imputation
Methods for filling in missing values:

* impute with the mean or mode
* impute with zeros or negative entries to indicate a missing value
* impute by backfill / forward-fill / KDE
* impute with an ML model

In [None]:
# Assumption: a missing value means the house has no such feature to record
# (no street frontage, no garage, no masonry veneer), so it is filled with 0.
for column in ('LotFrontage', 'GarageYrBlt', 'MasVnrArea'):
    house[column] = house[column].fillna(0)

# Sanity check: number of LotFrontage values that are still missing.
house['LotFrontage'].isna().sum()

In [None]:
# Show the frame's column labels.
house.columns

In [None]:
# Count the missing values in the three categorical columns that get encoded below.
house.loc[:, ['Electrical', 'SaleType', 'SaleCondition']].isna().sum()

# Label Encoding

In [None]:
# Missing Electrical entries are given their own 'other' category.
house['Electrical'] = house['Electrical'].fillna('other')

# Integer code assigned to each Electrical label; -1 is the code for the
# 'other' placeholder introduced above.
Electrical = {
    'SBrkr': 1,
    'FuseA': 2,
    'FuseF': 3,
    'FuseP': 4,
    'Mix': 5,
    'other': -1,
}

# Kept as the last expression of the cell so the category counts are actually
# rendered; a bare expression in the middle of a cell is evaluated and discarded.
house['Electrical'].value_counts()

In [None]:
# Integer code assigned to each SaleType label; 'other' (-1) is a fallback code
# for anything that is not one of the listed labels.
SaleType = {
    'WD': 1,
    'New': 2,
    'COD': 3,
    'ConLD': 4,
    'ConLw': 5,
    'ConLI': 6,
    'CWD': 7,
    'Oth': 8,
    'Con': 9,
    'other': -1,
}

# Kept as the last expression of the cell so the category counts are actually
# rendered; a bare expression in the middle of a cell is evaluated and discarded.
house['SaleType'].value_counts()

In [None]:
# Integer code assigned to each SaleCondition label; 'other' (-1) is a fallback
# code for anything that is not one of the listed labels.
SaleCondition = {
    'Normal': 1,
    'Partial': 2,
    'Abnorml': 3,
    'Family': 4,
    'Alloca': 5,
    'AdjLand': 6,
    'other': -1,
}

# Kept as the last expression of the cell so the category counts are actually
# rendered; a bare expression in the middle of a cell is evaluated and discarded.
house['SaleCondition'].value_counts()

In [None]:
# Replace each category label with its integer code, using the lookup tables
# (Electrical, SaleCondition, SaleType) built in the preceding cells.
for column, codes in (
    ('Electrical', Electrical),
    ('SaleCondition', SaleCondition),
    ('SaleType', SaleType),
):
    # A column that is already numeric has been encoded by an earlier run of
    # this cell; skipping it makes the cell safe to re-run instead of failing
    # with a KeyError on the integer codes.
    if pd.api.types.is_numeric_dtype(house[column]):
        continue
    # Series.map leaves labels that are absent from the table (NaN included)
    # as NaN; those are sent to the table's 'other' code rather than aborting
    # the whole cell with a KeyError, which is what the fallback entry is for.
    house[column] = house[column].map(codes).fillna(codes['other']).astype('int64')

In [None]:
# Recompute the describe()-able columns now that the three categorical columns
# hold integer codes, then keep only those columns for modelling.
int_cols = house.describe().columns.tolist()
house = house.loc[:, int_cols]

In [None]:
# Count the missing values left in each remaining column.
house.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Target is SalePrice; every other remaining column is a predictor.
y = house['SalePrice']
X = house.drop(columns='SalePrice')

# Hold out 30% of the rows for validation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=42,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Fit a single, fairly constrained decision tree as a baseline model.
dcr = DecisionTreeRegressor(max_depth=8, min_samples_split=20, min_samples_leaf=25,
                            min_weight_fraction_leaf=0.0, max_features=15,
                            max_leaf_nodes=None, min_impurity_decrease=0.0, random_state=42)
dcr.fit(X_train, y_train)

# Report the estimator's built-in score on the training and held-out rows.
print('Score on training data: ', dcr.score(X_train, y_train))
print('Score on validation data: ', dcr.score(X_test, y_test), end='\n\n')

# sklearn metrics take (y_true, y_pred) in that order. MAE happens to be
# symmetric, but keeping the documented order avoids a silent bug if this
# metric is later swapped for an asymmetric one.
print('MAE for training data is: ', mean_absolute_error(y_train, dcr.predict(X_train)))
print('MAE for validation data is: ', mean_absolute_error(y_test, dcr.predict(X_test)))

In [None]:
# Sweep min_samples_split over 20..49 with the other tree settings held fixed,
# recording the mean absolute error on the training and held-out rows.
training = []
testing = []
l = list(range(20, 50, 1))  # candidate min_samples_split values; reused as the plot's x-axis
for min_split in l:
    dcr = DecisionTreeRegressor(max_depth=6, min_samples_split=min_split, min_samples_leaf=10,
                                max_leaf_nodes=40, min_impurity_decrease=0.0, random_state=42)
    dcr.fit(X_train, y_train)

    # sklearn metrics take (y_true, y_pred) in that order; results are appended
    # in step with `l` so index k of each list belongs to l[k].
    training.append(mean_absolute_error(y_train, dcr.predict(X_train)))
    testing.append(mean_absolute_error(y_test, dcr.predict(X_test)))
    


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Plot the MAE recorded by the sweep against the min_samples_split value that
# produced it. Each curve is labelled and the axes are named so the figure can
# be read on its own; without labels the two lines are indistinguishable.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=l, y=training, label='training', ax=ax)
sns.lineplot(x=l, y=testing, label='validation', ax=ax)
ax.set(
    xlabel='min_samples_split',
    ylabel='mean absolute error',
    title='Decision tree MAE vs. min_samples_split',
)
ax.legend();

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with default hyper-parameters. The seed is pinned to the
# same value used for the split and the decision trees so that a fresh
# Restart & Run All reproduces the numbers printed below.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Report the estimator's built-in score on the training and held-out rows.
print('Score on training data: ', model.score(X_train, y_train))
print('Score on validation data: ', model.score(X_test, y_test), end='\n\n')

# sklearn metrics take (y_true, y_pred) in that order. MAE happens to be
# symmetric, but keeping the documented order avoids a silent bug if this
# metric is later swapped for an asymmetric one.
print('MAE for training data is: ', mean_absolute_error(y_train, model.predict(X_train)))
print('MAE for validation data is: ', mean_absolute_error(y_test, model.predict(X_test)))