In [67]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [68]:
df=pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
df_test=pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

In [69]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [70]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [71]:
print(f"Train Shape: {df.shape}\nTest Shape: {df_test.shape}")

Train Shape: (1460, 81)
Test Shape: (1459, 80)


In [72]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [73]:
df.dropna(axis=0, subset=['SalePrice'], inplace=True)
y_train = df.SalePrice
df.drop(['SalePrice'], axis=1, inplace=True)

In [74]:
df.info

<bound method DataFrame.info of         Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Ut

In [75]:
cat_cols = [col for col in df.columns if df[col].dtype=="O"]
num_cols = [col for col in df.columns if(df[col].dtype=="int64" or df[col].dtype=="float64")]

In [76]:
print(f"Total number of cols = cat_cols + num_cols = {len(cat_cols)+len(num_cols)}\nNumber of columns in df = {df.shape[1]}")

Total number of cols = cat_cols + num_cols = 80
Number of columns in df = 80


In [77]:
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_num = SimpleImputer(strategy='constant')

In [78]:
imputer_cat.fit(df[cat_cols])
imputer_num.fit(df[num_cols])


SimpleImputer(strategy='constant')

In [79]:
df_imputed_num = pd.DataFrame(imputer_num.transform(df[num_cols].copy()))
df_imputed_cat = pd.DataFrame(imputer_cat.transform(df[cat_cols].copy()))


In [80]:
df_imputed_num.columns = df[num_cols].columns
df_imputed_cat.columns = df[cat_cols].columns


In [81]:
df_imputed = pd.concat([df_imputed_num,df_imputed_cat], axis = 1)
assert df_imputed.shape == df.shape, "Shape mismatch"

In [82]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
enc.fit(df_imputed[cat_cols])
X_cat_transformed = pd.DataFrame(enc.transform(df_imputed[cat_cols]), columns=enc.get_feature_names(input_features=cat_cols))
df_final = df_imputed.merge(X_cat_transformed, left_index = True, right_index = True).drop(columns = cat_cols, axis=1)
df_final.shape

(1460, 289)

In [83]:
df, X_valid, y_train, y_valid = train_test_split(df_final, y_train, train_size=0.8, test_size=0.2,random_state=197)

In [84]:
df.shape

(1168, 289)

In [85]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 20)]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(5, 30, num = 25)]
max_depth.append(None)
min_samples_split = [int(x) for x in np.linspace(2, 10, num = 8)]
min_samples_leaf = [int(x) for x in np.linspace(1, 5, num = 5)]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(df, y_train)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   47.1s


In [None]:
model = RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=10, min_samples_split= 4, min_samples_leaf=1, max_features='sqrt', max_depth=24,bootstrap=False)
model.fit(df, y_train)

In [None]:
X_valid.shape

In [None]:
preds_valid = model.predict(X_valid)
print(f"MAE: {mean_absolute_error(y_valid, preds_valid)}")

In [None]:
df_test_imputed_num = pd.DataFrame(imputer_num.transform(df_test[num_cols]))
df_test_imputed_cat = pd.DataFrame(imputer_cat.transform(df_test[cat_cols]))

df_test_imputed_num.columns = df_test[num_cols].columns
df_test_imputed_cat.columns = df_test[cat_cols].columns

df_test_imputed = pd.concat([df_test_imputed_num,df_test_imputed_cat], axis = 1)
assert df_test_imputed.shape == df_test.shape, "Shape mismatch"

X_cat_transformed = pd.DataFrame(enc.transform(df_test_imputed[cat_cols]), columns=enc.get_feature_names(input_features=cat_cols))
df_test_final = df_test_imputed.merge(X_cat_transformed, left_index = True, right_index = True).drop(columns = cat_cols, axis=1)


In [None]:
print(f"Original shape: {df_test.shape}\nImputed shape: {df_test_imputed.shape}\nEncoded shape: {df_test_final.shape}")

In [None]:
preds_test = model.predict(df_test_final)

In [None]:
output = pd.DataFrame({'Id': df_test.index,'SalePrice': preds_test})
output.to_csv('/kaggle/input/home-data-for-ml-course/sample_submission.csv', index=False)