In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn import metrics
from sklearn.model_selection import cross_val_score

from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

In [2]:
data = pd.read_csv("../input/home-data-for-ml-course/train.csv")
train = pd.DataFrame(data)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
data = pd.read_csv("../input/home-data-for-ml-course/test.csv")
test = pd.DataFrame(data)
test.head()

In [3]:
y = train['SalePrice']
train.drop(["SalePrice", "Id"], axis =1, inplace = True)
test.drop(["Id"], axis =1, inplace = True)

In [4]:
print("Train shape before dropping features more than 80% missing values: ", train.shape)
mv = train.isnull().sum()/len(train)
train = train.drop(columns=mv[mv>0.8].index)
print("Train shape after dropping features more than 80% missing values: ", train.shape)

print("Train shape before dropping features more than 80% missing values: ", test.shape)
mv = test.isnull().sum()/len(test)
train = test.drop(columns=mv[mv>0.8].index)
print("Train shape after dropping features more than 80% missing values: ", test.shape)

Train shape before dropping features more than 80% missing values:  (1460, 79)
Train shape after dropping features more than 80% missing values:  (1460, 75)


In [5]:
for x in train.columns:
    #printing unique values
    print(x ,':', len(train[x].unique()))

MSSubClass : 15
MSZoning : 5
LotFrontage : 111
LotArea : 1073
Street : 2
LotShape : 4
LandContour : 4
Utilities : 2
LotConfig : 5
LandSlope : 3
Neighborhood : 25
Condition1 : 9
Condition2 : 8
BldgType : 5
HouseStyle : 8
OverallQual : 10
OverallCond : 9
YearBuilt : 112
YearRemodAdd : 61
RoofStyle : 6
RoofMatl : 8
Exterior1st : 15
Exterior2nd : 16
MasVnrType : 5
MasVnrArea : 328
ExterQual : 4
ExterCond : 5
Foundation : 6
BsmtQual : 5
BsmtCond : 5
BsmtExposure : 5
BsmtFinType1 : 7
BsmtFinSF1 : 637
BsmtFinType2 : 7
BsmtFinSF2 : 144
BsmtUnfSF : 780
TotalBsmtSF : 721
Heating : 6
HeatingQC : 5
CentralAir : 2
Electrical : 6
1stFlrSF : 753
2ndFlrSF : 417
LowQualFinSF : 24
GrLivArea : 861
BsmtFullBath : 4
BsmtHalfBath : 3
FullBath : 4
HalfBath : 3
BedroomAbvGr : 8
KitchenAbvGr : 4
KitchenQual : 4
TotRmsAbvGrd : 12
Functional : 7
Fireplaces : 4
FireplaceQu : 6
GarageType : 7
GarageYrBlt : 98
GarageFinish : 4
GarageCars : 5
GarageArea : 441
GarageQual : 6
GarageCond : 6
PavedDrive : 3
WoodDeckSF :

In [15]:
# Filtering numerical data
num_df = train.select_dtypes(include=np.number)
print(num_df.shape)

# Filtering categorical data
cat_df = train.select_dtypes(exclude=np.number)
print(cat_df.shape)

(1460, 36)
(1460, 39)


In [None]:
# Filtering numerical data
num_df_test = test.select_dtypes(include=np.number)
print(num_df.shape)

# Filtering categorical data
cat_df_test = test.select_dtypes(exclude=np.number)
print(cat_df_test.shape)

In [16]:
for x in cat_df.columns:
    #printing unique values
    print(x ,':', len(cat_df[x].unique()))

MSZoning : 5
Street : 2
LotShape : 4
LandContour : 4
Utilities : 2
LotConfig : 5
LandSlope : 3
Neighborhood : 25
Condition1 : 9
Condition2 : 8
BldgType : 5
HouseStyle : 8
RoofStyle : 6
RoofMatl : 8
Exterior1st : 15
Exterior2nd : 16
MasVnrType : 5
ExterQual : 4
ExterCond : 5
Foundation : 6
BsmtQual : 5
BsmtCond : 5
BsmtExposure : 5
BsmtFinType1 : 7
BsmtFinType2 : 7
Heating : 6
HeatingQC : 5
CentralAir : 2
Electrical : 6
KitchenQual : 4
Functional : 7
FireplaceQu : 6
GarageType : 7
GarageFinish : 4
GarageQual : 6
GarageCond : 6
PavedDrive : 3
SaleType : 9
SaleCondition : 6


In [17]:

# Filling missing values by median for numerical columns 
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
num_df = pd.DataFrame(imp_median.fit_transform(num_df), columns=num_df.columns)
print(num_df.shape)

# Filling missing values by most frequent value for categorical columns
for i in cat_df.columns:
    cat_df[i] = cat_df[i].fillna(cat_df[i].mode()[0])
    
cat_df.isnull().sum()
num_df.isnull().sum()

(1460, 36)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":


MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [33]:
scaled_num = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(num_df), columns = num_df.columns)
#optimuadd_suffixm number of components
pca = PCA().fit(scaled_num)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("number of components")
plt.ylabel("Cumulative Rate of Variance")

pca = PCA(n_components = 0.95)

pca_fit = pca.fit_transform(scaled_num)
num_pca = pd.DataFrame(data = pca_fit)
pca.explained_variance_ratio_.sum()
del num_df, scaled_num

NameError: name 'num_df' is not defined

In [23]:
num_pca.shape

(1460, 29)

In [30]:


cat_df = pd.get_dummies(cat_df)
cat_df.head()

clf = ExtraTreesClassifier(n_estimators=150, criterion = 'entropy')
clf = clf.fit(cat_df, y)

model = SelectFromModel(clf, prefit=True)
feature_idx = model.get_support()
feature_name = cat_df.columns[feature_idx]

cat_new = pd.DataFrame(model.transform(cat_df), columns = feature_name)
cat_new.head()

  f"X has feature names, but {self.__class__.__name__} was fitted without"


Unnamed: 0,MSZoning_RL,MSZoning_RM,LotShape_IR1,LotShape_Reg,LandContour_Lvl,LotConfig_Corner,LotConfig_Inside,Neighborhood_CollgCr,Neighborhood_NAmes,Condition1_Norm,...,GarageType_Attchd,GarageType_Detchd,GarageFinish_Fin,GarageFinish_RFn,GarageFinish_Unf,PavedDrive_Y,SaleType_New,SaleType_WD,SaleCondition_Normal,SaleCondition_Partial
0,1,0,0,1,1,0,1,1,0,1,...,1,0,0,1,0,1,0,1,1,0
1,1,0,0,1,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,1,0
2,1,0,1,0,1,0,1,1,0,1,...,1,0,0,1,0,1,0,1,1,0
3,1,0,1,0,1,1,0,0,0,1,...,0,1,0,0,1,1,0,1,0,0
4,1,0,1,0,1,0,0,0,0,1,...,1,0,0,1,0,1,0,1,1,0


In [32]:
# Concatinating numerical and categorical data
train = pd.concat([y, num_pca, cat_df], axis=1)
train = pd.DataFrame(train)

# Verifying missing values
print(f'Total missing values: {train.isnull().sum().sum()}')
print(train.shape)
train.head()

Total missing values: 0
(1460, 269)


Unnamed: 0,SalePrice,0,1,2,3,4,5,6,7,8,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,208500,0.492427,0.118482,0.208697,-0.282859,0.342976,-0.177677,-0.139861,-0.16174,-0.273673,...,0,0,0,1,0,0,0,0,1,0
1,181500,-0.121454,-0.116552,-0.260003,0.199992,0.146006,-0.061426,-0.076182,0.154779,-0.005555,...,0,0,0,1,0,0,0,0,1,0
2,223500,0.535528,-0.036311,0.172966,-0.062405,-0.063315,0.266637,-0.099067,-0.162771,-0.001253,...,0,0,0,1,0,0,0,0,1,0
3,140000,-0.168763,-0.410712,0.01677,0.080357,0.431293,-0.23483,0.129264,0.27369,-0.182482,...,0,0,0,1,1,0,0,0,0,0
4,250000,0.752436,-0.16415,0.267805,0.203389,-0.212172,0.437468,0.012147,-0.152838,-0.196279,...,0,0,0,1,0,0,0,0,1,0


In [42]:

X = train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [43]:


clf = RandomForestRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

7724.976539625532


In [44]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [57]:



space = {
    "bootstrap": hp.choice("bootstrap", [True, False]),
    "n_estimators": hp.choice("n_estimators", [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]),
    "max_depth": hp.choice("max_depth", [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]),
    "criterion": hp.choice("criterion", ['squared_error', 'absolute_error', 'poisson']),
    "max_features": hp.choice("max_features", ['auto', 'sqrt']),
    "min_samples_leaf": hp.choice ("min_samples_leaf", [1, 2, 4]),
    "min_samples_split": hp.choice("min_samples_split", [2, 4, 6, 8, 10])
}

def hyperparameter_tuning(params):
    rf = RandomForestRegressor(**params,n_jobs=-1, verbose = 0)
    rf.fit(X_train, y_train)
    y_scores = rf.predict(X_test)
    sqrt= np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    return {"loss": -sqrt, "status": STATUS_OK}


trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=20, 
    trials=trials
)

print("Best: {}".format(best))

100%|██████████| 20/20 [09:16<00:00, 27.81s/trial, best loss: -7724.976539625532]
Best: {'bootstrap': 1, 'criterion': 0, 'max_depth': 7, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 3}


In [58]:
rf = RandomForestRegressor(bootstrap = False,
                            criterion = "squared_error",
                            max_depth= 80,
                            n_estimators= 800,
                            min_samples_split = 8,
                            min_samples_leaf = 2,
                            max_features= 'sqrt')

rf.fit(X_train, y_train)
y_scores = rf.predict(X_test)
sqrt= np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(sqrt)

7724.976539625532
