In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [4]:
data=pd.read_csv('train.csv', index_col='Id')
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
X_full=data.drop('SalePrice', axis=1)
y=data.SalePrice

X_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [16]:
# drop columns with a lot missing values
X_full.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1, inplace=True)

In [19]:
numerical_columns=[col for col in X_full.columns if X_full[col].dtype in ['int64', 'float64']]

categorical_columns=[col for col in X_full.columns if X_full[col].dtype=='object' and X_full[col].nunique()<=10]

print( numerical_columns, len(numerical_columns))
print( categorical_columns, len(categorical_columns))

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'] 36
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'] 35


In [21]:
categorical_columns_5=[col for col in X_full.columns if X_full[col].dtype=='object' and X_full[col].nunique()<=5]
print(categorical_columns_5, len(categorical_columns_5))

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'BldgType', 'MasVnrType', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive'] 22


In [22]:
X_full[numerical_columns].isnull().sum()

MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [44]:
X=X_full[numerical_columns+categorical_columns].copy()
X

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1457,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,SBrkr,TA,Min1,Attchd,Unf,TA,TA,Y,WD,Normal
1458,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1459,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,FuseA,Gd,Typ,Attchd,Unf,TA,TA,Y,WD,Normal


In [40]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer()

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns_5)
    ])

In [77]:
from xgboost import XGBRegressor

In [47]:
y.isnull()

Id
1       False
2       False
3       False
4       False
5       False
        ...  
1456    False
1457    False
1458    False
1459    False
1460    False
Name: SalePrice, Length: 1460, dtype: bool

In [50]:
from sklearn.model_selection import train_test_split

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [80]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [54]:
X_train_full[numerical_columns]=numerical_transformer.fit_transform(X_train_full[numerical_columns])
X_valid_full[numerical_columns]=numerical_transformer.transform(X_valid_full[numerical_columns])
X_train_full[numerical_columns].isnull().sum()

MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [56]:
cat_imputer=SimpleImputer(strategy='most_frequent')

X_train_full[categorical_columns]=cat_imputer.fit_transform(X_train_full[categorical_columns])
X_valid_full[categorical_columns]=cat_imputer.transform(X_valid_full[categorical_columns])
X_train_full[categorical_columns].isnull().sum()

MSZoning         0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
SaleType         0
SaleCondition    0
dtype: int64

In [72]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train_full[categorical_columns]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid_full[categorical_columns]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train_full.index
OH_cols_valid.index = X_valid_full.index

# Add one-hot encoded columns to numerical features
X_train = pd.concat([X_train_full[numerical_columns], OH_cols_train], axis=1)
X_valid = pd.concat([X_valid_full[numerical_columns], OH_cols_valid], axis=1)


In [74]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error

lr=LinearRegression()
lr.fit(X_train, y_train)

scores_lg=mean_squared_log_error(lr.predict(X_valid).clip(0,), y_valid)



print("MSLE scores:\n", scores_lg)

MSLE scores:
 1.0624642388404038




In [78]:
xbr=XGBRegressor()
xbr.fit(X_train, y_train)

scores_xbr=mean_squared_log_error(xbr.predict(X_valid).clip(0,), y_valid)



print("MSLE scores:\n", scores_xbr)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


MSLE scores:
 0.01918704814653112


In [81]:
rf=RandomForestRegressor()
rf.fit(X_train, y_train)

scores_rf=mean_squared_log_error(rf.predict(X_valid).clip(0,), y_valid)

print("MSLE scores:\n", scores_rf)



MSLE scores:
 0.01967861746487666




In [75]:
X_train

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,164,165,166,167,168,169,170,171,172,173
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20.0,90.000000,11694.0,9.0,5.0,2007.0,2007.0,452.0,48.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
871,20.0,60.000000,6600.0,5.0,5.0,1962.0,1962.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
93,30.0,80.000000,13360.0,5.0,7.0,1921.0,2006.0,0.0,713.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
818,20.0,69.614017,13265.0,8.0,5.0,2002.0,2002.0,148.0,1218.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
303,20.0,118.000000,13704.0,7.0,5.0,2001.0,2002.0,150.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,60.0,82.000000,9430.0,8.0,5.0,1999.0,1999.0,673.0,1163.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
836,20.0,60.000000,9600.0,4.0,7.0,1950.0,1995.0,0.0,442.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1217,90.0,68.000000,8930.0,6.0,5.0,1978.0,1978.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
560,120.0,69.614017,3196.0,7.0,5.0,2003.0,2004.0,18.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## X_test

In [83]:
X_test_data=pd.read_csv('test.csv', index_col='Id')

In [84]:
X_test=X_test_data.copy()

In [85]:
X_test[numerical_columns]=numerical_transformer.transform(X_test[numerical_columns])

X_test[categorical_columns]=cat_imputer.transform(X_test[categorical_columns])

In [86]:
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[categorical_columns]))

# One-hot encoding removed index; put it back
OH_cols_test.index = X_test.index

# Add one-hot encoded columns to numerical features
X_test_final = pd.concat([X_test[numerical_columns], OH_cols_test], axis=1)

In [88]:
predictions=xbr.predict(X_test_final).clip(0,)

output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': predictions})
output.to_csv('submission_1.csv', index=False)

In [90]:
# Check pipelines

In [89]:
OH_encoder.transform(X_test[categorical_columns])

array([[0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [94]:
X_check=X_train_full.copy()

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor.fit_transform(X_check)


array([[2.00000000e+01, 9.00000000e+01, 1.16940000e+04, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [2.00000000e+01, 6.00000000e+01, 6.60000000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [3.00000000e+01, 8.00000000e+01, 1.33600000e+04, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       ...,
       [9.00000000e+01, 6.80000000e+01, 8.93000000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.20000000e+02, 6.96140167e+01, 3.19600000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [6.00000000e+01, 5.80000000e+01, 1.67700000e+04, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [95]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_train_full, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'Bed...
                                              

In [97]:
scores_rf_p=mean_squared_log_error(clf.predict( X_valid_full).clip(0,), y_valid)
scores_rf_p

0.02008707949496977

In [98]:
predictions_2_p=clf.predict(X_test).clip(0,)

output_2 = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': predictions_2_p})
output_2.to_csv('submission_2.csv', index=False)

In [99]:
model=XGBRegressor()
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_train_full, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'TotalBsmtSF',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'LowQualFinSF', 'GrLivArea',
                                                   'BsmtFullBath',
                                                   'BsmtHalfBath', 'FullBath',
                                                   'HalfBath', 'Bed...
                              gamma=0, gpu_id=

In [100]:
scores_rf_p=mean_squared_log_error(clf.predict( X_valid_full), y_valid)
scores_rf_p

0.018240281081462854

In [101]:
predictions_3=clf.predict(X_test)

output_3 = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': predictions_3})
output_3.to_csv('submission_3.csv', index=False)

In [106]:
scores_CVS=cross_val_score(clf, X_train_full, y_train, cv=5, scoring='neg_mean_squared_log_error')
scores_CVS.mean()

-0.021786616120222722

In [105]:
from sklearn import metrics
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [115]:
list_scores=dict()
for n in [100, 200, 300, 400]:
    model=XGBRegressor(n_estimators=n)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])
    scores_CVS=cross_val_score(clf, X_train_full, y_train, cv=5, scoring='neg_mean_squared_log_error')
    list_scores[n]=-scores_CVS.mean()

In [116]:
print(list_scores.items())

dict_items([(100, 0.021786616120222722), (200, 0.021789935134036), (300, 0.021793843487836518), (400, 0.021793866968673064)])


In [117]:
list_scores_2=dict()
for n in [100, 200, 300, 400]:
    model=RandomForestRegressor(n_estimators=n)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])
    scores_CVS=cross_val_score(clf, X_train_full, y_train, cv=5, scoring='neg_mean_squared_log_error')
    list_scores_2[n]=-scores_CVS.mean()

print(list_scores_2.items())   

dict_items([(100, 0.02245411695880691), (200, 0.022075981356886192), (300, 0.021895939504768123), (400, 0.021958009963557415)])


In [118]:
model=LinearRegression()
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])
scores_CVS=cross_val_score(clf, X_train_full, y_train, cv=5, scoring='neg_mean_squared_log_error')
print(-scores_CVS.mean())  

0.026133622412669404


## Columns, I would choose

In [120]:
numerical_columns

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [123]:
num_col_del=['LotFrontage','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','2stFlrSF','LowQualFinSF','GrLivArea','GarageYrBlt','GarageArea']
numerical_columns_my=list(set(numerical_columns) - set(num_col_del))
numerical_columns_my

['KitchenAbvGr',
 'WoodDeckSF',
 '2ndFlrSF',
 'HalfBath',
 'PoolArea',
 '3SsnPorch',
 'YearBuilt',
 'MiscVal',
 'YrSold',
 'OverallQual',
 'GarageCars',
 'BedroomAbvGr',
 'ScreenPorch',
 'EnclosedPorch',
 'TotRmsAbvGrd',
 'YearRemodAdd',
 'LotArea',
 'BsmtHalfBath',
 'OverallCond',
 'BsmtFullBath',
 'FullBath',
 'Fireplaces',
 'MSSubClass',
 '1stFlrSF',
 'TotalBsmtSF',
 'OpenPorchSF',
 'MoSold']

In [124]:
categorical_columns_my=['MSZoning', 'Street', 'Utilities', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
                        'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
                        'BsmtExposure', 'Heating', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 
                        'GarageType', 'PavedDrive', 'SaleType', 'SaleCondition']

In [130]:
preprocessor_my = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns_my),
        ('cat', categorical_transformer, categorical_columns_my)
    ])
pypeline_my = Pipeline(steps=[('preprocessor', preprocessor_my),
                      ('model', model)
                     ])

scores_LR_my=cross_val_score(pypeline_my, X_train_full[numerical_columns_my + categorical_columns_my] , y_train, cv=5, scoring='neg_mean_squared_log_error')

0.026133622412669404


Traceback (most recent call last):
  File "/Users/nadiiaturbai/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/nadiiaturbai/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/nadiiaturbai/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/nadiiaturbai/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_regression.py", line 521, in mean_squared_log_error
    raise ValueError(
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



In [132]:
print(-scores_LR_my.mean())  

nan


In [131]:
scores_LR_my

array([-0.02572242,         nan, -0.03781956, -0.02168647, -0.02097695])

In [133]:
model_xbr=XGBRegressor()
pypeline_my = Pipeline(steps=[('preprocessor', preprocessor_my),
                      ('model', model_xbr)
                     ])
scores_xbr_my=cross_val_score(pypeline_my, X_train_full[numerical_columns_my + categorical_columns_my], y_train, cv=5, scoring='neg_mean_squared_log_error')
print(-scores_xbr_my.mean())  

0.023621369624265382


In [134]:
scores_xbr_my

array([-0.01852831, -0.03330006, -0.02754032, -0.02002463, -0.01871353])

In [136]:
# My XBR for test data

pypeline_my.fit(X_train_full[numerical_columns_my + categorical_columns_my], y_train)
predictions_4=pypeline_my.predict(X_test[numerical_columns_my + categorical_columns_my]).clip(0,)

output_4 = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': predictions_4})
output_4.to_csv('submission_4.csv', index=False)

In [137]:
# submission_4 - result: 0.15 (the worst)