In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [2]:
def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

In [3]:
df = pd.read_csv("C:../../Dataset/FECourseData/ames.csv")
X = df.copy()
y = X.pop("SalePrice")

In [8]:
X_1 = pd.DataFrame()

X_1["LivLotRatio"] = X.GrLivArea / X.LotArea
X_1["Spaciousness"] = (X.FirstFlrSF + X.SecondFlrSF) / X.TotRmsAbvGrd
X_1["TotalOutsideSF"] = X.WoodDeckSF + X.OpenPorchSF + X.EnclosedPorch + X.Threeseasonporch + X.ScreenPorch

In [7]:
X.FirstFlrSF + (X.SecondFlrSF/X.TotRmsAbvGrd)

0       1656.000000
1        896.000000
2       1329.000000
3       2110.000000
4       1044.833333
           ...     
2925    1003.000000
2926     902.000000
2927     970.000000
2928    1389.000000
2929    1107.555556
Length: 2930, dtype: float64

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 78 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MSSubClass        2930 non-null   object 
 1   MSZoning          2930 non-null   object 
 2   LotFrontage       2930 non-null   float64
 3   LotArea           2930 non-null   float64
 4   Street            2930 non-null   object 
 5   Alley             2930 non-null   object 
 6   LotShape          2930 non-null   object 
 7   LandContour       2930 non-null   object 
 8   Utilities         2930 non-null   object 
 9   LotConfig         2930 non-null   object 
 10  LandSlope         2930 non-null   object 
 11  Neighborhood      2930 non-null   object 
 12  Condition1        2930 non-null   object 
 13  Condition2        2930 non-null   object 
 14  BldgType          2930 non-null   object 
 15  HouseStyle        2930 non-null   object 
 16  OverallQual       2930 non-null   object 


In [12]:
X_2 = pd.get_dummies(X.BldgType, prefix='Bldg')
X_2 = X_2.mul(X.GrLivArea, axis=0)

In [17]:
t = X['WoodDeckSF'].count(X.WoodDeckSF > 0)

  t = X['WoodDeckSF'].count(X.WoodDeckSF > 0)


ValueError: Series.count level is only valid with a MultiIndex

In [18]:
filtered_WoodDeckSF = X.WoodDeckSF > 0

In [23]:
filtered_WoodDeckSF.value_counts()

False    1526
True     1404
Name: WoodDeckSF, dtype: int64

In [25]:
X_3 = pd.DataFrame()

X_3["PorchTypes"] = df[[
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]].gt(0.0).sum(axis=1)

In [26]:
X_3

Unnamed: 0,PorchTypes
0,2
1,2
2,2
3,0
4,2
...,...
2925,1
2926,1
2927,2
2928,2


In [27]:
X

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YearSold,SaleType,SaleCondition
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,0.0,No_Pool,No_Fence,,0.0,5,2010,WD,Normal
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,120.0,0.0,No_Pool,Minimum_Privacy,,0.0,6,2010,WD,Normal
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,0.0,No_Pool,No_Fence,Gar2,12500.0,6,2010,WD,Normal
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,0.0,0.0,No_Pool,No_Fence,,0.0,4,2010,WD,Normal
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,0.0,0.0,No_Pool,Minimum_Privacy,,0.0,3,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,Split_or_Multilevel,Residential_Low_Density,37.0,7937.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,CulDSac,...,0.0,0.0,No_Pool,Good_Privacy,,0.0,3,2006,WD,Normal
2926,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,0.0,8885.0,Pave,No_Alley_Access,Slightly_Irregular,Low,AllPub,Inside,...,0.0,0.0,No_Pool,Minimum_Privacy,,0.0,6,2006,WD,Normal
2927,Split_Foyer,Residential_Low_Density,62.0,10441.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,0.0,0.0,No_Pool,Minimum_Privacy,Shed,700.0,7,2006,WD,Normal
2928,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,77.0,10010.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,0.0,0.0,No_Pool,No_Fence,,0.0,4,2006,WD,Normal


In [30]:
X_4 = pd.DataFrame()

X_4["MSClass"] = X.MSSubClass.str.split('_', n=1, expand=True)[0]

In [31]:
X_4

Unnamed: 0,MSClass
0,One
1,One
2,One
3,One
4,Two
...,...
2925,Split
2926,One
2927,Split
2928,One


In [32]:
X_5 = pd.DataFrame()

X_5['MedNhbdArea'] = X.groupby('Neighborhood')['GrLivArea'].transform('median') 

In [33]:
X_new = X.join([X_1, X_2, X_3, X_4, X_5])

In [34]:
score_dataset(X_new, y)



0.13847331622461567